/*
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/dsputil.c
 * DSP utils
 */
#include "simple_idct.h"
#include "mpegvideo.h"
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

void ff_bink_idct_c    (DCTELEM *block);
void ff_bink_idct_add_c(uint8_t *dest, int linesize, DCTELEM *block);
void ff_bink_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f, depending on the CPU's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
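/* ~0UL/255 is the all-bytes-0x01 pattern for the native word size
 * (0x01010101 on 32-bit targets, 0x0101010101010101 on 64-bit ones), so
 * pb_7f puts 0x7f in every byte and pb_80 puts 0x80 in every byte. */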
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256;
 * for a>16909558 the result is an overestimate by less than 1 part in 1<<24 */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
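/* Worked example of the multiply-shift division:
 * ff_inverse[3] == 1431655766 == ceil(2^32 / 3), so
 * (uint32_t)((1000ULL * ff_inverse[3]) >> 32) == 333 == 1000/3. */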
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j > end) end=j;
        st->raster_end[i]= end;
    }
}
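/* st->raster_end[i] is the running maximum of the permuted indices seen up
 * to scan position i, i.e. the raster-order end of the first i+1 scanned
 * coefficients. */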
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
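/* ff_squareTbl is used through sq = ff_squareTbl + 256: the table is filled
 * so that sq[d] == d*d for d in [-256, 255], letting the sse*_c functions
 * square a signed byte difference without taking an absolute value first. */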
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
#if CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={ /* only the first row of each sub-table survives in this excerpt; the rest default to 0 */
      {
        { // 9/7 8x8 dec=3
            {268, 239, 239, 213},
        },{ // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
        }
      },{
        { // 5/3 8x8 dec=3
            {275, 245, 245, 218},
        },{ // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    return s>>9;
}
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif
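/* Naming: w{53,97}_{8,16,32}_c picks the 5/3 (type 1) or 9/7 (type 0)
 * wavelet and the block width handed to w_c; only the 32-wide variants are
 * exported (non-static). */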
/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}
/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                         int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
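/* Typical use (a sketch, not code from this file): when a motion vector
 * reaches outside the picture, a caller first copies the needed block into
 * a scratch buffer with replicated edges and then runs the normal MC code:
 *     ff_emulated_edge_mc(scratch, src_ptr, linesize, 17, 17,
 *                         src_x, src_y, h_edge_pos, v_edge_pos);
 *     src_ptr = scratch;
 * (the name `scratch` and the 17x17 block size are illustrative only). */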
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];
        pixels += line_size;
        block += 8;
    }
}
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels += line_size;
        block += 8;
    }
}
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels += line_size;
        block += 8;
    }
}
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}
static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = block[0];
        pixels[1] = block[1];
        pixels[2] = block[2];
        pixels[3] = block[3];
        pixels[4] = block[4];
        pixels[5] = block[5];
        pixels[6] = block[6];
        pixels[7] = block[7];
        pixels += line_size;
        block += 8;
    }
}
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;

    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;

    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}
static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;
    uint16_t *dst1 = (uint16_t *) dst;
    uint16_t *dst2 = (uint16_t *)(dst + linesize);

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            dst1[i] = dst2[i] = src[i] * 0x0101;
        }
        src  += 8;
        dst1 += linesize;
        dst2 += linesize;
    }
}
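/* scale_block_c doubles the block in both directions: src[i]*0x0101 stores
 * each source byte twice through a single uint16_t write, and dst1/dst2
 * duplicate the row, turning an 8x8 block into a 16x16 one. */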
#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
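/* op_avg above is the classic SWAR byte-average: per byte,
 * (a|b) - (((a^b)&0xFE...FE)>>1) == (a+b+1)>>1, while
 * (a&b) + (((a^b)&0xFE...FE)>>1) == (a+b)>>1; the 0xFE mask keeps the
 * shifted carry bits from leaking into neighbouring byte lanes. */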
#else // 64 bit variant

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
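/* avg2/avg4 are rounded bilinear averages (round half up): e.g.
 * avg2(1,2) == 2 and avg4(0,1,1,1) == 1. */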
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
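/* The bilinear weights always satisfy A+B+C+D == 16*16 == 256, so adding
 * `rounder` and shifting right by 8 yields a correctly normalized sample. */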
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
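/* The constants are fixed-point third-pel weights: 683 ~= 2048/3 with >>11
 * normalizing a weight sum of 3, and 2731 ~= 32768/12 with >>15 normalizing
 * a weight sum of 12 (e.g. 4+3+3+2 in the mc11 case). */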
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
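/* The chroma weights satisfy A+B+C+D == 8*8 == 64, so op_put's (+32)>>6 is
 * a rounded division by 64, and op_avg further averages the result with the
 * existing destination pixel, rounding up. */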
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++){
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++){
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}
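/* Note: the bias here is 32 - 4 = 28 rather than the usual 32, so the >>6
 * rounds slightly downward; this is the "no rounding" behaviour the
 * _no_rnd_ name refers to (VC-1 signals such a rounding-control mode). */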
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
1946 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1947 uint8_t full[16*9];\
1950 uint8_t halfHV[64];\
1951 copy_block9(full, src, 16, stride, 9);\
1952 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1953 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1954 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1955 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1957 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1958 uint8_t full[16*9];\
1960 uint8_t halfHV[64];\
1961 copy_block9(full, src, 16, stride, 9);\
1962 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1963 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1964 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1965 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1967 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1968 uint8_t full[16*9];\
1971 uint8_t halfHV[64];\
1972 copy_block9(full, src, 16, stride, 9);\
1973 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1974 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1975 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1976 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1978 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1979 uint8_t full[16*9];\
1981 uint8_t halfHV[64];\
1982 copy_block9(full, src, 16, stride, 9);\
1983 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1984 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1985 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1986 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1988 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1989 uint8_t full[16*9];\
1992 uint8_t halfHV[64];\
1993 copy_block9(full, src, 16, stride, 9);\
1994 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1995 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1996 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1997 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1999 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2000 uint8_t full[16*9];\
2002 uint8_t halfHV[64];\
2003 copy_block9(full, src, 16, stride, 9);\
2004 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2005 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
2006 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2007 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2009 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2010 uint8_t full[16*9];\
2013 uint8_t halfHV[64];\
2014 copy_block9(full, src, 16, stride, 9);\
2015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
2016 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2018 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
2020 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2021 uint8_t full[16*9];\
2023 uint8_t halfHV[64];\
2024 copy_block9(full, src, 16, stride, 9);\
2025 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2026 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2027 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2028 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2030 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2032 uint8_t halfHV[64];\
2033 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2034 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2035 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
2037 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2039 uint8_t halfHV[64];\
2040 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2041 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2042 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2044 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2045 uint8_t full[16*9];\
2048 uint8_t halfHV[64];\
2049 copy_block9(full, src, 16, stride, 9);\
2050 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2051 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
2052 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2053 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2055 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2056 uint8_t full[16*9];\
2058 copy_block9(full, src, 16, stride, 9);\
2059 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2060 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
2061 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2063 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2064 uint8_t full[16*9];\
2067 uint8_t halfHV[64];\
2068 copy_block9(full, src, 16, stride, 9);\
2069 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2070 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2071 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2072 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2074 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2075 uint8_t full[16*9];\
2077 copy_block9(full, src, 16, stride, 9);\
2078 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2079 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2080 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2082 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2084 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2085 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2087 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2088 OPNAME ## pixels16_c(dst, src, stride, 16);\
2091 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2093 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2094 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2097 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2098 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2101 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2103 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2104 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2107 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2108 uint8_t full[24*17];\
2110 copy_block17(full, src, 24, stride, 17);\
2111 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2112 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2115 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2116 uint8_t full[24*17];\
2117 copy_block17(full, src, 24, stride, 17);\
2118 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2121 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2122 uint8_t full[24*17];\
2124 copy_block17(full, src, 24, stride, 17);\
2125 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2126 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2128 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2129 uint8_t full[24*17];\
2130 uint8_t halfH[272];\
2131 uint8_t halfV[256];\
2132 uint8_t halfHV[256];\
2133 copy_block17(full, src, 24, stride, 17);\
2134 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2135 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2136 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2137 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2139 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2140 uint8_t full[24*17];\
2141 uint8_t halfH[272];\
2142 uint8_t halfHV[256];\
2143 copy_block17(full, src, 24, stride, 17);\
2144 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2145 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2146 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2147 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2149 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2150 uint8_t full[24*17];\
2151 uint8_t halfH[272];\
2152 uint8_t halfV[256];\
2153 uint8_t halfHV[256];\
2154 copy_block17(full, src, 24, stride, 17);\
2155 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2156 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2157 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2158 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2160 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2161 uint8_t full[24*17];\
2162 uint8_t halfH[272];\
2163 uint8_t halfHV[256];\
2164 copy_block17(full, src, 24, stride, 17);\
2165 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2166 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2167 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2168 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2170 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2171 uint8_t full[24*17];\
2172 uint8_t halfH[272];\
2173 uint8_t halfV[256];\
2174 uint8_t halfHV[256];\
2175 copy_block17(full, src, 24, stride, 17);\
2176 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2177 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2178 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2179 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2181 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2182 uint8_t full[24*17];\
2183 uint8_t halfH[272];\
2184 uint8_t halfHV[256];\
2185 copy_block17(full, src, 24, stride, 17);\
2186 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2187 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2188 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2189 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2191 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2192 uint8_t full[24*17];\
2193 uint8_t halfH[272];\
2194 uint8_t halfV[256];\
2195 uint8_t halfHV[256];\
2196 copy_block17(full, src, 24, stride, 17);\
2197 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2198 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2199 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2200 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2202 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2203 uint8_t full[24*17];\
2204 uint8_t halfH[272];\
2205 uint8_t halfHV[256];\
2206 copy_block17(full, src, 24, stride, 17);\
2207 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2208 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2209 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2210 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2212 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2213 uint8_t halfH[272];\
2214 uint8_t halfHV[256];\
2215 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2216 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2217 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2219 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2220 uint8_t halfH[272];\
2221 uint8_t halfHV[256];\
2222 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2223 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2224 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2226 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2227 uint8_t full[24*17];\
2228 uint8_t halfH[272];\
2229 uint8_t halfV[256];\
2230 uint8_t halfHV[256];\
2231 copy_block17(full, src, 24, stride, 17);\
2232 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2233 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2234 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2235 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2237 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2238 uint8_t full[24*17];\
2239 uint8_t halfH[272];\
2240 copy_block17(full, src, 24, stride, 17);\
2241 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2242 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2243 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2245 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2246 uint8_t full[24*17];\
2247 uint8_t halfH[272];\
2248 uint8_t halfV[256];\
2249 uint8_t halfHV[256];\
2250 copy_block17(full, src, 24, stride, 17);\
2251 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2252 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2253 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2254 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2256 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2257 uint8_t full[24*17];\
2258 uint8_t halfH[272];\
2259 copy_block17(full, src, 24, stride, 17);\
2260 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2261 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2262 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2264 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2265 uint8_t halfH[272];\
2266 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2267 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2270 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2271 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2272 #define op_put(a, b) a = cm[((b) + 16)>>5]
2273 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
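/* The OP macros normalize the lowpass sums: the qpel kernels above have a
 * gain of 32 (20+20-6-6+3+3-1-1), so (b + 16) >> 5 is a rounded division
 * by 32, and the *_no_rnd forms bias by only 15 to round down, matching
 * MPEG-4's no-rounding prediction mode. For example, a flat area of value
 * 100 sums to b = 200*(20-6+3-1) = 3200, and (3200+16)>>5 == 100, so flat
 * regions pass through unchanged. op_avg additionally averages the result
 * with the existing dst sample. */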
2275 QPEL_MC(0, put_       , _       , op_put)
2276 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2277 QPEL_MC(0, avg_       , _       , op_avg)
2278 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2280 #undef op_avg_no_rnd
2282 #undef op_put_no_rnd
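/* Callers reach the generated mcXY variants through 16-entry function
 * tables indexed by the fractional MV bits; a minimal sketch of that
 * dispatch, illustrative only (not built), assuming the dy-major layout
 * used by the mpegvideo MC code: */
#if 0
typedef void (*qpel_mc_func)(uint8_t *dst, uint8_t *src, int stride);
static qpel_mc_func pick_qpel(qpel_mc_func tab[16], int mx, int my)
{
    return tab[((my & 3) << 2) | (mx & 3)];  /* tab[0]=mc00 ... tab[15]=mc33 */
}
#endif

/* H264_LOWPASS below generates the H.264 six-tap (1,-5,20,20,-5,1)
 * half-sample interpolator for widths 2/4/8/16. The _h and _v variants
 * filter in one direction and clip through cm[]; the _hv variants keep the
 * horizontal pass unclipped in a 16-bit tmp[] buffer so the vertical pass
 * works on full-precision intermediates, normalized by OP2. */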
2285 #define H264_LOWPASS(OPNAME, OP, OP2) \
2286 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2288 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2292 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2293 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2299 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2301 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2305 const int srcB= src[-2*srcStride];\
2306 const int srcA= src[-1*srcStride];\
2307 const int src0= src[0 *srcStride];\
2308 const int src1= src[1 *srcStride];\
2309 const int src2= src[2 *srcStride];\
2310 const int src3= src[3 *srcStride];\
2311 const int src4= src[4 *srcStride];\
2312 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2313 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2319 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2322 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2324 src -= 2*srcStride;\
2325 for(i=0; i<h+5; i++)\
2327 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2328 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2332 tmp -= tmpStride*(h+5-2);\
2335 const int tmpB= tmp[-2*tmpStride];\
2336 const int tmpA= tmp[-1*tmpStride];\
2337 const int tmp0= tmp[0 *tmpStride];\
2338 const int tmp1= tmp[1 *tmpStride];\
2339 const int tmp2= tmp[2 *tmpStride];\
2340 const int tmp3= tmp[3 *tmpStride];\
2341 const int tmp4= tmp[4 *tmpStride];\
2342 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2343 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2348 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2350 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2354 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2355 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2356 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2357 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2363 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2365 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2369 const int srcB= src[-2*srcStride];\
2370 const int srcA= src[-1*srcStride];\
2371 const int src0= src[0 *srcStride];\
2372 const int src1= src[1 *srcStride];\
2373 const int src2= src[2 *srcStride];\
2374 const int src3= src[3 *srcStride];\
2375 const int src4= src[4 *srcStride];\
2376 const int src5= src[5 *srcStride];\
2377 const int src6= src[6 *srcStride];\
2378 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2379 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2380 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2381 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2387 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2390 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2392 src -= 2*srcStride;\
2393 for(i=0; i<h+5; i++)\
2395 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2396 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2397 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2398 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2402 tmp -= tmpStride*(h+5-2);\
2405 const int tmpB= tmp[-2*tmpStride];\
2406 const int tmpA= tmp[-1*tmpStride];\
2407 const int tmp0= tmp[0 *tmpStride];\
2408 const int tmp1= tmp[1 *tmpStride];\
2409 const int tmp2= tmp[2 *tmpStride];\
2410 const int tmp3= tmp[3 *tmpStride];\
2411 const int tmp4= tmp[4 *tmpStride];\
2412 const int tmp5= tmp[5 *tmpStride];\
2413 const int tmp6= tmp[6 *tmpStride];\
2414 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2415 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2416 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2417 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2423 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2425 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2429 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2430 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2431 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2432 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2433 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2434 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2435 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2436 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2442 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2444 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2448 const int srcB= src[-2*srcStride];\
2449 const int srcA= src[-1*srcStride];\
2450 const int src0= src[0 *srcStride];\
2451 const int src1= src[1 *srcStride];\
2452 const int src2= src[2 *srcStride];\
2453 const int src3= src[3 *srcStride];\
2454 const int src4= src[4 *srcStride];\
2455 const int src5= src[5 *srcStride];\
2456 const int src6= src[6 *srcStride];\
2457 const int src7= src[7 *srcStride];\
2458 const int src8= src[8 *srcStride];\
2459 const int src9= src[9 *srcStride];\
2460 const int src10=src[10*srcStride];\
2461 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2462 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2463 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2464 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2465 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2466 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2467 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2468 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2474 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2477 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2479 src -= 2*srcStride;\
2480 for(i=0; i<h+5; i++)\
2482 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2483 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2484 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2485 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2486 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2487 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2488 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2489 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2493 tmp -= tmpStride*(h+5-2);\
2496 const int tmpB= tmp[-2*tmpStride];\
2497 const int tmpA= tmp[-1*tmpStride];\
2498 const int tmp0= tmp[0 *tmpStride];\
2499 const int tmp1= tmp[1 *tmpStride];\
2500 const int tmp2= tmp[2 *tmpStride];\
2501 const int tmp3= tmp[3 *tmpStride];\
2502 const int tmp4= tmp[4 *tmpStride];\
2503 const int tmp5= tmp[5 *tmpStride];\
2504 const int tmp6= tmp[6 *tmpStride];\
2505 const int tmp7= tmp[7 *tmpStride];\
2506 const int tmp8= tmp[8 *tmpStride];\
2507 const int tmp9= tmp[9 *tmpStride];\
2508 const int tmp10=tmp[10*tmpStride];\
2509 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2510 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2511 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2512 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2513 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2514 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2515 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2516 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2522 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2523 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2524 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2525 src += 8*srcStride;\
2526 dst += 8*dstStride;\
2527 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2528 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2531 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2532 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2533 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2534 src += 8*srcStride;\
2535 dst += 8*dstStride;\
2536 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2537 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2540 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2541 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2542 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2543 src += 8*srcStride;\
2544 dst += 8*dstStride;\
2545 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2546 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
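/* H264_MC assembles all 16 quarter-pel positions per block size from the
 * three primitives above: mc00 is a copy; mc10/mc30 (and mc01/mc03) average
 * the half-pel plane with the nearest full-pel samples; the diagonal cases
 * average an H and a V half-pel plane; and the center cases combine the hv
 * plane with an H or V plane, following the H.264 derivation of quarter
 * samples as averages of adjacent half samples. */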
2549 #define H264_MC(OPNAME, SIZE) \
2550 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2551 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2554 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2555 uint8_t half[SIZE*SIZE];\
2556 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2557 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2560 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2561 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2564 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2565 uint8_t half[SIZE*SIZE];\
2566 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2567 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2570 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2571 uint8_t full[SIZE*(SIZE+5)];\
2572 uint8_t * const full_mid= full + SIZE*2;\
2573 uint8_t half[SIZE*SIZE];\
2574 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2575 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2576 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2579 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2580 uint8_t full[SIZE*(SIZE+5)];\
2581 uint8_t * const full_mid= full + SIZE*2;\
2582 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2583 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2586 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2587 uint8_t full[SIZE*(SIZE+5)];\
2588 uint8_t * const full_mid= full + SIZE*2;\
2589 uint8_t half[SIZE*SIZE];\
2590 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2591 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2592 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2595 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2596 uint8_t full[SIZE*(SIZE+5)];\
2597 uint8_t * const full_mid= full + SIZE*2;\
2598 uint8_t halfH[SIZE*SIZE];\
2599 uint8_t halfV[SIZE*SIZE];\
2600 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2601 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2602 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2603 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2606 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2607 uint8_t full[SIZE*(SIZE+5)];\
2608 uint8_t * const full_mid= full + SIZE*2;\
2609 uint8_t halfH[SIZE*SIZE];\
2610 uint8_t halfV[SIZE*SIZE];\
2611 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2612 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2613 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2614 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2617 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2618 uint8_t full[SIZE*(SIZE+5)];\
2619 uint8_t * const full_mid= full + SIZE*2;\
2620 uint8_t halfH[SIZE*SIZE];\
2621 uint8_t halfV[SIZE*SIZE];\
2622 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2623 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2624 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2625 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2628 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2629 uint8_t full[SIZE*(SIZE+5)];\
2630 uint8_t * const full_mid= full + SIZE*2;\
2631 uint8_t halfH[SIZE*SIZE];\
2632 uint8_t halfV[SIZE*SIZE];\
2633 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2634 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2635 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2636 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2639 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2640 int16_t tmp[SIZE*(SIZE+5)];\
2641 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2644 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2645 int16_t tmp[SIZE*(SIZE+5)];\
2646 uint8_t halfH[SIZE*SIZE];\
2647 uint8_t halfHV[SIZE*SIZE];\
2648 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2649 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2650 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2653 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2654 int16_t tmp[SIZE*(SIZE+5)];\
2655 uint8_t halfH[SIZE*SIZE];\
2656 uint8_t halfHV[SIZE*SIZE];\
2657 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2658 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2659 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2662 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2663 uint8_t full[SIZE*(SIZE+5)];\
2664 uint8_t * const full_mid= full + SIZE*2;\
2665 int16_t tmp[SIZE*(SIZE+5)];\
2666 uint8_t halfV[SIZE*SIZE];\
2667 uint8_t halfHV[SIZE*SIZE];\
2668 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2669 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2670 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2671 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2674 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2675 uint8_t full[SIZE*(SIZE+5)];\
2676 uint8_t * const full_mid= full + SIZE*2;\
2677 int16_t tmp[SIZE*(SIZE+5)];\
2678 uint8_t halfV[SIZE*SIZE];\
2679 uint8_t halfHV[SIZE*SIZE];\
2680 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2681 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2682 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2683 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2686 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2687 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2688 #define op_put(a, b) a = cm[((b) + 16)>>5]
2689 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2690 #define op2_put(a, b) a = cm[((b) + 512)>>10]
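/* Two rounding domains: the single-pass ops divide by the six-tap gain of
 * 32 via (b + 16) >> 5, while the op2 variants terminate the hv path, where
 * two cascaded passes give a gain of 32*32 = 1024, hence (b + 512) >> 10.
 * E.g. a flat region of value 100 accumulates 100*32 = 3200 per tmp sample
 * and 3200*32 = 102400 after the second pass; (102400 + 512) >> 10 == 100. */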
2692 H264_LOWPASS(put_       , op_put, op2_put)
2693 H264_LOWPASS(avg_       , op_avg, op2_avg)
2708 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2709 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2710 #define H264_WEIGHT(W,H) \
2711 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2713 offset <<= log2_denom; \
2714 if(log2_denom) offset += 1<<(log2_denom-1); \
2715 for(y=0; y<H; y++, block += stride){ \
2718 if(W==2) continue; \
2721 if(W==4) continue; \
2726 if(W==8) continue; \
2737 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2739 offset = ((offset + 1) | 1) << log2_denom; \
2740 for(y=0; y<H; y++, dst += stride, src += stride){ \
2743 if(W==2) continue; \
2746 if(W==4) continue; \
2751 if(W==8) continue; \
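/* The H264_WEIGHT macros implement H.264 explicit weighted prediction.
 * Unidirectional: block[x] = clip(((block[x]*weight) >> log2_denom) + offset)
 * with round-to-nearest (the offset is pre-scaled and a half-LSB bias folded
 * in above). Bidirectional: dst[x] = clip((src[x]*weights + dst[x]*weightd
 * + rounding) >> (log2_denom+1)). Worked example: weight=2, log2_denom=1,
 * offset=3, pix=100 -> offset becomes (3<<1)+1 = 7 and (100*2 + 7)>>1 = 103. */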
2778 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2779 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2783 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2784 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2785 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2786 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2787 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2788 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2789 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2790 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
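/* The WMV2 mspel filter above is a 4-tap (-1,9,9,-1)/16 half-sample
 * interpolator: cm[(9*(a+b) - (c+d) + 8)>>4], with +8 for rounding. A flat
 * area of value v gives (16*v + 8)>>4 == v, so only detail around edges is
 * affected relative to simple averaging. */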
2796 #if CONFIG_CAVS_DECODER
2798 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2799 put_pixels8_c(dst, src, stride, 8);
2801 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2802 avg_pixels8_c(dst, src, stride, 8);
2804 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2805 put_pixels16_c(dst, src, stride, 16);
2807 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2808 avg_pixels16_c(dst, src, stride, 16);
2810 #endif /* CONFIG_CAVS_DECODER */
2812 #if CONFIG_VC1_DECODER
2814 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2815 put_pixels8_c(dst, src, stride, 8);
2817 void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2818 avg_pixels8_c(dst, src, stride, 8);
2820 #endif /* CONFIG_VC1_DECODER */
2823 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2825 #if CONFIG_RV40_DECODER
2826 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2827 put_pixels16_xy2_c(dst, src, stride, 16);
2829 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2830 avg_pixels16_xy2_c(dst, src, stride, 16);
2832 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2833 put_pixels8_xy2_c(dst, src, stride, 8);
2835 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2836 avg_pixels8_xy2_c(dst, src, stride, 8);
2838 #endif /* CONFIG_RV40_DECODER */
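/* These per-codec stubs cover only positions that reduce to primitives in
 * this file: the CAVS and VC-1 mc00 cases are plain block copies, and the
 * RV40 mc33 case is the (1/2,1/2) average (pixels_xy2); the remaining
 * fractional positions live in the codec-specific DSP code. */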
2840 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2841 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2845 const int src_1= src[ -srcStride];
2846 const int src0 = src[0 ];
2847 const int src1 = src[ srcStride];
2848 const int src2 = src[2*srcStride];
2849 const int src3 = src[3*srcStride];
2850 const int src4 = src[4*srcStride];
2851 const int src5 = src[5*srcStride];
2852 const int src6 = src[6*srcStride];
2853 const int src7 = src[7*srcStride];
2854 const int src8 = src[8*srcStride];
2855 const int src9 = src[9*srcStride];
2856 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2857 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2858 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2859 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2860 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2861 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2862 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2863 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2869 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2870 put_pixels8_c(dst, src, stride, 8);
2873 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2875 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2876 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2879 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2880 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2883 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2885 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2886 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2889 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2890 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2893 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2897 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2898 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2899 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2900 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2902 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2906 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2907 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2908 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2909 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2911 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2913 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2914 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
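/* In mc12/mc32/mc22 above, the horizontal pass runs over h=11 rows starting
 * at src-stride so the vertical 4-tap filter has one row of context above
 * and two below the 8x8 block; halfH+8 then skips the leading context row
 * (halfH is stored with stride 8). */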
2917 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2918 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2920 const int strength= ff_h263_loop_filter_strength[qscale];
2924 int p0= src[x-2*stride];
2925 int p1= src[x-1*stride];
2926 int p2= src[x+0*stride];
2927 int p3= src[x+1*stride];
2928 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2930 if     (d<-2*strength) d1= 0;
2931 else if(d<-  strength) d1=-2*strength - d;
2932 else if(d<   strength) d1= d;
2933 else if(d< 2*strength) d1= 2*strength - d;
2938 if(p1&256) p1= ~(p1>>31);
2939 if(p2&256) p2= ~(p2>>31);
2941 src[x-1*stride] = p1;
2942 src[x+0*stride] = p2;
2946 d2= av_clip((p0-p3)/4, -ad1, ad1);
2948 src[x-2*stride] = p0 - d2;
2949 src[x+  stride] = p3 + d2;
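/* H.263 (Annex J) deblocking: d measures the step across the edge between
 * p1 and p2 (samples ordered p0 p1 | p2 p3), and d1 applies a
 * strength-dependent ramp: full correction for small |d|, tapering to zero
 * by 2*strength so genuine image edges are preserved. p1/p2 move toward
 * each other by d1 (the (p&256) tests are a branchless clamp to 0..255),
 * and the outer pair gets the smaller correction d2, clipped by ad1. */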
2954 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2955 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2957 const int strength= ff_h263_loop_filter_strength[qscale];
2961 int p0= src[y*stride-2];
2962 int p1= src[y*stride-1];
2963 int p2= src[y*stride+0];
2964 int p3= src[y*stride+1];
2965 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2967 if     (d<-2*strength) d1= 0;
2968 else if(d<-  strength) d1=-2*strength - d;
2969 else if(d<   strength) d1= d;
2970 else if(d< 2*strength) d1= 2*strength - d;
2975 if(p1&256) p1= ~(p1>>31);
2976 if(p2&256) p2= ~(p2>>31);
2978 src[y*stride-1] = p1;
2979 src[y*stride+0] = p2;
2983 d2= av_clip((p0-p3)/4, -ad1, ad1);
2985 src[y*stride-2] = p0 - d2;
2986 src[y*stride+1] = p3 + d2;
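/* h263_h_loop_filter_c is the transposed counterpart of the filter above:
 * it walks rows (y) and offsets along x to handle vertical block edges,
 * with identical arithmetic. */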
2991 static void h261_loop_filter_c(uint8_t *src, int stride){
2996 temp[x      ] = 4*src[x           ];
2997 temp[x + 7*8] = 4*src[x + 7*stride];
3001 xy = y * stride + x;
3003 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
3008 src[  y*stride] = (temp[  y*8] + 2)>>2;
3009 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
3011 xy = y * stride + x;
3013 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
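/* The H.261 in-loop filter is a separable (1,2,1)/4 smoother over the 8x8
 * block: the vertical pass fills temp[] at 4x gain (first and last rows are
 * only scaled by 4), the horizontal pass adds another 4x, and (sum + 8)>>4
 * restores the range with rounding; the left/right border columns skip the
 * horizontal tap and are rescaled with (t + 2)>>2 instead. The interior is
 * thus filtered in both directions, the borders in at most one. */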
3018 static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3021 for( i = 0; i < 4; i++ ) {
3026 for( d = 0; d < 4; d++ ) {
3027 const int p0 = pix[-1*xstride];
3028 const int p1 = pix[-2*xstride];
3029 const int p2 = pix[-3*xstride];
3030 const int q0 = pix[0];
3031 const int q1 = pix[1*xstride];
3032 const int q2 = pix[2*xstride];
3034 if( FFABS( p0 - q0 ) < alpha &&
3035 FFABS( p1 - p0 ) < beta &&
3036 FFABS( q1 - q0 ) < beta ) {
3041 if( FFABS( p2 - p0 ) < beta ) {
3043 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
3046 if( FFABS( q2 - q0 ) < beta ) {
3048 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
3052 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3053 pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
3054 pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
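/* Normal-strength H.264 luma deblocking (bS < 4): a line p2 p1 p0 | q0 q1 q2
 * across the edge is filtered only if |p0-q0| < alpha and both inner
 * gradients are below beta. The core correction
 * i_delta = clip((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc) pulls p0 and
 * q0 toward each other; p1/q1 get an extra clipped update when their side
 * is locally smooth (|p2-p0| resp. |q2-q0| < beta). tc derives from tc0[i],
 * the per-group clipping threshold from the spec's QP-indexed table. */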
3060 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3062 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
3064 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3066 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
3069 static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3072 for( d = 0; d < 16; d++ ) {
3073 const int p2 = pix[-3*xstride];
3074 const int p1 = pix[-2*xstride];
3075 const int p0 = pix[-1*xstride];
3077 const int q0 = pix[ 0*xstride];
3078 const int q1 = pix[ 1*xstride];
3079 const int q2 = pix[ 2*xstride];
3081 if( FFABS( p0 - q0 ) < alpha &&
3082 FFABS( p1 - p0 ) < beta &&
3083 FFABS( q1 - q0 ) < beta ) {
3085 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
3086 if( FFABS( p2 - p0 ) < beta)
3088 const int p3 = pix[-4*xstride];
3090 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
3091 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
3092 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
3095 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3097 if( FFABS( q2 - q0 ) < beta)
3099 const int q3 = pix[3*xstride];
3101 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
3102 pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
3103 pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
3106 pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3110 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3111 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
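/* Intra (bS = 4) deblocking uses the strong filter: when the edge passes
 * the alpha/beta tests and |p0-q0| < (alpha>>2)+2, three samples per side
 * are replaced by the 4/5-tap averages above (p3/q3 supply the outermost
 * taps); otherwise only p0/q0 are smoothed with the weaker
 * (2*p1 + p0 + q1 + 2)>>2 form. */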
3117 static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3119 h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
3121 static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3123 h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
3126 static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3129 for( i = 0; i < 4; i++ ) {
3130 const int tc = tc0[i];
3135 for( d = 0; d < 2; d++ ) {
3136 const int p0 = pix[-1*xstride];
3137 const int p1 = pix[-2*xstride];
3138 const int q0 = pix[0];
3139 const int q1 = pix[1*xstride];
3141 if( FFABS( p0 - q0 ) < alpha &&
3142 FFABS( p1 - p0 ) < beta &&
3143 FFABS( q1 - q0 ) < beta ) {
3145 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3147 pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
3148 pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
3154 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3156 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
3158 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3160 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
3163 static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3166 for( d = 0; d < 8; d++ ) {
3167 const int p0 = pix[-1*xstride];
3168 const int p1 = pix[-2*xstride];
3169 const int q0 = pix[0];
3170 const int q1 = pix[1*xstride];
3172 if( FFABS( p0 - q0 ) < alpha &&
3173 FFABS( p1 - p0 ) < beta &&
3174 FFABS( q1 - q0 ) < beta ) {
3176 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
3177 pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
3182 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3184 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3186 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3188 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
3191 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3197 s += abs(pix1[0] - pix2[0]);
3198 s += abs(pix1[1] - pix2[1]);
3199 s += abs(pix1[2] - pix2[2]);
3200 s += abs(pix1[3] - pix2[3]);
3201 s += abs(pix1[4] - pix2[4]);
3202 s += abs(pix1[5] - pix2[5]);
3203 s += abs(pix1[6] - pix2[6]);
3204 s += abs(pix1[7] - pix2[7]);
3205 s += abs(pix1[8] - pix2[8]);
3206 s += abs(pix1[9] - pix2[9]);
3207 s += abs(pix1[10] - pix2[10]);
3208 s += abs(pix1[11] - pix2[11]);
3209 s += abs(pix1[12] - pix2[12]);
3210 s += abs(pix1[13] - pix2[13]);
3211 s += abs(pix1[14] - pix2[14]);
3212 s += abs(pix1[15] - pix2[15]);
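/* pix_abs{16,8}*_c compute the sum of absolute differences (SAD) used by
 * motion estimation; the _x2/_y2/_xy2 variants compare against the half-pel
 * interpolated candidate (avg2/avg4 match the rounding of the half-pel MC),
 * so fractional positions can be scored without writing the interpolation
 * out. A generic reference form of the same computation, illustrative only
 * (not built): */
#if 0
static int sad_ref(uint8_t *pix1, uint8_t *pix2, int line_size, int w, int h)
{
    int x, y, s = 0;
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++)
            s += abs(pix1[x] - pix2[x]);   /* per-pixel absolute error */
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
#endif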
3219 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3225 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3226 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3227 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3228 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3229 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3230 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3231 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3232 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3233 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
3234 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
3235 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
3236 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
3237 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
3238 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
3239 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
3240 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
3247 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3250 uint8_t *pix3 = pix2 + line_size;
3254 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3255 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3256 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3257 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3258 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3259 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3260 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3261 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3262 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
3263 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
3264 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
3265 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
3266 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
3267 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
3268 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
3269 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
3277 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3280 uint8_t *pix3 = pix2 + line_size;
3284 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3285 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3286 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3287 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3288 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3289 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3290 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3291 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3292 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3293 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3294 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3295 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3296 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3297 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3298 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3299 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
3307 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3313 s += abs(pix1[0] - pix2[0]);
3314 s += abs(pix1[1] - pix2[1]);
3315 s += abs(pix1[2] - pix2[2]);
3316 s += abs(pix1[3] - pix2[3]);
3317 s += abs(pix1[4] - pix2[4]);
3318 s += abs(pix1[5] - pix2[5]);
3319 s += abs(pix1[6] - pix2[6]);
3320 s += abs(pix1[7] - pix2[7]);
3327 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3333 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3334 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3335 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3336 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3337 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3338 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3339 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3340 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3347 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3350 uint8_t *pix3 = pix2 + line_size;
3354 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3355 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3356 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3357 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3358 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3359 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3360 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3361 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3369 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3372 uint8_t *pix3 = pix2 + line_size;
3376 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3377 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3378 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3379 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3380 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3381 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3382 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3383 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3391 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3392 MpegEncContext *c = v;
3398 for(x=0; x<16; x++){
3399 score1+= (s1[x] - s2[x])*(s1[x] - s2[x]);
3402 for(x=0; x<15; x++){
3403 score2+= FFABS( s1[x] - s1[x+stride]
3404 - s1[x+1] + s1[x+1+stride])
3405 -FFABS( s2[x] - s2[x+stride]
3406 - s2[x+1] + s2[x+1+stride]);
3413 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3414 else  return score1 + FFABS(score2)*8;
3417 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3418 MpegEncContext *c = v;
3425 score1+= (s1[x] - s2[x])*(s1[x] - s2[x]);
3429 score2+= FFABS( s1[x] - s1[x+stride]
3430 - s1[x+1] + s1[x+1+stride])
3431 -FFABS( s2[x] - s2[x+stride]
3432 - s2[x+1] + s2[x+1+stride]);
3439 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3440 else  return score1 + FFABS(score2)*8;
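/* nsse{16,8}_c implement the "noise shaping" SSE comparison: plain SSE
 * (score1) plus a weighted difference in local 2x2 gradient energy
 * (score2), so a candidate that loses or invents texture is penalized even
 * when its per-pixel error is low. The weight is AVCodecContext.nsse_weight,
 * defaulting to 8 when no context is given. */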
3443 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3447 for(i=0; i<8*8; i++){
3448 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));