Add FFprobe tool.
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
406792e7 3 * Copyright (c) 2000, 2001 Fabrice Bellard
8f2ab833 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
de6d9b64 5 *
7b94177e
DB
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7 *
b78e7197
DB
8 * This file is part of FFmpeg.
9 *
10 * FFmpeg is free software; you can redistribute it and/or
ff4ec49e
FB
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
b78e7197 13 * version 2.1 of the License, or (at your option) any later version.
de6d9b64 14 *
b78e7197 15 * FFmpeg is distributed in the hope that it will be useful,
de6d9b64 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
de6d9b64 19 *
ff4ec49e 20 * You should have received a copy of the GNU Lesser General Public
b78e7197 21 * License along with FFmpeg; if not, write to the Free Software
5509bffa 22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
de6d9b64 23 */
115329f1 24
983e3246 25/**
bad5537e 26 * @file libavcodec/dsputil.c
983e3246
MN
27 * DSP utils
28 */
115329f1 29
de6d9b64
FB
30#include "avcodec.h"
31#include "dsputil.h"
b0368839 32#include "simple_idct.h"
65e4c8c9 33#include "faandct.h"
6f08c541 34#include "faanidct.h"
199436b9 35#include "mathops.h"
059715a4 36#include "snow.h"
af818f7a
DB
37#include "mpegvideo.h"
38#include "config.h"
5596c60c 39
88730be6
MR
40/* snow.c */
41void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
42
2dac4acf
LM
43/* vorbis.c */
44void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
45
ac2e5564
LM
46/* ac3dec.c */
47void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
48
fde82ca7
JR
49/* lpc.c */
50void ff_lpc_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
6810b93a 51
4a9ca0a2
LM
52/* pngdec.c */
53void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
54
28245435
PR
55/* eaidct.c */
56void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
57
55fde95e 58uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
1d503957 59uint32_t ff_squareTbl[512] = {0, };
de6d9b64 60
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// (~0UL/255 is 0x0101...01, so the product replicates the byte into every lane)
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
469bd7b1 64
/* Zigzag scan order of an 8x8 block: entry i is the raster index
 * (row*8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
75
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
88
2f349de2 89/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
e62a55b9 90DECLARE_ALIGNED_16(uint16_t, inv_zigzag_direct16)[64];
2f349de2 91
/* Alternate (horizontal) scan order of an 8x8 block: entry i is the
 * raster index of the i-th coefficient in scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
102
/* Alternate (vertical) scan order of an 8x8 block: entry i is the
 * raster index of the i-th coefficient in scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
113
/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 * for a>16909558, is an overestimate by less than 1 part in 1<<24
 * (the product must be formed in 64 bits before shifting) */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  16777216
};
151
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
163
/* Per-row coefficient permutation; by its name, meant for the SSE2 IDCT. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
165
4c79b95c
AJ
166void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
167 int i;
168 int end;
169
170 st->scantable= src_scantable;
171
172 for(i=0; i<64; i++){
173 int j;
174 j = src_scantable[i];
175 st->permutated[i] = permutation[j];
b250f9c6 176#if ARCH_PPC
4c79b95c
AJ
177 st->inverse[j] = i;
178#endif
179 }
180
181 end=-1;
182 for(i=0; i<64; i++){
183 int j;
184 j = st->permutated[i];
185 if(j>end) end=j;
186 st->raster_end[i]= end;
187 }
188}
189
/**
 * Sums all samples of a 16x16 block.
 * @param pix       top left sample of the block
 * @param line_size number of bytes between two vertically adjacent samples
 * @return sum of the 256 samples
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += pix[j];
        pix += line_size;
    }
    return s;
}
211
0c1a9eda 212static int pix_norm1_c(uint8_t * pix, int line_size)
3aa102be
MN
213{
214 int s, i, j;
1d503957 215 uint32_t *sq = ff_squareTbl + 256;
3aa102be
MN
216
217 s = 0;
218 for (i = 0; i < 16; i++) {
bb270c08 219 for (j = 0; j < 16; j += 8) {
2a006cd3 220#if 0
bb270c08
DB
221 s += sq[pix[0]];
222 s += sq[pix[1]];
223 s += sq[pix[2]];
224 s += sq[pix[3]];
225 s += sq[pix[4]];
226 s += sq[pix[5]];
227 s += sq[pix[6]];
228 s += sq[pix[7]];
2a006cd3
FL
229#else
230#if LONG_MAX > 2147483647
bb270c08
DB
231 register uint64_t x=*(uint64_t*)pix;
232 s += sq[x&0xff];
233 s += sq[(x>>8)&0xff];
234 s += sq[(x>>16)&0xff];
235 s += sq[(x>>24)&0xff];
2a006cd3
FL
236 s += sq[(x>>32)&0xff];
237 s += sq[(x>>40)&0xff];
238 s += sq[(x>>48)&0xff];
239 s += sq[(x>>56)&0xff];
240#else
bb270c08
DB
241 register uint32_t x=*(uint32_t*)pix;
242 s += sq[x&0xff];
243 s += sq[(x>>8)&0xff];
244 s += sq[(x>>16)&0xff];
245 s += sq[(x>>24)&0xff];
2a006cd3
FL
246 x=*(uint32_t*)(pix+4);
247 s += sq[x&0xff];
248 s += sq[(x>>8)&0xff];
249 s += sq[(x>>16)&0xff];
250 s += sq[(x>>24)&0xff];
251#endif
252#endif
bb270c08
DB
253 pix += 8;
254 }
255 pix += line_size - 16;
3aa102be
MN
256 }
257 return s;
258}
259
/**
 * Byte-swaps w 32-bit words from src into dst using bswap_32().
 * @param dst destination words
 * @param src source words
 * @param w   number of 32-bit words; nothing is written when w <= 0
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i]= bswap_32(src[i]);
}
3aa102be 277
26efc54e
MN
278static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
279{
280 int s, i;
1d503957 281 uint32_t *sq = ff_squareTbl + 256;
26efc54e
MN
282
283 s = 0;
284 for (i = 0; i < h; i++) {
285 s += sq[pix1[0] - pix2[0]];
286 s += sq[pix1[1] - pix2[1]];
287 s += sq[pix1[2] - pix2[2]];
288 s += sq[pix1[3] - pix2[3]];
289 pix1 += line_size;
290 pix2 += line_size;
291 }
292 return s;
293}
294
bb198e19 295static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
1457ab52
MN
296{
297 int s, i;
1d503957 298 uint32_t *sq = ff_squareTbl + 256;
1457ab52
MN
299
300 s = 0;
bb198e19 301 for (i = 0; i < h; i++) {
1457ab52
MN
302 s += sq[pix1[0] - pix2[0]];
303 s += sq[pix1[1] - pix2[1]];
304 s += sq[pix1[2] - pix2[2]];
305 s += sq[pix1[3] - pix2[3]];
306 s += sq[pix1[4] - pix2[4]];
307 s += sq[pix1[5] - pix2[5]];
308 s += sq[pix1[6] - pix2[6]];
309 s += sq[pix1[7] - pix2[7]];
310 pix1 += line_size;
311 pix2 += line_size;
312 }
313 return s;
314}
315
bb198e19 316static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
9c76bd48 317{
6b026927 318 int s, i;
1d503957 319 uint32_t *sq = ff_squareTbl + 256;
9c76bd48
BF
320
321 s = 0;
bb198e19 322 for (i = 0; i < h; i++) {
6b026927
FH
323 s += sq[pix1[ 0] - pix2[ 0]];
324 s += sq[pix1[ 1] - pix2[ 1]];
325 s += sq[pix1[ 2] - pix2[ 2]];
326 s += sq[pix1[ 3] - pix2[ 3]];
327 s += sq[pix1[ 4] - pix2[ 4]];
328 s += sq[pix1[ 5] - pix2[ 5]];
329 s += sq[pix1[ 6] - pix2[ 6]];
330 s += sq[pix1[ 7] - pix2[ 7]];
331 s += sq[pix1[ 8] - pix2[ 8]];
332 s += sq[pix1[ 9] - pix2[ 9]];
333 s += sq[pix1[10] - pix2[10]];
334 s += sq[pix1[11] - pix2[11]];
335 s += sq[pix1[12] - pix2[12]];
336 s += sq[pix1[13] - pix2[13]];
337 s += sq[pix1[14] - pix2[14]];
338 s += sq[pix1[15] - pix2[15]];
2a006cd3 339
6b026927
FH
340 pix1 += line_size;
341 pix2 += line_size;
9c76bd48
BF
342 }
343 return s;
344}
345
26efc54e 346
#if CONFIG_SNOW_ENCODER //dwt is in snow.c
/**
 * Wavelet-domain difference between two blocks: the sample differences are
 * scaled by 16, transformed with ff_spatial_dwt() and the absolute
 * coefficients are summed with a per-level/per-orientation weight.
 * @param v         unused
 * @param line_size number of bytes between two rows, for both inputs
 * @param w         block width, a multiple of 4 and at most 32;
 *                  8 selects 3 decomposition levels, anything else 4
 * @param h         block height; asserted to be equal to w
 * @param type      wavelet selector passed to ff_spatial_dwt() and used to
 *                  pick the weights: 0 = 9/7, 1 = 5/3
 * @return weighted sum of absolute coefficients, shifted right by 9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    /* weights indexed as [type][dec_count-3][level][orientation] */
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* multiply instead of <<4: left-shifting a negative difference is undefined */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])*16;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])*16;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])*16;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])*16;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        /* orientation 0 is only visited at level 0 */
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int coef= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(coef);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

/* Fixed-width entry points: wNN_W_c runs w_c() with width W and the
 * 5/3 (type 1) or 9/7 (type 0) wavelet. */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif
871371a7 441
/**
 * Draws the edges of width 'w' of an image of size width, height by
 * replicating the outermost samples into the surrounding border.
 * The buffer must have w writable samples to the left and right of every
 * row and w writable rows above and below the image.
 * @param buf  top left sample of the image
 * @param wrap number of bytes between two vertically adjacent samples
 * @param w    border width in samples (and rows)
 */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}
470
/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* a block entirely outside the source is moved so that exactly one
     * row/column of it still overlaps the source */
    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    /* part of the block that lies inside the source, in block coordinates */
    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
       //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

       //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
541
0c1a9eda 542static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
de6d9b64 543{
de6d9b64
FB
544 int i;
545
546 /* read the pixels */
de6d9b64 547 for(i=0;i<8;i++) {
c13e1abd
FH
548 block[0] = pixels[0];
549 block[1] = pixels[1];
550 block[2] = pixels[2];
551 block[3] = pixels[3];
552 block[4] = pixels[4];
553 block[5] = pixels[5];
554 block[6] = pixels[6];
555 block[7] = pixels[7];
556 pixels += line_size;
557 block += 8;
de6d9b64
FB
558 }
559}
560
0c1a9eda 561static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
bb270c08 562 const uint8_t *s2, int stride){
9dbcbd92
MN
563 int i;
564
565 /* read the pixels */
9dbcbd92 566 for(i=0;i<8;i++) {
c13e1abd
FH
567 block[0] = s1[0] - s2[0];
568 block[1] = s1[1] - s2[1];
569 block[2] = s1[2] - s2[2];
570 block[3] = s1[3] - s2[3];
571 block[4] = s1[4] - s2[4];
572 block[5] = s1[5] - s2[5];
573 block[6] = s1[6] - s2[6];
574 block[7] = s1[7] - s2[7];
9dbcbd92
MN
575 s1 += stride;
576 s2 += stride;
c13e1abd 577 block += 8;
9dbcbd92
MN
578 }
579}
580
581
0c1a9eda 582static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 583 int line_size)
de6d9b64 584{
de6d9b64 585 int i;
55fde95e 586 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 587
de6d9b64 588 /* read the pixels */
de6d9b64 589 for(i=0;i<8;i++) {
c13e1abd
FH
590 pixels[0] = cm[block[0]];
591 pixels[1] = cm[block[1]];
592 pixels[2] = cm[block[2]];
593 pixels[3] = cm[block[3]];
594 pixels[4] = cm[block[4]];
595 pixels[5] = cm[block[5]];
596 pixels[6] = cm[block[6]];
597 pixels[7] = cm[block[7]];
598
599 pixels += line_size;
600 block += 8;
de6d9b64
FB
601 }
602}
603
178fcca8 604static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 605 int line_size)
178fcca8
MN
606{
607 int i;
55fde95e 608 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 609
178fcca8
MN
610 /* read the pixels */
611 for(i=0;i<4;i++) {
612 pixels[0] = cm[block[0]];
613 pixels[1] = cm[block[1]];
614 pixels[2] = cm[block[2]];
615 pixels[3] = cm[block[3]];
616
617 pixels += line_size;
618 block += 8;
619 }
620}
621
9ca358b9 622static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 623 int line_size)
9ca358b9
MN
624{
625 int i;
55fde95e 626 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 627
9ca358b9
MN
628 /* read the pixels */
629 for(i=0;i<2;i++) {
630 pixels[0] = cm[block[0]];
631 pixels[1] = cm[block[1]];
632
633 pixels += line_size;
634 block += 8;
635 }
636}
637
115329f1 638static void put_signed_pixels_clamped_c(const DCTELEM *block,
f9ed9d85
MM
639 uint8_t *restrict pixels,
640 int line_size)
641{
642 int i, j;
643
644 for (i = 0; i < 8; i++) {
645 for (j = 0; j < 8; j++) {
646 if (*block < -128)
647 *pixels = 0;
648 else if (*block > 127)
649 *pixels = 255;
650 else
651 *pixels = (uint8_t)(*block + 128);
652 block++;
653 pixels++;
654 }
655 pixels += (line_size - 8);
656 }
657}
658
0c1a9eda 659static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
c13e1abd 660 int line_size)
de6d9b64 661{
de6d9b64 662 int i;
55fde95e 663 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 664
de6d9b64 665 /* read the pixels */
de6d9b64 666 for(i=0;i<8;i++) {
c13e1abd
FH
667 pixels[0] = cm[pixels[0] + block[0]];
668 pixels[1] = cm[pixels[1] + block[1]];
669 pixels[2] = cm[pixels[2] + block[2]];
670 pixels[3] = cm[pixels[3] + block[3]];
671 pixels[4] = cm[pixels[4] + block[4]];
672 pixels[5] = cm[pixels[5] + block[5]];
673 pixels[6] = cm[pixels[6] + block[6]];
674 pixels[7] = cm[pixels[7] + block[7]];
675 pixels += line_size;
676 block += 8;
de6d9b64
FB
677 }
678}
178fcca8
MN
679
680static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
681 int line_size)
682{
683 int i;
55fde95e 684 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 685
178fcca8
MN
686 /* read the pixels */
687 for(i=0;i<4;i++) {
688 pixels[0] = cm[pixels[0] + block[0]];
689 pixels[1] = cm[pixels[1] + block[1]];
690 pixels[2] = cm[pixels[2] + block[2]];
691 pixels[3] = cm[pixels[3] + block[3]];
692 pixels += line_size;
693 block += 8;
694 }
695}
9ca358b9
MN
696
697static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
698 int line_size)
699{
700 int i;
55fde95e 701 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 702
9ca358b9
MN
703 /* read the pixels */
704 for(i=0;i<2;i++) {
705 pixels[0] = cm[pixels[0] + block[0]];
706 pixels[1] = cm[pixels[1] + block[1]];
707 pixels += line_size;
708 block += 8;
709 }
710}
36940eca
LM
711
712static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
713{
714 int i;
715 for(i=0;i<8;i++) {
716 pixels[0] += block[0];
717 pixels[1] += block[1];
718 pixels[2] += block[2];
719 pixels[3] += block[3];
720 pixels[4] += block[4];
721 pixels[5] += block[5];
722 pixels[6] += block[6];
723 pixels[7] += block[7];
724 pixels += line_size;
725 block += 8;
726 }
727}
728
729static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
730{
731 int i;
732 for(i=0;i<4;i++) {
733 pixels[0] += block[0];
734 pixels[1] += block[1];
735 pixels[2] += block[2];
736 pixels[3] += block[3];
737 pixels += line_size;
738 block += 4;
739 }
740}
741
1edbfe19
LM
742static int sum_abs_dctelem_c(DCTELEM *block)
743{
744 int sum=0, i;
745 for(i=0; i<64; i++)
746 sum+= FFABS(block[i]);
747 return sum;
748}
749
59fe111e
MN
750#if 0
751
752#define PIXOP2(OPNAME, OP) \
b3184779 753static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
754{\
755 int i;\
756 for(i=0; i<h; i++){\
905694d9 757 OP(*((uint64_t*)block), AV_RN64(pixels));\
59fe111e
MN
758 pixels+=line_size;\
759 block +=line_size;\
760 }\
761}\
762\
45553457 763static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
764{\
765 int i;\
766 for(i=0; i<h; i++){\
905694d9
RS
767 const uint64_t a= AV_RN64(pixels );\
768 const uint64_t b= AV_RN64(pixels+1);\
59fe111e
MN
769 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
770 pixels+=line_size;\
771 block +=line_size;\
772 }\
773}\
774\
45553457 775static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
776{\
777 int i;\
778 for(i=0; i<h; i++){\
905694d9
RS
779 const uint64_t a= AV_RN64(pixels );\
780 const uint64_t b= AV_RN64(pixels+1);\
59fe111e
MN
781 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
782 pixels+=line_size;\
783 block +=line_size;\
784 }\
785}\
786\
45553457 787static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
788{\
789 int i;\
790 for(i=0; i<h; i++){\
905694d9
RS
791 const uint64_t a= AV_RN64(pixels );\
792 const uint64_t b= AV_RN64(pixels+line_size);\
59fe111e
MN
793 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
794 pixels+=line_size;\
795 block +=line_size;\
796 }\
797}\
798\
45553457 799static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
800{\
801 int i;\
802 for(i=0; i<h; i++){\
905694d9
RS
803 const uint64_t a= AV_RN64(pixels );\
804 const uint64_t b= AV_RN64(pixels+line_size);\
59fe111e
MN
805 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
806 pixels+=line_size;\
807 block +=line_size;\
808 }\
809}\
810\
45553457 811static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
812{\
813 int i;\
905694d9
RS
814 const uint64_t a= AV_RN64(pixels );\
815 const uint64_t b= AV_RN64(pixels+1);\
59fe111e
MN
816 uint64_t l0= (a&0x0303030303030303ULL)\
817 + (b&0x0303030303030303ULL)\
818 + 0x0202020202020202ULL;\
819 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
820 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
821 uint64_t l1,h1;\
822\
823 pixels+=line_size;\
824 for(i=0; i<h; i+=2){\
905694d9
RS
825 uint64_t a= AV_RN64(pixels );\
826 uint64_t b= AV_RN64(pixels+1);\
59fe111e
MN
827 l1= (a&0x0303030303030303ULL)\
828 + (b&0x0303030303030303ULL);\
829 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
830 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
831 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
832 pixels+=line_size;\
833 block +=line_size;\
905694d9
RS
834 a= AV_RN64(pixels );\
835 b= AV_RN64(pixels+1);\
59fe111e
MN
836 l0= (a&0x0303030303030303ULL)\
837 + (b&0x0303030303030303ULL)\
838 + 0x0202020202020202ULL;\
839 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
840 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
841 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
842 pixels+=line_size;\
843 block +=line_size;\
844 }\
845}\
846\
45553457 847static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
848{\
849 int i;\
905694d9
RS
850 const uint64_t a= AV_RN64(pixels );\
851 const uint64_t b= AV_RN64(pixels+1);\
59fe111e
MN
852 uint64_t l0= (a&0x0303030303030303ULL)\
853 + (b&0x0303030303030303ULL)\
854 + 0x0101010101010101ULL;\
855 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
856 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
857 uint64_t l1,h1;\
858\
859 pixels+=line_size;\
860 for(i=0; i<h; i+=2){\
905694d9
RS
861 uint64_t a= AV_RN64(pixels );\
862 uint64_t b= AV_RN64(pixels+1);\
59fe111e
MN
863 l1= (a&0x0303030303030303ULL)\
864 + (b&0x0303030303030303ULL);\
865 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
866 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
867 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
868 pixels+=line_size;\
869 block +=line_size;\
905694d9
RS
870 a= AV_RN64(pixels );\
871 b= AV_RN64(pixels+1);\
59fe111e
MN
872 l0= (a&0x0303030303030303ULL)\
873 + (b&0x0303030303030303ULL)\
874 + 0x0101010101010101ULL;\
875 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
876 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
877 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
878 pixels+=line_size;\
879 block +=line_size;\
880 }\
881}\
882\
45553457
ZK
883CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
884CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
885CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
886CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
887CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
888CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
889CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
59fe111e
MN
890
891#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
892#else // 64 bit variant
893
894#define PIXOP2(OPNAME, OP) \
669ac79c
MN
895static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
896 int i;\
897 for(i=0; i<h; i++){\
905694d9 898 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
669ac79c
MN
899 pixels+=line_size;\
900 block +=line_size;\
901 }\
902}\
0da71265
MN
903static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
904 int i;\
905 for(i=0; i<h; i++){\
905694d9 906 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
0da71265
MN
907 pixels+=line_size;\
908 block +=line_size;\
909 }\
910}\
45553457 911static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
59fe111e
MN
912 int i;\
913 for(i=0; i<h; i++){\
905694d9
RS
914 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
915 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
59fe111e
MN
916 pixels+=line_size;\
917 block +=line_size;\
918 }\
919}\
45553457
ZK
920static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
921 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
b3184779 922}\
59fe111e 923\
b3184779
MN
924static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
925 int src_stride1, int src_stride2, int h){\
59fe111e
MN
926 int i;\
927 for(i=0; i<h; i++){\
b3184779 928 uint32_t a,b;\
905694d9
RS
929 a= AV_RN32(&src1[i*src_stride1 ]);\
930 b= AV_RN32(&src2[i*src_stride2 ]);\
d8085ea7 931 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
905694d9
RS
932 a= AV_RN32(&src1[i*src_stride1+4]);\
933 b= AV_RN32(&src2[i*src_stride2+4]);\
d8085ea7 934 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
59fe111e
MN
935 }\
936}\
937\
b3184779
MN
938static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
939 int src_stride1, int src_stride2, int h){\
59fe111e
MN
940 int i;\
941 for(i=0; i<h; i++){\
b3184779 942 uint32_t a,b;\
905694d9
RS
943 a= AV_RN32(&src1[i*src_stride1 ]);\
944 b= AV_RN32(&src2[i*src_stride2 ]);\
d8085ea7 945 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
905694d9
RS
946 a= AV_RN32(&src1[i*src_stride1+4]);\
947 b= AV_RN32(&src2[i*src_stride2+4]);\
d8085ea7 948 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
59fe111e
MN
949 }\
950}\
951\
0da71265
MN
952static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
953 int src_stride1, int src_stride2, int h){\
954 int i;\
955 for(i=0; i<h; i++){\
956 uint32_t a,b;\
905694d9
RS
957 a= AV_RN32(&src1[i*src_stride1 ]);\
958 b= AV_RN32(&src2[i*src_stride2 ]);\
d8085ea7 959 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
0da71265
MN
960 }\
961}\
962\
669ac79c
MN
963static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
964 int src_stride1, int src_stride2, int h){\
965 int i;\
966 for(i=0; i<h; i++){\
967 uint32_t a,b;\
905694d9
RS
968 a= AV_RN16(&src1[i*src_stride1 ]);\
969 b= AV_RN16(&src2[i*src_stride2 ]);\
669ac79c
MN
970 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
971 }\
972}\
973\
b3184779
MN
974static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
975 int src_stride1, int src_stride2, int h){\
976 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
977 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
978}\
979\
980static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
981 int src_stride1, int src_stride2, int h){\
982 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
983 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
984}\
985\
45553457 986static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
987 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
988}\
989\
45553457 990static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
991 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
992}\
993\
45553457 994static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
995 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
996}\
997\
45553457 998static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
999 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1000}\
1001\
1002static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1003 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
1004 int i;\
1005 for(i=0; i<h; i++){\
b3184779 1006 uint32_t a, b, c, d, l0, l1, h0, h1;\
905694d9
RS
1007 a= AV_RN32(&src1[i*src_stride1]);\
1008 b= AV_RN32(&src2[i*src_stride2]);\
1009 c= AV_RN32(&src3[i*src_stride3]);\
1010 d= AV_RN32(&src4[i*src_stride4]);\
b3184779
MN
1011 l0= (a&0x03030303UL)\
1012 + (b&0x03030303UL)\
1013 + 0x02020202UL;\
1014 h0= ((a&0xFCFCFCFCUL)>>2)\
1015 + ((b&0xFCFCFCFCUL)>>2);\
1016 l1= (c&0x03030303UL)\
1017 + (d&0x03030303UL);\
1018 h1= ((c&0xFCFCFCFCUL)>>2)\
1019 + ((d&0xFCFCFCFCUL)>>2);\
1020 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
905694d9
RS
1021 a= AV_RN32(&src1[i*src_stride1+4]);\
1022 b= AV_RN32(&src2[i*src_stride2+4]);\
1023 c= AV_RN32(&src3[i*src_stride3+4]);\
1024 d= AV_RN32(&src4[i*src_stride4+4]);\
b3184779
MN
1025 l0= (a&0x03030303UL)\
1026 + (b&0x03030303UL)\
1027 + 0x02020202UL;\
1028 h0= ((a&0xFCFCFCFCUL)>>2)\
1029 + ((b&0xFCFCFCFCUL)>>2);\
1030 l1= (c&0x03030303UL)\
1031 + (d&0x03030303UL);\
1032 h1= ((c&0xFCFCFCFCUL)>>2)\
1033 + ((d&0xFCFCFCFCUL)>>2);\
1034 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
1035 }\
1036}\
669ac79c
MN
1037\
1038static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1039 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1040}\
1041\
1042static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1043 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1044}\
1045\
1046static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1047 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1048}\
1049\
1050static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1051 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1052}\
1053\
b3184779
MN
1054static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1055 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
1056 int i;\
1057 for(i=0; i<h; i++){\
b3184779 1058 uint32_t a, b, c, d, l0, l1, h0, h1;\
905694d9
RS
1059 a= AV_RN32(&src1[i*src_stride1]);\
1060 b= AV_RN32(&src2[i*src_stride2]);\
1061 c= AV_RN32(&src3[i*src_stride3]);\
1062 d= AV_RN32(&src4[i*src_stride4]);\
b3184779
MN
1063 l0= (a&0x03030303UL)\
1064 + (b&0x03030303UL)\
1065 + 0x01010101UL;\
1066 h0= ((a&0xFCFCFCFCUL)>>2)\
1067 + ((b&0xFCFCFCFCUL)>>2);\
1068 l1= (c&0x03030303UL)\
1069 + (d&0x03030303UL);\
1070 h1= ((c&0xFCFCFCFCUL)>>2)\
1071 + ((d&0xFCFCFCFCUL)>>2);\
1072 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
905694d9
RS
1073 a= AV_RN32(&src1[i*src_stride1+4]);\
1074 b= AV_RN32(&src2[i*src_stride2+4]);\
1075 c= AV_RN32(&src3[i*src_stride3+4]);\
1076 d= AV_RN32(&src4[i*src_stride4+4]);\
b3184779
MN
1077 l0= (a&0x03030303UL)\
1078 + (b&0x03030303UL)\
1079 + 0x01010101UL;\
1080 h0= ((a&0xFCFCFCFCUL)>>2)\
1081 + ((b&0xFCFCFCFCUL)>>2);\
1082 l1= (c&0x03030303UL)\
1083 + (d&0x03030303UL);\
1084 h1= ((c&0xFCFCFCFCUL)>>2)\
1085 + ((d&0xFCFCFCFCUL)>>2);\
1086 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
1087 }\
1088}\
b3184779
MN
1089static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1090 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1091 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1092 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1093}\
1094static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1095 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1096 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1097 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1098}\
59fe111e 1099\
669ac79c
MN
1100static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1101{\
1102 int i, a0, b0, a1, b1;\
1103 a0= pixels[0];\
1104 b0= pixels[1] + 2;\
1105 a0 += b0;\
1106 b0 += pixels[2];\
1107\
1108 pixels+=line_size;\
1109 for(i=0; i<h; i+=2){\
1110 a1= pixels[0];\
1111 b1= pixels[1];\
1112 a1 += b1;\
1113 b1 += pixels[2];\
1114\
1115 block[0]= (a1+a0)>>2; /* FIXME non put */\
1116 block[1]= (b1+b0)>>2;\
1117\
1118 pixels+=line_size;\
1119 block +=line_size;\
1120\
1121 a0= pixels[0];\
1122 b0= pixels[1] + 2;\
1123 a0 += b0;\
1124 b0 += pixels[2];\
1125\
1126 block[0]= (a1+a0)>>2;\
1127 block[1]= (b1+b0)>>2;\
1128 pixels+=line_size;\
1129 block +=line_size;\
1130 }\
1131}\
1132\
1133static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1134{\
1135 int i;\
905694d9
RS
1136 const uint32_t a= AV_RN32(pixels );\
1137 const uint32_t b= AV_RN32(pixels+1);\
669ac79c
MN
1138 uint32_t l0= (a&0x03030303UL)\
1139 + (b&0x03030303UL)\
1140 + 0x02020202UL;\
1141 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1142 + ((b&0xFCFCFCFCUL)>>2);\
1143 uint32_t l1,h1;\
1144\
1145 pixels+=line_size;\
1146 for(i=0; i<h; i+=2){\
905694d9
RS
1147 uint32_t a= AV_RN32(pixels );\
1148 uint32_t b= AV_RN32(pixels+1);\
669ac79c
MN
1149 l1= (a&0x03030303UL)\
1150 + (b&0x03030303UL);\
1151 h1= ((a&0xFCFCFCFCUL)>>2)\
1152 + ((b&0xFCFCFCFCUL)>>2);\
1153 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1154 pixels+=line_size;\
1155 block +=line_size;\
905694d9
RS
1156 a= AV_RN32(pixels );\
1157 b= AV_RN32(pixels+1);\
669ac79c
MN
1158 l0= (a&0x03030303UL)\
1159 + (b&0x03030303UL)\
1160 + 0x02020202UL;\
1161 h0= ((a&0xFCFCFCFCUL)>>2)\
1162 + ((b&0xFCFCFCFCUL)>>2);\
1163 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1164 pixels+=line_size;\
1165 block +=line_size;\
1166 }\
1167}\
1168\
45553457 1169static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
1170{\
1171 int j;\
1172 for(j=0; j<2; j++){\
1173 int i;\
905694d9
RS
1174 const uint32_t a= AV_RN32(pixels );\
1175 const uint32_t b= AV_RN32(pixels+1);\
59fe111e
MN
1176 uint32_t l0= (a&0x03030303UL)\
1177 + (b&0x03030303UL)\
1178 + 0x02020202UL;\
1179 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1180 + ((b&0xFCFCFCFCUL)>>2);\
1181 uint32_t l1,h1;\
1182\
1183 pixels+=line_size;\
1184 for(i=0; i<h; i+=2){\
905694d9
RS
1185 uint32_t a= AV_RN32(pixels );\
1186 uint32_t b= AV_RN32(pixels+1);\
59fe111e
MN
1187 l1= (a&0x03030303UL)\
1188 + (b&0x03030303UL);\
1189 h1= ((a&0xFCFCFCFCUL)>>2)\
1190 + ((b&0xFCFCFCFCUL)>>2);\
1191 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1192 pixels+=line_size;\
1193 block +=line_size;\
905694d9
RS
1194 a= AV_RN32(pixels );\
1195 b= AV_RN32(pixels+1);\
59fe111e
MN
1196 l0= (a&0x03030303UL)\
1197 + (b&0x03030303UL)\
1198 + 0x02020202UL;\
1199 h0= ((a&0xFCFCFCFCUL)>>2)\
1200 + ((b&0xFCFCFCFCUL)>>2);\
1201 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1202 pixels+=line_size;\
1203 block +=line_size;\
1204 }\
1205 pixels+=4-line_size*(h+1);\
1206 block +=4-line_size*h;\
1207 }\
1208}\
1209\
45553457 1210static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
1211{\
1212 int j;\
1213 for(j=0; j<2; j++){\
1214 int i;\
905694d9
RS
1215 const uint32_t a= AV_RN32(pixels );\
1216 const uint32_t b= AV_RN32(pixels+1);\
59fe111e
MN
1217 uint32_t l0= (a&0x03030303UL)\
1218 + (b&0x03030303UL)\
1219 + 0x01010101UL;\
1220 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1221 + ((b&0xFCFCFCFCUL)>>2);\
1222 uint32_t l1,h1;\
1223\
1224 pixels+=line_size;\
1225 for(i=0; i<h; i+=2){\
905694d9
RS
1226 uint32_t a= AV_RN32(pixels );\
1227 uint32_t b= AV_RN32(pixels+1);\
59fe111e
MN
1228 l1= (a&0x03030303UL)\
1229 + (b&0x03030303UL);\
1230 h1= ((a&0xFCFCFCFCUL)>>2)\
1231 + ((b&0xFCFCFCFCUL)>>2);\
1232 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1233 pixels+=line_size;\
1234 block +=line_size;\
905694d9
RS
1235 a= AV_RN32(pixels );\
1236 b= AV_RN32(pixels+1);\
59fe111e
MN
1237 l0= (a&0x03030303UL)\
1238 + (b&0x03030303UL)\
1239 + 0x01010101UL;\
1240 h0= ((a&0xFCFCFCFCUL)>>2)\
1241 + ((b&0xFCFCFCFCUL)>>2);\
1242 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1243 pixels+=line_size;\
1244 block +=line_size;\
1245 }\
1246 pixels+=4-line_size*(h+1);\
1247 block +=4-line_size*h;\
1248 }\
1249}\
1250\
45553457
ZK
1251CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1252CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1253CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1254CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1255CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1256CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1257CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1258CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
b3184779 1259
d8085ea7 1260#define op_avg(a, b) a = rnd_avg32(a, b)
59fe111e 1261#endif
59fe111e
MN
1262#define op_put(a, b) a = b
1263
1264PIXOP2(avg, op_avg)
1265PIXOP2(put, op_put)
1266#undef op_avg
1267#undef op_put
1268
de6d9b64
FB
/*
 * Rounded integer averages of two and of four values.
 * Every argument is parenthesized so that argument expressions containing
 * operators of lower precedence than '+' (shifts, bitwise ops, ?:) expand
 * with the intended meaning. Each argument is evaluated exactly once.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1271
c0a0170c
MN
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the destination and
 * both sources are addressed with the same stride.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b,
                                     int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride, stride, stride,
                           h);
}
1275
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the destination and
 * both sources are addressed with the same stride.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b,
                                    int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride, stride, stride,
                          h);
}
073b013d 1279
/**
 * Bilinear blend of an 8-pixel-wide block with 1/16 fractional weights.
 *
 * Each output pixel combines the source pixel, its right neighbour, the pixel
 * one line below and that pixel's right neighbour. The four weights sum to 256,
 * and the result is (weighted sum + rounder) >> 8.
 *
 * @param dst     output rows, 8 bytes written per row
 * @param src     input rows; 9 bytes on each of h+1 lines are read
 * @param stride  byte distance between rows, used for both dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right neighbour, in 1/16 units
 * @param y16     vertical weight of the lower line, in 1/16 units
 * @param rounder value added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_top_left  = (16 - x16) * (16 - y16);
    const int w_top_right = (     x16) * (16 - y16);
    const int w_bot_left  = (16 - x16) * (     y16);
    const int w_bot_right = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w_top_left  * src[col]
                      + w_top_right * src[col + 1]
                      + w_bot_left  * src[stride + col]
                      + w_bot_right * src[stride + col + 1]
                      + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1302
/**
 * Resample an 8-pixel-wide block along a linearly stepped source position.
 *
 * The source position starts at (ox, oy) for the first output pixel, advances
 * by (dxx, dyx) per output column and by (dxy, dyy) per output row. A position
 * is first shifted right by 16; the low `shift` bits of that value are the
 * fractional part and the remaining bits the integer sample coordinate.
 *
 * Samples whose integer coordinate lies inside [0, width-1) x [0, height-1)
 * are bilinearly interpolated. When one coordinate is outside, it is clamped
 * to [0, width-1] resp. [0, height-1] and interpolation happens only along the
 * other axis. When both are outside, the clamped sample is copied unchanged.
 *
 * @param dst     output, 8 bytes written on each of h rows
 * @param src     source picture
 * @param stride  byte distance between rows, used for both dst and src
 * @param h       number of output rows
 * @param shift   number of fractional bits
 * @param r       rounding term added before the final >> (2*shift)
 * @param width   source width used for the bounds test and clamping
 * @param height  source height used for the bounds test and clamping
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int one   = 1 << shift;
    const int max_x = width  - 1;
    const int max_y = height - 1;
    int row;

    for (row = 0; row < h; row++) {
        uint8_t *out = dst + row * stride;
        int pos_x = ox;
        int pos_y = oy;
        int col;

        for (col = 0; col < 8; col++) { //XXX FIXME optimize
            int sx = pos_x >> 16;
            int sy = pos_y >> 16;
            const int fx = sx & (one - 1);
            const int fy = sy & (one - 1);
            int x_inside, y_inside, idx;

            sx >>= shift;
            sy >>= shift;

            /* the unsigned compare rejects negative coordinates as well */
            x_inside = (unsigned)sx < max_x;
            y_inside = (unsigned)sy < max_y;

            if (x_inside && y_inside) {
                idx = sx + sy * stride;
                out[col] = ( ( src[idx           ] * (one - fx)
                             + src[idx         +1] *        fx  ) * (one - fy)
                           + ( src[idx + stride  ] * (one - fx)
                             + src[idx + stride+1] *        fx  ) *        fy
                           + r) >> (shift * 2);
            } else if (x_inside) {
                /* row clamped: horizontal interpolation only */
                idx = sx + av_clip(sy, 0, max_y) * stride;
                out[col] = ( ( src[idx    ] * (one - fx)
                             + src[idx + 1] *        fx  ) * one
                           + r) >> (shift * 2);
            } else if (y_inside) {
                /* column clamped: vertical interpolation only */
                idx = av_clip(sx, 0, max_x) + sy * stride;
                out[col] = ( ( src[idx         ] * (one - fy)
                             + src[idx + stride] *        fy  ) * one
                           + r) >> (shift * 2);
            } else {
                idx = av_clip(sx, 0, max_x) + av_clip(sy, 0, max_y) * stride;
                out[col] = src[idx];
            }

            pos_x += dxx;
            pos_y += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
669ac79c
MN
1360
/**
 * Dispatch to the put_pixels{2,4,8,16}_c routine matching the block width.
 * Any other width leaves dst untouched.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1369
/**
 * dst = (2*src[j] + src[j+1] + 1) / 3 for a width x height block.
 * The division is done as a multiply by 683 and a shift by 11 (683/2048 ~ 1/3).
 * Reads one pixel to the right of the block.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1380
/**
 * dst = (src[j] + 2*src[j+1] + 1) / 3 for a width x height block.
 * The division is done as a multiply by 683 and a shift by 11 (683/2048 ~ 1/3).
 * Reads one pixel to the right of the block.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
115329f1 1391
669ac79c
MN
/**
 * dst = (2*src[j] + src[j+stride] + 1) / 3 for a width x height block.
 * The division is done as a multiply by 683 and a shift by 11 (683/2048 ~ 1/3).
 * Reads one line below the block.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
115329f1 1402
669ac79c
MN
/**
 * 2x2 weighted average with weights 4,3 / 3,2 (sum 12) and a bias of 6.
 * The division by 12 is a multiply by 2731 and a shift by 15
 * (2731/32768 ~ 1/12). Reads one pixel right of and one line below the block.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col]        + 3*src[col+1]
                          + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1413
/**
 * 2x2 weighted average with weights 3,2 / 4,3 (sum 12) and a bias of 6.
 * The division by 12 is a multiply by 2731 and a shift by 15
 * (2731/32768 ~ 1/12). Reads one pixel right of and one line below the block.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col]        + 2*src[col+1]
                          + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1424
/**
 * dst = (src[j] + 2*src[j+stride] + 1) / 3 for a width x height block.
 * The division is done as a multiply by 683 and a shift by 11 (683/2048 ~ 1/3).
 * Reads one line below the block.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1435
/**
 * 2x2 weighted average with weights 3,4 / 2,3 (sum 12) and a bias of 6.
 * The division by 12 is a multiply by 2731 and a shift by 15
 * (2731/32768 ~ 1/12). Reads one pixel right of and one line below the block.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col]        + 4*src[col+1]
                          + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1446
/**
 * 2x2 weighted average with weights 2,3 / 3,4 (sum 12) and a bias of 6.
 * The division by 12 is a multiply by 2731 and a shift by 15
 * (2731/32768 ~ 1/12). Reads one pixel right of and one line below the block.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col]        + 3*src[col+1]
                          + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
da3b9756
MM
1457
/**
 * Dispatch to the avg_pixels{2,4,8,16}_c routine matching the block width.
 * Any other width leaves dst untouched.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1466
/**
 * Rounded average of the existing dst pixel with
 * (2*src[j] + src[j+1] + 1) / 3, the division being 683 >> 11.
 * Reads one pixel to the right of the block.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1477
/**
 * Rounded average of the existing dst pixel with
 * (src[j] + 2*src[j+1] + 1) / 3, the division being 683 >> 11.
 * Reads one pixel to the right of the block.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
115329f1 1488
da3b9756
MM
/**
 * Rounded average of the existing dst pixel with
 * (2*src[j] + src[j+stride] + 1) / 3, the division being 683 >> 11.
 * Reads one line below the block.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
115329f1 1499
da3b9756
MM
/**
 * Rounded average of the existing dst pixel with a 2x2 weighted average
 * (weights 4,3 / 3,2, bias 6, divided by 12 via 2731 >> 15).
 * Reads one pixel right of and one line below the block.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum  = 4*src[col]        + 3*src[col+1]
                           + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            const int pred = (2731*sum) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1510
/**
 * Rounded average of the existing dst pixel with a 2x2 weighted average
 * (weights 3,2 / 4,3, bias 6, divided by 12 via 2731 >> 15).
 * Reads one pixel right of and one line below the block.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum  = 3*src[col]        + 2*src[col+1]
                           + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            const int pred = (2731*sum) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1521
/**
 * Rounded average of the existing dst pixel with
 * (src[j] + 2*src[j+stride] + 1) / 3, the division being 683 >> 11.
 * Reads one line below the block.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int pred = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1532
/**
 * Rounded average of the existing dst pixel with a 2x2 weighted average
 * (weights 3,4 / 2,3, bias 6, divided by 12 via 2731 >> 15).
 * Reads one pixel right of and one line below the block.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum  = 3*src[col]        + 4*src[col+1]
                           + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            const int pred = (2731*sum) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
1543
/**
 * Rounded average of the existing dst pixel with a 2x2 weighted average
 * (weights 2,3 / 3,4, bias 6, divided by 12 via 2731 >> 15).
 * Reads one pixel right of and one line below the block.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum  = 2*src[col]        + 3*src[col+1]
                           + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            const int pred = (2731*sum) >> 15;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
    }
}
669ac79c
MN
#if 0
/*
 * Disabled: TPEL_WIDTH(width) would generate fixed-width wrappers around the
 * put_tpel_pixels_mcXY_c() functions, passing `width` as a constant.
 * The wrapper bodies are plain calls; a stray `void` in front of each call
 * previously made them invalid, so the block could not be enabled as-is.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1575
0da71265
MN
/*
 * Chroma bilinear interpolation with 1/8 fractional weights.
 *
 * H264_CHROMA_MC_W emits one function for block width W. The four weights
 * A..D sum to 64; OP receives the unnormalized weighted sum and is
 * responsible for rounding and the >>6 (see op_put / op_avg below).
 * When D is zero, at least one of x and y is zero, so B or C is zero as well
 * and the filter collapses to two taps: the second tap is `stride` away if
 * C is non-zero, otherwise 1 away, with weight E = B + C.
 *
 * H264_CHROMA_MC(OPNAME, OP) instantiates the width 2, 4 and 8 variants named
 * OPNAME##h264_chroma_mc{2,4,8}_c.
 */
#define H264_CHROMA_MC_W(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<W; col++){\
                OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
            }\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(row=0; row<h; row++, dst+=stride, src+=stride){\
            for(col=0; col<W; col++){\
                OP(dst[col], (A*src[col] + E*src[step+col]));\
            }\
        }\
    }\
}

#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_W(OPNAME, OP, 2)\
H264_CHROMA_MC_W(OPNAME, OP, 4)\
H264_CHROMA_MC_W(OPNAME, OP, 8)

/* b is the weighted sum scaled by 64: round, normalize, then store or average */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1684
/**
 * 8-pixel-wide chroma bilinear interpolation with 1/8 fractional weights and
 * a reduced rounding bias: the weighted sum (weights sum to 64) gets 32 - 4
 * added before the >>6, instead of 32.
 *
 * @param dst    output, 8 bytes written on each of h rows
 * @param src    input; 9 bytes on each of h+1 lines are read
 * @param stride byte distance between rows, used for both dst and src
 * @param x,y    fractional offsets, each asserted to be in 0..7
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (A*src[col]        + B*src[col+1]
                      + C*src[stride+col] + D*src[stride+col+1] + 32 - 4) >> 6;
        }
        dst += stride;
        src += stride;
    }
}
1708
8013da73
DC
/**
 * Averaging variant of the 8-pixel-wide no-rounding chroma interpolation:
 * the prediction is (weighted sum + 32 - 4) >> 6, and dst receives the
 * rounded-up mean of its previous value and that prediction.
 *
 * @param dst    read and written, 8 bytes on each of h rows
 * @param src    input; 9 bytes on each of h+1 lines are read
 * @param stride byte distance between rows, used for both dst and src
 * @param x,y    fractional offsets, each asserted to be in 0..7
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            const int pred = (A*src[col]        + B*src[col+1]
                            + C*src[stride+col] + D*src[stride+col+1] + 32 - 4) >> 6;
            dst[col] = (dst[col] + pred + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
1732
/**
 * Generate the mpeg4_qpel lowpass filters and the qpel8 / qpel16 mcXY
 * functions for one store operation.
 *
 * @param r      not referenced anywhere in the macro body
 * @param OPNAME prefix pasted onto every generated function name and onto the
 *               pixelsN_c / pixelsN_l2 / pixelsN_l4 helpers used for the final store
 * @param RND    infix pasted between "put" and the helper name for all
 *               intermediate (scratch buffer) passes
 * @param OP     statement macro OP(dest, sum) that stores one filtered sample;
 *               the op_* macros used with it read the local table pointer cm
 *
 * NOTE(review): the two digits in mcXY look like the horizontal / vertical
 * quarter-sample phase; confirm against the function tables that use them.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
/* Horizontal 8-tap filter over h rows of 8 outputs. Only src[0..8] is read; */\
/* taps that would fall outside that range fold back onto samples inside it. */\
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int y;\
    for(y=0; y<h; y++, dst+=dstStride, src+=srcStride){\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
    }\
}\
\
/* Vertical counterpart: 8 columns, each producing 8 outputs from 9 input rows. */\
/* The whole column is loaded before the first store, as in the unrolled form. */\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int s[9];\
    int x, k;\
    for(x=0; x<8; x++, dst++, src++){\
        for(k=0; k<9; k++)\
            s[k]= src[k*srcStride];\
        OP(dst[0*dstStride], (s[0]+s[1])*20 - (s[0]+s[2])*6 + (s[1]+s[3])*3 - (s[2]+s[4]));\
        OP(dst[1*dstStride], (s[1]+s[2])*20 - (s[0]+s[3])*6 + (s[0]+s[4])*3 - (s[1]+s[5]));\
        OP(dst[2*dstStride], (s[2]+s[3])*20 - (s[1]+s[4])*6 + (s[0]+s[5])*3 - (s[0]+s[6]));\
        OP(dst[3*dstStride], (s[3]+s[4])*20 - (s[2]+s[5])*6 + (s[1]+s[6])*3 - (s[0]+s[7]));\
        OP(dst[4*dstStride], (s[4]+s[5])*20 - (s[3]+s[6])*6 + (s[2]+s[7])*3 - (s[1]+s[8]));\
        OP(dst[5*dstStride], (s[5]+s[6])*20 - (s[4]+s[7])*6 + (s[3]+s[8])*3 - (s[2]+s[8]));\
        OP(dst[6*dstStride], (s[6]+s[7])*20 - (s[5]+s[8])*6 + (s[4]+s[8])*3 - (s[3]+s[7]));\
        OP(dst[7*dstStride], (s[7]+s[8])*20 - (s[6]+s[8])*6 + (s[5]+s[7])*3 - (s[4]+s[6]));\
    }\
}\
\
/* Horizontal filter for 16-wide rows; reads src[0..16] only. */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int y;\
    for(y=0; y<h; y++, dst+=dstStride, src+=srcStride){\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
    }\
}\
\
/* Vertical filter for 16 columns, 16 outputs each from 17 input rows. */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int s[17];\
    int x, k;\
    for(x=0; x<16; x++, dst++, src++){\
        for(k=0; k<17; k++)\
            s[k]= src[k*srcStride];\
        OP(dst[ 0*dstStride], (s[ 0]+s[ 1])*20 - (s[ 0]+s[ 2])*6 + (s[ 1]+s[ 3])*3 - (s[ 2]+s[ 4]));\
        OP(dst[ 1*dstStride], (s[ 1]+s[ 2])*20 - (s[ 0]+s[ 3])*6 + (s[ 0]+s[ 4])*3 - (s[ 1]+s[ 5]));\
        OP(dst[ 2*dstStride], (s[ 2]+s[ 3])*20 - (s[ 1]+s[ 4])*6 + (s[ 0]+s[ 5])*3 - (s[ 0]+s[ 6]));\
        OP(dst[ 3*dstStride], (s[ 3]+s[ 4])*20 - (s[ 2]+s[ 5])*6 + (s[ 1]+s[ 6])*3 - (s[ 0]+s[ 7]));\
        OP(dst[ 4*dstStride], (s[ 4]+s[ 5])*20 - (s[ 3]+s[ 6])*6 + (s[ 2]+s[ 7])*3 - (s[ 1]+s[ 8]));\
        OP(dst[ 5*dstStride], (s[ 5]+s[ 6])*20 - (s[ 4]+s[ 7])*6 + (s[ 3]+s[ 8])*3 - (s[ 2]+s[ 9]));\
        OP(dst[ 6*dstStride], (s[ 6]+s[ 7])*20 - (s[ 5]+s[ 8])*6 + (s[ 4]+s[ 9])*3 - (s[ 3]+s[10]));\
        OP(dst[ 7*dstStride], (s[ 7]+s[ 8])*20 - (s[ 6]+s[ 9])*6 + (s[ 5]+s[10])*3 - (s[ 4]+s[11]));\
        OP(dst[ 8*dstStride], (s[ 8]+s[ 9])*20 - (s[ 7]+s[10])*6 + (s[ 6]+s[11])*3 - (s[ 5]+s[12]));\
        OP(dst[ 9*dstStride], (s[ 9]+s[10])*20 - (s[ 8]+s[11])*6 + (s[ 7]+s[12])*3 - (s[ 6]+s[13]));\
        OP(dst[10*dstStride], (s[10]+s[11])*20 - (s[ 9]+s[12])*6 + (s[ 8]+s[13])*3 - (s[ 7]+s[14]));\
        OP(dst[11*dstStride], (s[11]+s[12])*20 - (s[10]+s[13])*6 + (s[ 9]+s[14])*3 - (s[ 8]+s[15]));\
        OP(dst[12*dstStride], (s[12]+s[13])*20 - (s[11]+s[14])*6 + (s[10]+s[15])*3 - (s[ 9]+s[16]));\
        OP(dst[13*dstStride], (s[13]+s[14])*20 - (s[12]+s[15])*6 + (s[11]+s[16])*3 - (s[10]+s[16]));\
        OP(dst[14*dstStride], (s[14]+s[15])*20 - (s[13]+s[16])*6 + (s[12]+s[16])*3 - (s[11]+s[15]));\
        OP(dst[15*dstStride], (s[15]+s[16])*20 - (s[14]+s[16])*6 + (s[13]+s[15])*3 - (s[12]+s[14]));\
    }\
}\
\
/* ---- 8x8 entry points. "ref" is a 9x9 copy of the source with row pitch 16, */\
/* ---- "filtH" holds 9 horizontally filtered rows with row pitch 8. */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filt[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(filt, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, filt, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filt[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(filt, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, filt, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t filt[64];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filt, ref, 8, 16);\
    OPNAME ## pixels8_l2(dst, ref, filt, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    copy_block9(ref, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, ref, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t filt[64];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filt, ref, 8, 16);\
    OPNAME ## pixels8_l2(dst, ref+16, filt, stride, 16, 8, 8);\
}\
/* The *_old_c variants have external linkage. They filter H, V and HV */\
/* separately and hand all planes to a single pixels8_l4 / pixels8_l2 call. */\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t filtH[72];\
    uint8_t filtV[64];\
    uint8_t filtHV[64];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, ref, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtV, ref, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l4(dst, ref, filtH, filtV, filtHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t filtH[72];\
    uint8_t filtHV[64];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, ref, 8, 16, 9);\
    /* filtH is overwritten in place with its pixels8_l2 combination with ref */\
    put ## RND ## pixels8_l2(filtH, filtH, ref, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l2(dst, filtH, filtHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t filtH[72];\
    uint8_t filtV[64];\
    uint8_t filtHV[64];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, ref, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtV, ref+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l4(dst, ref+1, filtH, filtV, filtHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t filtH[72];\
    uint8_t filtHV[64];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, ref, 8, 16, 9);\
    put ## RND ## pixels8_l2(filtH, filtH, ref+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l2(dst, filtH, filtHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t filtH[72];\
    uint8_t filtV[64];\
    uint8_t filtHV[64];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, ref, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtV, ref, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l4(dst, ref+16, filtH+8, filtV, filtHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t filtH[72];\
    uint8_t filtHV[64];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, ref, 8, 16, 9);\
    put ## RND ## pixels8_l2(filtH, filtH, ref, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    /* filtH+8 skips the first filtered row */\
    OPNAME ## pixels8_l2(dst, filtH+8, filtHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t filtH[72];\
    uint8_t filtV[64];\
    uint8_t filtHV[64];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, ref, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtV, ref+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l4(dst, ref+17, filtH+8, filtV, filtHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t filtH[72];\
    uint8_t filtHV[64];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, ref, 8, 16, 9);\
    put ## RND ## pixels8_l2(filtH, filtH, ref+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l2(dst, filtH+8, filtHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filtH[72];\
    uint8_t filtHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l2(dst, filtH, filtHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filtH[72];\
    uint8_t filtHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l2(dst, filtH+8, filtHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t filtH[72];\
    uint8_t filtV[64];\
    uint8_t filtHV[64];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, ref, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtV, ref, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l2(dst, filtV, filtHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t filtH[72];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, ref, 8, 16, 9);\
    put ## RND ## pixels8_l2(filtH, filtH, ref, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, filtH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t filtH[72];\
    uint8_t filtV[64];\
    uint8_t filtHV[64];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, ref, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtV, ref+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l2(dst, filtV, filtHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[16*9];\
    uint8_t filtH[72];\
    copy_block9(ref, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, ref, 8, 16, 9);\
    put ## RND ## pixels8_l2(filtH, filtH, ref+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, filtH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filtH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, filtH, stride, 8);\
}\
/* ---- 16x16 entry points. "ref" is a 17x17 copy of the source with row pitch 24, */\
/* ---- "filtH" holds 17 horizontally filtered rows with row pitch 16. */\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filt[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(filt, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, filt, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filt[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(filt, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, filt, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t filt[256];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filt, ref, 16, 24);\
    OPNAME ## pixels16_l2(dst, ref, filt, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    copy_block17(ref, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, ref, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t filt[256];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filt, ref, 16, 24);\
    OPNAME ## pixels16_l2(dst, ref+24, filt, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t filtH[272];\
    uint8_t filtV[256];\
    uint8_t filtHV[256];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, ref, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtV, ref, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l4(dst, ref, filtH, filtV, filtHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t filtH[272];\
    uint8_t filtHV[256];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, ref, 16, 24, 17);\
    put ## RND ## pixels16_l2(filtH, filtH, ref, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l2(dst, filtH, filtHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t filtH[272];\
    uint8_t filtV[256];\
    uint8_t filtHV[256];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, ref, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtV, ref+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l4(dst, ref+1, filtH, filtV, filtHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t filtH[272];\
    uint8_t filtHV[256];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, ref, 16, 24, 17);\
    put ## RND ## pixels16_l2(filtH, filtH, ref+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l2(dst, filtH, filtHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t filtH[272];\
    uint8_t filtV[256];\
    uint8_t filtHV[256];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, ref, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtV, ref, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l4(dst, ref+24, filtH+16, filtV, filtHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t filtH[272];\
    uint8_t filtHV[256];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, ref, 16, 24, 17);\
    put ## RND ## pixels16_l2(filtH, filtH, ref, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l2(dst, filtH+16, filtHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t filtH[272];\
    uint8_t filtV[256];\
    uint8_t filtHV[256];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, ref, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtV, ref+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l4(dst, ref+25, filtH+16, filtV, filtHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t filtH[272];\
    uint8_t filtHV[256];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, ref, 16, 24, 17);\
    put ## RND ## pixels16_l2(filtH, filtH, ref+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l2(dst, filtH+16, filtHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filtH[272];\
    uint8_t filtHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l2(dst, filtH, filtHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filtH[272];\
    uint8_t filtHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l2(dst, filtH+16, filtHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t filtH[272];\
    uint8_t filtV[256];\
    uint8_t filtHV[256];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, ref, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtV, ref, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l2(dst, filtV, filtHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t filtH[272];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, ref, 16, 24, 17);\
    put ## RND ## pixels16_l2(filtH, filtH, ref, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, filtH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t filtH[272];\
    uint8_t filtV[256];\
    uint8_t filtHV[256];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, ref, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtV, ref+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l2(dst, filtV, filtHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t ref[24*17];\
    uint8_t filtH[272];\
    copy_block17(ref, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, ref, 16, 24, 17);\
    put ## RND ## pixels16_l2(filtH, filtH, ref+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, filtH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filtH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, filtH, stride, 16);\
}
44eb4951 2215
b3184779
MN
2216#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2217#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2218#define op_put(a, b) a = cm[((b) + 16)>>5]
2219#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2220
2221QPEL_MC(0, put_ , _ , op_put)
2222QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2223QPEL_MC(0, avg_ , _ , op_avg)
2224//QPEL_MC(1, avg_no_rnd , _ , op_avg)
2225#undef op_avg
2226#undef op_avg_no_rnd
2227#undef op_put
2228#undef op_put_no_rnd
44eb4951 2229
0da71265
MN
2230#if 1
2231#define H264_LOWPASS(OPNAME, OP, OP2) \
bb5705b9 2232static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
80e44bc3 2233 const int h=2;\
55fde95e 2234 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
80e44bc3
MN
2235 int i;\
2236 for(i=0; i<h; i++)\
2237 {\
2238 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2239 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2240 dst+=dstStride;\
2241 src+=srcStride;\
2242 }\
2243}\
2244\
bb5705b9 2245static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
80e44bc3 2246 const int w=2;\
55fde95e 2247 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
80e44bc3
MN
2248 int i;\
2249 for(i=0; i<w; i++)\
2250 {\
2251 const int srcB= src[-2*srcStride];\
2252 const int srcA= src[-1*srcStride];\
2253 const int src0= src[0 *srcStride];\
2254 const int src1= src[1 *srcStride];\
2255 const int src2= src[2 *srcStride];\
2256 const int src3= src[3 *srcStride];\
2257 const int src4= src[4 *srcStride];\
2258 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2259 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2260 dst++;\
2261 src++;\
2262 }\
2263}\
2264\
bb5705b9 2265static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
80e44bc3
MN
2266 const int h=2;\
2267 const int w=2;\
55fde95e 2268 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
80e44bc3
MN
2269 int i;\
2270 src -= 2*srcStride;\
2271 for(i=0; i<h+5; i++)\
2272 {\
2273 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2274 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2275 tmp+=tmpStride;\
2276 src+=srcStride;\
2277 }\
2278 tmp -= tmpStride*(h+5-2);\
2279 for(i=0; i<w; i++)\
2280 {\
2281 const int tmpB= tmp[-2*tmpStride];\
2282 const int tmpA= tmp[-1*tmpStride];\
2283 const int tmp0= tmp[0 *tmpStride];\
2284 const int tmp1= tmp[1 *tmpStride];\
2285 const int tmp2= tmp[2 *tmpStride];\
2286 const int tmp3= tmp[3 *tmpStride];\
2287 const int tmp4= tmp[4 *tmpStride];\
2288 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2289 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2290 dst++;\
2291 tmp++;\
2292 }\
2293}\
0da71265
MN
2294static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2295 const int h=4;\
55fde95e 2296 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
0da71265
MN
2297 int i;\
2298 for(i=0; i<h; i++)\
2299 {\
2300 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2301 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2302 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2303 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2304 dst+=dstStride;\
2305 src+=srcStride;\
2306 }\
2307}\
2308\
2309static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2310 const int w=4;\
55fde95e 2311 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
0da71265
MN
2312 int i;\
2313 for(i=0; i<w; i++)\
2314 {\
2315 const int srcB= src[-2*srcStride];\
2316 const int srcA= src[-1*srcStride];\
2317 const int src0= src[0 *srcStride];\
2318 const int src1= src[1 *srcStride];\
2319 const int src2= src[2 *srcStride];\
2320 const int src3= src[3 *srcStride];\
2321 const int src4= src[4 *srcStride];\
2322 const int src5= src[5 *srcStride];\
2323 const int src6= src[6 *srcStride];\
2324 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2325 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2326 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2327 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2328 dst++;\
2329 src++;\
2330 }\
2331}\
2332\
2333static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2334 const int h=4;\
2335 const int w=4;\
55fde95e 2336 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
0da71265
MN
2337 int i;\
2338 src -= 2*srcStride;\
2339 for(i=0; i<h+5; i++)\
2340 {\
2341 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2342 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2343 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2344 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2345 tmp+=tmpStride;\
2346 src+=srcStride;\
2347 }\
2348 tmp -= tmpStride*(h+5-2);\
2349 for(i=0; i<w; i++)\
2350 {\
2351 const int tmpB= tmp[-2*tmpStride];\
2352 const int tmpA= tmp[-1*tmpStride];\
2353 const int tmp0= tmp[0 *tmpStride];\
2354 const int tmp1= tmp[1 *tmpStride];\
2355 const int tmp2= tmp[2 *tmpStride];\
2356 const int tmp3= tmp[3 *tmpStride];\
2357 const int tmp4= tmp[4 *tmpStride];\
2358 const int tmp5= tmp[5 *tmpStride];\
2359 const int tmp6= tmp[6 *tmpStride];\
2360 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2361 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2362 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2363 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2364 dst++;\
2365 tmp++;\
2366 }\
2367}\
2368\
2369static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2370 const int h=8;\
55fde95e 2371 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
0da71265
MN
2372 int i;\
2373 for(i=0; i<h; i++)\
2374 {\
2375 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2376 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2377 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2378 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2379 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2380 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2381 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2382 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2383 dst+=dstStride;\
2384 src+=srcStride;\
2385 }\
2386}\
2387\
2388static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2389 const int w=8;\
55fde95e 2390 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
0da71265
MN
2391 int i;\
2392 for(i=0; i<w; i++)\
2393 {\
2394 const int srcB= src[-2*srcStride];\
2395 const int srcA= src[-1*srcStride];\
2396 const int src0= src[0 *srcStride];\
2397 const int src1= src[1 *srcStride];\
2398 const int src2= src[2 *srcStride];\
2399 const int src3= src[3 *srcStride];\
2400 const int src4= src[4 *srcStride];\
2401 const int src5= src[5 *srcStride];\
2402 const int src6= src[6 *srcStride];\
2403 const int src7= src[7 *srcStride];\
2404 const int src8= src[8 *srcStride];\
2405 const int src9= src[9 *srcStride];\
2406 const int src10=src[10*srcStride];\
2407 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2408 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2409 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2410 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2411 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2412 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2413 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2414 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2415 dst++;\
2416 src++;\
2417 }\
2418}\
2419\
2420static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2421 const int h=8;\
2422 const int w=8;\
55fde95e 2423 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
0da71265
MN
2424 int i;\
2425 src -= 2*srcStride;\
2426 for(i=0; i<h+5; i++)\
2427 {\
2428 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2429 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2430 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2431 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2432 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2433 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2434 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2435 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2436 tmp+=tmpStride;\
2437 src+=srcStride;\
2438 }\
2439 tmp -= tmpStride*(h+5-2);\
2440 for(i=0; i<w; i++)\
2441 {\
2442 const int tmpB= tmp[-2*tmpStride];\
2443 const int tmpA= tmp[-1*tmpStride];\
2444 const int tmp0= tmp[0 *tmpStride];\
2445 const int tmp1= tmp[1 *tmpStride];\
2446 const int tmp2= tmp[2 *tmpStride];\
2447 const int tmp3= tmp[3 *tmpStride];\
2448 const int tmp4= tmp[4 *tmpStride];\
2449 const int tmp5= tmp[5 *tmpStride];\
2450 const int tmp6= tmp[6 *tmpStride];\
2451 const int tmp7= tmp[7 *tmpStride];\
2452 const int tmp8= tmp[8 *tmpStride];\
2453 const int tmp9= tmp[9 *tmpStride];\
2454 const int tmp10=tmp[10*tmpStride];\
2455 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2456 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2457 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2458 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2459 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2460 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2461 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2462 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2463 dst++;\
2464 tmp++;\
2465 }\
2466}\
2467\
2468static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2469 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2470 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2471 src += 8*srcStride;\
2472 dst += 8*dstStride;\
2473 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2474 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2475}\
2476\
2477static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2478 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2479 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2480 src += 8*srcStride;\
2481 dst += 8*dstStride;\
2482 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2483 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2484}\
2485\
2486static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2487 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2488 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2489 src += 8*srcStride;\
0da71265
MN
2490 dst += 8*dstStride;\
2491 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2492 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2493}\
2494
/**
 * Generate the sixteen H.264 quarter-pel motion compensation entry points
 * OPNAME##h264_qpel##SIZE##_mcXY_c for a SIZE x SIZE block.
 *
 * In mcXY, X selects the horizontal and Y the vertical quarter-sample
 * position (0..3). mc00 is a plain copy/average; the other positions are
 * built from the h/v/hv 6-tap lowpass helpers and, for odd positions,
 * a pixels_l2 average of two intermediate planes.
 *
 * Intermediate planes are always produced with the put_ helpers; OPNAME
 * (put_ or avg_) is applied only by the final store into dst.
 *
 * `full` holds SIZE+5 source rows starting two rows above src, so
 * `full_mid` (row 2 of `full`) corresponds to the row src points at.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}

2632#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2633//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2634#define op_put(a, b) a = cm[((b) + 16)>>5]
2635#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2636#define op2_put(a, b) a = cm[((b) + 512)>>10]
2637
2638H264_LOWPASS(put_ , op_put, op2_put)
2639H264_LOWPASS(avg_ , op_avg, op2_avg)
80e44bc3 2640H264_MC(put_, 2)
0da71265
MN
2641H264_MC(put_, 4)
2642H264_MC(put_, 8)
2643H264_MC(put_, 16)
2644H264_MC(avg_, 4)
2645H264_MC(avg_, 8)
2646H264_MC(avg_, 16)
2647
2648#undef op_avg
2649#undef op_put
2650#undef op2_avg
2651#undef op2_put
2652#endif
2653
/* Scale one sample of `block` in place (explicit weighted prediction). */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
/* Blend one sample of `src` into `dst` (bi-directional weighted prediction). */
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/**
 * Generate weight_h264_pixels<W>x<H>_c and biweight_h264_pixels<W>x<H>_c.
 *
 * weight:   block[x] = clip((block[x]*weight + offset') >> log2_denom),
 *           where offset' is offset scaled by 2^log2_denom plus the
 *           rounding term 2^(log2_denom-1) (omitted when log2_denom is 0).
 * biweight: dst[x] = clip((src[x]*weights + dst[x]*weightd + offset')
 *                         >> (log2_denom+1)),
 *           where offset' = ((offset + 1) | 1) * 2^log2_denom.
 *
 * W and H are compile-time constants; the `if(W==n) continue;` tests cut
 * each unrolled row short at the block width.
 *
 * The offset is scaled by multiplication rather than `<<` because offset
 * may be negative and left-shifting a negative int is undefined behaviour.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset *= 1 << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) * (1 << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2723
1457ab52 2724static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
55fde95e 2725 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1457ab52
MN
2726 int i;
2727
2728 for(i=0; i<h; i++){
2729 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2730 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2731 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2732 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2733 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2734 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2735 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2736 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2737 dst+=dstStride;
115329f1 2738 src+=srcStride;
1457ab52
MN
2739 }
2740}
2741
#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * Full-sample (mc00) CAVS motion compensation: forward to the generic
 * put/avg pixel routines with the block height made explicit.
 */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
b482e2d1 2759
da00b525 2760void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
bf4f19dc 2761
#if CONFIG_VC1_DECODER
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * Full-sample (mc00) VC-1 motion compensation on 8x8 blocks.
 * `rnd` is part of the mspel function signature but is not used here.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */
64db55ae 2773
9abc7e0f 2774void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
9abc7e0f 2775
c6b237da 2776/* H264 specific */
edecaff8 2777void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
c6b237da 2778
b250f9c6 2779#if CONFIG_RV30_DECODER
6beb8b26
KS
2780void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
2781#endif /* CONFIG_RV30_DECODER */
2782
#if CONFIG_RV40_DECODER
/*
 * RV40 mc33 position: implemented with the generic xy2 (half-sample in
 * both directions) put/avg pixel routines.
 */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */
2799
1457ab52 2800static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
55fde95e 2801 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1457ab52
MN
2802 int i;
2803
2804 for(i=0; i<w; i++){
2805 const int src_1= src[ -srcStride];
2806 const int src0 = src[0 ];
2807 const int src1 = src[ srcStride];
2808 const int src2 = src[2*srcStride];
2809 const int src3 = src[3*srcStride];
2810 const int src4 = src[4*srcStride];
2811 const int src5 = src[5*srcStride];
2812 const int src6 = src[6*srcStride];
2813 const int src7 = src[7*srcStride];
2814 const int src8 = src[8*srcStride];
2815 const int src9 = src[9*srcStride];
2816 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2817 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2818 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2819 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2820 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2821 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2822 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2823 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2824 src++;
2825 dst++;
2826 }
2827}
2828
/* mspel position (0,0): plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2832
/* mspel position (1,0): average of src and its horizontal half-sample plane. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64]; /* 8x8 horizontally filtered plane, stride 8 */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2838
/* mspel position (2,0): horizontal half-sample plane written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2842
/* mspel position (3,0): average of src+1 and the horizontal half-sample plane. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64]; /* 8x8 horizontally filtered plane, stride 8 */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2848
/* mspel position (0,2): vertical half-sample plane written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2852
/*
 * mspel position (1,2): average of the vertical half-sample plane of src
 * and the horizontal-then-vertical half-sample plane.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 11 rows x 8, starting one row above src */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    /* halfH+8 is the row aligned with src; the filter reads rows -1..9 */
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/*
 * mspel position (3,2): average of the vertical half-sample plane of src+1
 * and the horizontal-then-vertical half-sample plane.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 11 rows x 8, starting one row above src */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    /* halfH+8 is the row aligned with src; the filter reads rows -1..9 */
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel position (2,2): horizontal filter followed by vertical filter. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 11 rows x 8, starting one row above src */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    /* halfH+8 is the row aligned with src; the filter reads rows -1..9 */
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2876
332f9ac4 2877static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
4052cbf1 2878 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
332f9ac4
MN
2879 int x;
2880 const int strength= ff_h263_loop_filter_strength[qscale];
115329f1 2881
332f9ac4
MN
2882 for(x=0; x<8; x++){
2883 int d1, d2, ad1;
2884 int p0= src[x-2*stride];
2885 int p1= src[x-1*stride];
2886 int p2= src[x+0*stride];
2887 int p3= src[x+1*stride];
2888 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2889
2890 if (d<-2*strength) d1= 0;
2891 else if(d<- strength) d1=-2*strength - d;
2892 else if(d< strength) d1= d;
2893 else if(d< 2*strength) d1= 2*strength - d;
2894 else d1= 0;
115329f1 2895
332f9ac4
MN
2896 p1 += d1;
2897 p2 -= d1;
2898 if(p1&256) p1= ~(p1>>31);
2899 if(p2&256) p2= ~(p2>>31);
115329f1 2900
332f9ac4
MN
2901 src[x-1*stride] = p1;
2902 src[x+0*stride] = p2;
2903
c26abfa5 2904 ad1= FFABS(d1)>>1;
115329f1 2905
f66e4f5f 2906 d2= av_clip((p0-p3)/4, -ad1, ad1);
115329f1 2907
332f9ac4
MN
2908 src[x-2*stride] = p0 - d2;
2909 src[x+ stride] = p3 + d2;
2910 }
73f51a4d 2911 }
332f9ac4
MN
2912}
2913
2914static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
4052cbf1 2915 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
332f9ac4
MN
2916 int y;
2917 const int strength= ff_h263_loop_filter_strength[qscale];
115329f1 2918
332f9ac4
MN
2919 for(y=0; y<8; y++){
2920 int d1, d2, ad1;
2921 int p0= src[y*stride-2];
2922 int p1= src[y*stride-1];
2923 int p2= src[y*stride+0];
2924 int p3= src[y*stride+1];
2925 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2926
2927 if (d<-2*strength) d1= 0;
2928 else if(d<- strength) d1=-2*strength - d;
2929 else if(d< strength) d1= d;
2930 else if(d< 2*strength) d1= 2*strength - d;
2931 else d1= 0;
115329f1 2932
332f9ac4
MN
2933 p1 += d1;
2934 p2 -= d1;
2935 if(p1&256) p1= ~(p1>>31);
2936 if(p2&256) p2= ~(p2>>31);
115329f1 2937
332f9ac4
MN
2938 src[y*stride-1] = p1;
2939 src[y*stride+0] = p2;
2940
c26abfa5 2941 ad1= FFABS(d1)>>1;
115329f1 2942
f66e4f5f 2943 d2= av_clip((p0-p3)/4, -ad1, ad1);
115329f1 2944
332f9ac4
MN
2945 src[y*stride-2] = p0 - d2;
2946 src[y*stride+1] = p3 + d2;
2947 }
73f51a4d 2948 }
332f9ac4 2949}
1457ab52 2950
/**
 * H.261 in-loop filter: separable (1, 2, 1)/4 smoothing of an 8x8 block,
 * in place. Samples on the block border are not filtered in the direction
 * that would reach outside the block.
 *
 * @param src    top-left sample of the 8x8 block
 * @param stride distance between rows of src
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y;
    int temp[64]; /* vertically filtered block, scaled by 4, row length 8 */

    /* top and bottom rows: no vertical filtering, only the x4 scale */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    /* interior rows: vertical (1, 2, 1) */
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            const int xy = y * stride + x;
            const int yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        /* left and right columns: only undo the x4 scale, with rounding */
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        /* interior columns: horizontal (1, 2, 1), total gain 16 */
        for(x=1; x<7; x++){
            const int xy = y * stride + x;
            const int yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
2977
3f50965b 2978static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
42251a2a
LM
2979{
2980 int i, d;
2981 for( i = 0; i < 4; i++ ) {
2982 if( tc0[i] < 0 ) {
2983 pix += 4*ystride;
2984 continue;
2985 }
2986 for( d = 0; d < 4; d++ ) {
2987 const int p0 = pix[-1*xstride];
2988 const int p1 = pix[-2*xstride];
2989 const int p2 = pix[-3*xstride];
2990 const int q0 = pix[0];
2991 const int q1 = pix[1*xstride];
2992 const int q2 = pix[2*xstride];
115329f1 2993
c26abfa5
DB
2994 if( FFABS( p0 - q0 ) < alpha &&
2995 FFABS( p1 - p0 ) < beta &&
2996 FFABS( q1 - q0 ) < beta ) {
115329f1 2997
42251a2a
LM
2998 int tc = tc0[i];
2999 int i_delta;
115329f1 3000
c26abfa5 3001 if( FFABS( p2 - p0 ) < beta ) {
c9640c17 3002 if(tc0[i])
f66e4f5f 3003 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
42251a2a
LM
3004 tc++;
3005 }
c26abfa5 3006 if( FFABS( q2 - q0 ) < beta ) {
c9640c17 3007 if(tc0[i])
f66e4f5f 3008 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
42251a2a
LM
3009 tc++;
3010 }
115329f1 3011
f66e4f5f
RD
3012 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3013 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
3014 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
42251a2a
LM
3015 }
3016 pix += ystride;
3017 }
3018 }
3019}
5cf08f23 3020static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
42251a2a
LM
3021{
3022 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
3023}
5cf08f23 3024static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
42251a2a
LM
3025{
3026 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
3027}
3028
3f50965b 3029static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
712ca84c
JGG
3030{
3031 int d;
3032 for( d = 0; d < 16; d++ ) {
3033 const int p2 = pix[-3*xstride];
3034 const int p1 = pix[-2*xstride];
3035 const int p0 = pix[-1*xstride];
3036
3037 const int q0 = pix[ 0*xstride];
3038 const int q1 = pix[ 1*xstride];
3039 const int q2 = pix[ 2*xstride];
3040
3041 if( FFABS( p0 - q0 ) < alpha &&
3042 FFABS( p1 - p0 ) < beta &&
3043 FFABS( q1 - q0 ) < beta ) {
3044
3045 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
3046 if( FFABS( p2 - p0 ) < beta)
3047 {
3048 const int p3 = pix[-4*xstride];
3049 /* p0', p1', p2' */
3050 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
3051 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
3052 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
3053 } else {
3054 /* p0' */
3055 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3056 }
3057 if( FFABS( q2 - q0 ) < beta)
3058 {
3059 const int q3 = pix[3*xstride];
3060 /* q0', q1', q2' */
3061 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
3062 pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
3063 pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
3064 } else {
3065 /* q0' */
3066 pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3067 }
3068 }else{
3069 /* p0', q0' */
3070 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3071 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3072 }
3073 }
3074 pix += ystride;
3075 }
3076}
3077static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3078{
3079 h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
3080}
3081static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3082{
3083 h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
3084}
3085
3f50965b 3086static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
42251a2a
LM
3087{
3088 int i, d;
3089 for( i = 0; i < 4; i++ ) {
3090 const int tc = tc0[i];
3091 if( tc <= 0 ) {
3092 pix += 2*ystride;
3093 continue;
3094 }
3095 for( d = 0; d < 2; d++ ) {
3096 const int p0 = pix[-1*xstride];
3097 const int p1 = pix[-2*xstride];
3098 const int q0 = pix[0];
3099 const int q1 = pix[1*xstride];
3100
c26abfa5
DB
3101 if( FFABS( p0 - q0 ) < alpha &&
3102 FFABS( p1 - p0 ) < beta &&
3103 FFABS( q1 - q0 ) < beta ) {
42251a2a 3104
f66e4f5f 3105 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
42251a2a 3106
f66e4f5f
RD
3107 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
3108 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
42251a2a
LM
3109 }
3110 pix += ystride;
3111 }
3112 }
3113}
5cf08f23 3114static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
42251a2a
LM
3115{
3116 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
3117}
5cf08f23 3118static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
42251a2a
LM
3119{
3120 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
3121}
3122
3f50965b 3123static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
5cf08f23
LM
3124{
3125 int d;
3126 for( d = 0; d < 8; d++ ) {
3127 const int p0 = pix[-1*xstride];
3128 const int p1 = pix[-2*xstride];
3129 const int q0 = pix[0];
3130 const int q1 = pix[1*xstride];
3131
c26abfa5
DB
3132 if( FFABS( p0 - q0 ) < alpha &&
3133 FFABS( p1 - p0 ) < beta &&
3134 FFABS( q1 - q0 ) < beta ) {
5cf08f23
LM
3135
3136 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
3137 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
3138 }
3139 pix += ystride;
3140 }
3141}
3142static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3143{
3144 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3145}
3146static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3147{
3148 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
3149}
3150
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 16 x h samples
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    int i, x;

    for(i=0;i<h;i++) {
        for(x=0;x<16;x++)
            s += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
3178
/**
 * Sum of absolute differences between a 16-wide block and the horizontal
 * half-sample interpolation of a reference block.
 *
 * Each reference sample is avg2() of pix2[x] and pix2[x+1], so 17 columns
 * of pix2 are read per row.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    int i, x;

    for(i=0;i<h;i++) {
        for(x=0;x<16;x++)
            s += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
3206
/**
 * Sum of absolute differences between a 16-wide block and the vertical
 * half-sample interpolation of a reference block.
 *
 * Each reference sample is avg2() of pix2[x] and the sample one row below
 * it, so h+1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    int i, x;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    for(i=0;i<h;i++) {
        for(x=0;x<16;x++)
            s += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
3236
/**
 * Sum of absolute differences between a 16-wide block and the diagonal
 * half-sample interpolation of a reference block.
 *
 * Each reference sample is avg4() of the 2x2 neighbourhood starting at
 * pix2[x], so 17 columns and h+1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    int i, x;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    for(i=0;i<h;i++) {
        for(x=0;x<16;x++)
            s += abs(pix1[x] - avg4(pix2[x], pix2[x+1], pix3[x], pix3[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
3266
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block, 8 samples per row
 * @param pix2      second block, 8 samples per row
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference (0 when h <= 0)
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - pix2[col]);

        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3286
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * block that is interpolated between each sample and its right neighbour.
 *
 * Each reference sample is avg2() of two horizontally adjacent samples of
 * pix2 (avg2 is defined elsewhere in this file; presumably a rounded mean of
 * its two arguments -- confirm against its definition).
 *
 * @param v         unused
 * @param pix1      block to compare, 8 samples per row
 * @param pix2      reference block; 9 samples per row are read
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));

        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3306
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * block that is interpolated between each row and the row below it.
 *
 * Each reference sample is avg2() of pix2 at the current row and at the next
 * row (avg2 is defined elsewhere in this file; presumably a rounded mean of
 * its two arguments -- confirm against its definition).
 *
 * @param v         unused
 * @param pix1      block to compare, 8 samples per row
 * @param pix2      reference block; row h (one past the block) is read as well
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg2(pix2[col], below[col]));

        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
3328
/**
 * Sum of absolute differences between an 8-pixel-wide block and a reference
 * block that is interpolated both horizontally and vertically.
 *
 * Each reference sample is avg4() of the 2x2 neighbourhood whose top-left
 * corner is the current position in pix2 (avg4 is defined elsewhere in this
 * file; presumably a rounded mean of its four arguments -- confirm against
 * its definition).
 *
 * @param v         unused
 * @param pix1      block to compare, 8 samples per row
 * @param pix2      reference block; 9 samples per row and row h are read
 * @param line_size distance in bytes between consecutive rows of both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            total += abs(pix1[col] - avg4(pix2[col], pix2[col + 1], below[col], below[col + 1]));

        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
3350
bf4e3bd2
MR
3351static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3352 MpegEncContext *c = v;
e6a2ac34
MN
3353 int score1=0;
3354 int score2=0;
3355 int x,y;
d4c5d2ad 3356
e6a2ac34
MN
3357 for(y=0; y<h; y++){
3358 for(x=0; x<16; x++){
3359 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3360 }
3361 if(y+1<h){
3362 for(x=0; x<15; x++){
c26abfa5 3363 score2+= FFABS( s1[x ] - s1[x +stride]
e6a2ac34 3364 - s1[x+1] + s1[x+1+stride])
c26abfa5 3365 -FFABS( s2[x ] - s2[x +stride]
e6a2ac34
MN
3366 - s2[x+1] + s2[x+1+stride]);
3367 }
3368 }
3369 s1+= stride;
3370 s2+= stride;
3371 }
d4c5d2ad 3372
c26abfa5
DB
3373 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3374 else return score1 + FFABS(score2)*8;
e6a2ac34
MN
3375}
3376
bf4e3bd2
MR
3377static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3378 MpegEncContext *c = v;
e6a2ac34
MN
3379 int score1=0;
3380 int score2=0;
3381 int x,y;
115329f1 3382
e6a2ac34
MN
3383 for(y=0; y<h; y++){
3384 for(x=0; x<8; x++){
3385 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3386 }
3387 if(y+1<h){
3388 for(x=0; x<7; x++){
c26abfa5 3389 score2+= FFABS( s1[x ] - s1[x +stride]
e6a2ac34 3390 - s1[x+1] + s1[x+1+stride])
c26abfa5 3391 -FFABS( s2[x ] - s2[x +stride]
e6a2ac34
MN
3392 - s2[x+1] + s2[x+1+stride]);
3393 }
3394 }
3395 s1+= stride;
3396 s2+= stride;
3397 }
115329f1 3398
c26abfa5
DB
3399 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3400 else return score1 + FFABS(score2)*8;
e6a2ac34
MN
3401}
3402
364a1797
MN
3403static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3404 int i;
3405 unsigned int sum=0;
3406
3407 for(i=0; i<8*8; i++){
3408 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3409 int w= weight[i];
3410 b>>= RECON_SHIFT;
3411 assert(-512<b && b<512);
3412
3413 sum += (w*b)*(w*b)>>4;
3414 }
3415 return sum>>2;
3416}
3417
3418static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3419 int i;
3420
3421 for(i=0; i<8*8; i++){
3422 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
115329f1 3423 }
364a1797
MN
3424}
3425
a9badb51
MN
3426/**
3427 * permutes an 8x8 block.
2a5700de 3428 * @param block the block which will be permuted according to the given permutation vector
a9badb51
MN
3429 * @param permutation the permutation vector
3430 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
115329f1 3431 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2a5700de 3432 * (inverse) permutated to scantable order!
a9badb51 3433 */
0c1a9eda 3434void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
d962f6fd 3435{
7801d21d 3436 int i;
477ab036 3437 DCTELEM temp[64];
115329f1 3438
7801d21d 3439 if(last<=0) return;
90b5b51e 3440 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
d962f6fd 3441
7801d21d
MN
3442 for(i=0; i<=last; i++){
3443 const int j= scantable[i];
3444 temp[j]= block[j];
3445 block[j]=0;
3446 }
115329f1 3447
7801d21d
MN
3448 for(i=0; i<=last; i++){
3449 const int j= scantable[i];
3450 const int perm_j= permutation[j];
3451 block[perm_j]= temp[j];
3452 }
d962f6fd 3453}
e0eac44e 3454
622348f9
MN
/**
 * Comparison function that ignores all of its arguments and always
 * returns a score of 0.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
3458
3459void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3460 int i;
115329f1 3461
3899eb2f 3462 memset(cmp, 0, sizeof(void*)*6);
115329f1 3463
3899eb2f 3464 for(i=0; i<6; i++){
622348f9
MN
3465 switch(type&0xFF){
3466 case FF_CMP_SAD:
3467 cmp[i]= c->sad[i];
3468 break;
3469 case FF_CMP_SATD:
3470 cmp[i]= c->hadamard8_diff[i];
3471 break;
3472 case FF_CMP_SSE:
3473 cmp[i]= c->sse[i];
3474 break;
3475 case FF_CMP_DCT:
3476 cmp[i]= c->dct_sad[i];
3477 break;
27c61ac5
MN
3478 case FF_CMP_DCT264:
3479 cmp[i]= c->dct264_sad[i];
3480 break;
0fd6aea1
MN
3481 case FF_CMP_DCTMAX:
3482 cmp[i]= c->dct_max[i];
3483 break;
622348f9
MN
3484 case FF_CMP_PSNR:
3485 cmp[i]= c->quant_psnr[i];
3486 break;
3487 case FF_CMP_BIT:
3488 cmp[i]= c->bit[i];
3489 break;
3490 case FF_CMP_RD:
3491 cmp[i]= c->rd[i];
3492 break;
3493 case FF_CMP_VSAD:
3494 cmp[i]= c->vsad[i];
3495 break;
3496 case FF_CMP_VSSE:
3497 cmp[i]= c->vsse[i];
3498 break;
3499 case FF_CMP_ZERO:
3500 cmp[i]= zero_cmp;
3501 break;
e6a2ac34
MN
3502 case FF_CMP_NSSE:
3503 cmp[i]= c->nsse[i];
3504 break;
b250f9c6 3505#if CONFIG_SNOW_ENCODER
26efc54e
MN
3506 case FF_CMP_W53:
3507 cmp[i]= c->w53[i];
3508 break;
3509 case FF_CMP_W97:
3510 cmp[i]= c->w97[i];
3511 break;
3a6fc8fa 3512#endif
622348f9
MN
3513 default:
3514 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3515 }
3516 }
3517}
3518
5fecfb7d
LM
3519static void clear_block_c(DCTELEM *block)
3520{
3521 memset(block, 0, sizeof(DCTELEM)*64);
3522}
3523
2a5700de
MN
3524/**
3525 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3526 */
eb4b3dd3 3527static void clear_blocks_c(DCTELEM *blocks)
649c00c9
MN
3528{
3529 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3530}
3531
11f18faf 3532static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
469bd7b1
LM
3533 long i;
3534 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3535 long a = *(long*)(src+i);
3536 long b = *(long*)(dst+i);
3537 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
11f18faf
MN
3538 }
3539 for(; i<w; i++)
3540 dst[i+0] += src[i+0];
3541}
3542
4a9ca0a2 3543static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
469bd7b1 3544 long i;
4a9ca0a2
LM
3545 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3546 long a = *(long*)(src1+i);
3547 long b = *(long*)(src2+i);