Clarify Visual Studio FAQ.
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
8f2ab833 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
de6d9b64 5 *
7b94177e
DB
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7 *
b78e7197
DB
8 * This file is part of FFmpeg.
9 *
10 * FFmpeg is free software; you can redistribute it and/or
ff4ec49e
FB
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
b78e7197 13 * version 2.1 of the License, or (at your option) any later version.
de6d9b64 14 *
b78e7197 15 * FFmpeg is distributed in the hope that it will be useful,
de6d9b64 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
de6d9b64 19 *
ff4ec49e 20 * You should have received a copy of the GNU Lesser General Public
b78e7197 21 * License along with FFmpeg; if not, write to the Free Software
5509bffa 22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
de6d9b64 23 */
115329f1 24
983e3246
MN
25/**
26 * @file dsputil.c
27 * DSP utils
28 */
115329f1 29
de6d9b64
FB
30#include "avcodec.h"
31#include "dsputil.h"
1457ab52 32#include "mpegvideo.h"
b0368839 33#include "simple_idct.h"
65e4c8c9 34#include "faandct.h"
eb75a698 35#include "h263.h"
059715a4 36#include "snow.h"
5596c60c 37
88730be6
MR
38/* snow.c */
39void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
40
2dac4acf
LM
41/* vorbis.c */
42void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
43
6810b93a
LM
44/* flacenc.c */
45void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
46
55fde95e 47uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
1d503957 48uint32_t ff_squareTbl[512] = {0, };
de6d9b64 49
0c1a9eda 50const uint8_t ff_zigzag_direct[64] = {
2ad1516a
MN
51 0, 1, 8, 16, 9, 2, 3, 10,
52 17, 24, 32, 25, 18, 11, 4, 5,
e0eac44e 53 12, 19, 26, 33, 40, 48, 41, 34,
2ad1516a 54 27, 20, 13, 6, 7, 14, 21, 28,
e0eac44e
FB
55 35, 42, 49, 56, 57, 50, 43, 36,
56 29, 22, 15, 23, 30, 37, 44, 51,
57 58, 59, 52, 45, 38, 31, 39, 46,
58 53, 60, 61, 54, 47, 55, 62, 63
59};
60
10acc479
RS
61/* Specific zigzag scan for 248 idct. NOTE that unlike the
62 specification, we interleave the fields */
63const uint8_t ff_zigzag248_direct[64] = {
64 0, 8, 1, 9, 16, 24, 2, 10,
65 17, 25, 32, 40, 48, 56, 33, 41,
66 18, 26, 3, 11, 4, 12, 19, 27,
67 34, 42, 49, 57, 50, 58, 35, 43,
68 20, 28, 5, 13, 6, 14, 21, 29,
69 36, 44, 51, 59, 52, 60, 37, 45,
70 22, 30, 7, 15, 23, 31, 38, 46,
71 53, 61, 54, 62, 39, 47, 55, 63,
72};
73
2f349de2 74/* non-permuted inverse of zigzag_direct, plus 1, for the MMX quantizer */
486497e0 75DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
2f349de2 76
0c1a9eda 77const uint8_t ff_alternate_horizontal_scan[64] = {
115329f1 78 0, 1, 2, 3, 8, 9, 16, 17,
e0eac44e 79 10, 11, 4, 5, 6, 7, 15, 14,
115329f1 80 13, 12, 19, 18, 24, 25, 32, 33,
e0eac44e 81 26, 27, 20, 21, 22, 23, 28, 29,
115329f1 82 30, 31, 34, 35, 40, 41, 48, 49,
e0eac44e 83 42, 43, 36, 37, 38, 39, 44, 45,
115329f1 84 46, 47, 50, 51, 56, 57, 58, 59,
e0eac44e
FB
85 52, 53, 54, 55, 60, 61, 62, 63,
86};
87
0c1a9eda 88const uint8_t ff_alternate_vertical_scan[64] = {
115329f1 89 0, 8, 16, 24, 1, 9, 2, 10,
e0eac44e 90 17, 25, 32, 40, 48, 56, 57, 49,
115329f1 91 41, 33, 26, 18, 3, 11, 4, 12,
e0eac44e 92 19, 27, 34, 42, 50, 58, 35, 43,
115329f1 93 51, 59, 20, 28, 5, 13, 6, 14,
e0eac44e 94 21, 29, 36, 44, 52, 60, 37, 45,
115329f1 95 53, 61, 22, 30, 7, 15, 23, 31,
e0eac44e
FB
96 38, 46, 54, 62, 39, 47, 55, 63,
97};
98
2f349de2 99/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
36cd3069 100const uint32_t ff_inverse[256]={
115329f1
DB
101 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
102 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
103 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
104 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
105 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
106 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
107 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
108 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
109 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
110 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
111 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
112 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
113 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
114 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
115 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
116 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
117 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
118 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
119 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
120 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
121 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
122 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
123 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
124 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
125 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
126 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
127 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
128 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
129 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
130 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
131 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
2f349de2
MN
132 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
133};
134
b0368839
MN
135/* Input permutation for the simple_idct_mmx */
136static const uint8_t simple_mmx_permutation[64]={
bb270c08
DB
137 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
138 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
139 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
140 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
141 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
142 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
143 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
144 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
b0368839
MN
145};
146
/**
 * Add up every pixel value of a 16x16 block.
 *
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size offset in bytes from the start of one row to the next
 * @return the sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        const uint8_t *line = pix + row * line_size;

        for (col = 0; col < 16; col++)
            total += line[col];
    }
    return total;
}
168
0c1a9eda 169static int pix_norm1_c(uint8_t * pix, int line_size)
3aa102be
MN
170{
171 int s, i, j;
1d503957 172 uint32_t *sq = ff_squareTbl + 256;
3aa102be
MN
173
174 s = 0;
175 for (i = 0; i < 16; i++) {
bb270c08 176 for (j = 0; j < 16; j += 8) {
2a006cd3 177#if 0
bb270c08
DB
178 s += sq[pix[0]];
179 s += sq[pix[1]];
180 s += sq[pix[2]];
181 s += sq[pix[3]];
182 s += sq[pix[4]];
183 s += sq[pix[5]];
184 s += sq[pix[6]];
185 s += sq[pix[7]];
2a006cd3
FL
186#else
187#if LONG_MAX > 2147483647
bb270c08
DB
188 register uint64_t x=*(uint64_t*)pix;
189 s += sq[x&0xff];
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
2a006cd3
FL
193 s += sq[(x>>32)&0xff];
194 s += sq[(x>>40)&0xff];
195 s += sq[(x>>48)&0xff];
196 s += sq[(x>>56)&0xff];
197#else
bb270c08
DB
198 register uint32_t x=*(uint32_t*)pix;
199 s += sq[x&0xff];
200 s += sq[(x>>8)&0xff];
201 s += sq[(x>>16)&0xff];
202 s += sq[(x>>24)&0xff];
2a006cd3
FL
203 x=*(uint32_t*)(pix+4);
204 s += sq[x&0xff];
205 s += sq[(x>>8)&0xff];
206 s += sq[(x>>16)&0xff];
207 s += sq[(x>>24)&0xff];
208#endif
209#endif
bb270c08
DB
210 pix += 8;
211 }
212 pix += line_size - 16;
3aa102be
MN
213 }
214 return s;
215}
216
3d2e8cce
MN
/**
 * Apply bswap_32() to w consecutive 32-bit words.
 *
 * Words are processed in increasing index order; nothing is written when
 * w is zero or negative.
 *
 * @param dst destination array, at least w words
 * @param src source array, at least w words
 * @param w   number of 32-bit words to convert
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n;

    for (n = 0; n < w; n++)
        dst[n] = bswap_32(src[n]);
}
3aa102be 234
26efc54e
MN
235static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
236{
237 int s, i;
1d503957 238 uint32_t *sq = ff_squareTbl + 256;
26efc54e
MN
239
240 s = 0;
241 for (i = 0; i < h; i++) {
242 s += sq[pix1[0] - pix2[0]];
243 s += sq[pix1[1] - pix2[1]];
244 s += sq[pix1[2] - pix2[2]];
245 s += sq[pix1[3] - pix2[3]];
246 pix1 += line_size;
247 pix2 += line_size;
248 }
249 return s;
250}
251
bb198e19 252static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
1457ab52
MN
253{
254 int s, i;
1d503957 255 uint32_t *sq = ff_squareTbl + 256;
1457ab52
MN
256
257 s = 0;
bb198e19 258 for (i = 0; i < h; i++) {
1457ab52
MN
259 s += sq[pix1[0] - pix2[0]];
260 s += sq[pix1[1] - pix2[1]];
261 s += sq[pix1[2] - pix2[2]];
262 s += sq[pix1[3] - pix2[3]];
263 s += sq[pix1[4] - pix2[4]];
264 s += sq[pix1[5] - pix2[5]];
265 s += sq[pix1[6] - pix2[6]];
266 s += sq[pix1[7] - pix2[7]];
267 pix1 += line_size;
268 pix2 += line_size;
269 }
270 return s;
271}
272
bb198e19 273static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
9c76bd48 274{
6b026927 275 int s, i;
1d503957 276 uint32_t *sq = ff_squareTbl + 256;
9c76bd48
BF
277
278 s = 0;
bb198e19 279 for (i = 0; i < h; i++) {
6b026927
FH
280 s += sq[pix1[ 0] - pix2[ 0]];
281 s += sq[pix1[ 1] - pix2[ 1]];
282 s += sq[pix1[ 2] - pix2[ 2]];
283 s += sq[pix1[ 3] - pix2[ 3]];
284 s += sq[pix1[ 4] - pix2[ 4]];
285 s += sq[pix1[ 5] - pix2[ 5]];
286 s += sq[pix1[ 6] - pix2[ 6]];
287 s += sq[pix1[ 7] - pix2[ 7]];
288 s += sq[pix1[ 8] - pix2[ 8]];
289 s += sq[pix1[ 9] - pix2[ 9]];
290 s += sq[pix1[10] - pix2[10]];
291 s += sq[pix1[11] - pix2[11]];
292 s += sq[pix1[12] - pix2[12]];
293 s += sq[pix1[13] - pix2[13]];
294 s += sq[pix1[14] - pix2[14]];
295 s += sq[pix1[15] - pix2[15]];
2a006cd3 296
6b026927
FH
297 pix1 += line_size;
298 pix2 += line_size;
9c76bd48
BF
299 }
300 return s;
301}
302
26efc54e 303
871371a7 304#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/**
 * Wavelet-domain comparison of two pixel blocks.
 *
 * The per-pixel difference pix1 - pix2, scaled by 16, is written into a
 * scratch buffer with a row stride of 32 ints, transformed in place by
 * ff_spatial_dwt(), and then the absolute values of the transformed
 * samples are summed, each weighted by a per-level / per-orientation factor
 * from the scale table.
 *
 * @param v         not used by this function
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size row stride in bytes, shared by both blocks
 * @param w         block width; 8 selects 3 decomposition levels, any other
 *                  value selects 4.  The scratch buffer limits w and h to 32.
 * @param h         block height; asserted to equal w
 * @param type      first index into the scale table (0: the set labelled
 *                  9/7, 1: the set labelled 5/3); also forwarded to
 *                  ff_spatial_dwt()
 * @return the weighted sum shifted right by 9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            /* Multiply instead of "<<4": the difference can be negative and
             * left-shifting a negative int is undefined behaviour in C. */
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0]) * 16;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1]) * 16;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2]) * 16;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3]) * 16;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        /* orientation 0 is only visited at level 0 */
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;
            const int weight= scale[type][dec_count-3][level][ori];

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    /* named "coef" so it no longer shadows the parameter v */
                    int coef= tmp[sx + sy + i*stride + j] * weight;
                    s += FFABS(coef);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}
373
/**
 * w_c() for 8-pixel-wide blocks with type 1 (the scale set labelled 5/3
 * in w_c).
 */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int block_width = 8;
    const int dwt_type    = 1;

    return w_c(v, pix1, pix2, line_size, block_width, h, dwt_type);
}
377
/**
 * w_c() for 8-pixel-wide blocks with type 0 (the scale set labelled 9/7
 * in w_c).
 */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int block_width = 8;
    const int dwt_type    = 0;

    return w_c(v, pix1, pix2, line_size, block_width, h, dwt_type);
}
381
/**
 * w_c() for 16-pixel-wide blocks with type 1 (the scale set labelled 5/3
 * in w_c).
 */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int block_width = 16;
    const int dwt_type    = 1;

    return w_c(v, pix1, pix2, line_size, block_width, h, dwt_type);
}
385
/**
 * w_c() for 16-pixel-wide blocks with type 0 (the scale set labelled 9/7
 * in w_c).
 */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int block_width = 16;
    const int dwt_type    = 0;

    return w_c(v, pix1, pix2, line_size, block_width, h, dwt_type);
}
389
/**
 * w_c() for 32-pixel-wide blocks with type 1 (the scale set labelled 5/3
 * in w_c).  Has external linkage, unlike the 8- and 16-wide variants.
 */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int block_width = 32;
    const int dwt_type    = 1;

    return w_c(v, pix1, pix2, line_size, block_width, h, dwt_type);
}
393
/**
 * w_c() for 32-pixel-wide blocks with type 0 (the scale set labelled 9/7
 * in w_c).  Has external linkage, unlike the 8- and 16-wide variants.
 */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int block_width = 32;
    const int dwt_type    = 0;

    return w_c(v, pix1, pix2, line_size, block_width, h, dwt_type);
}
3a6fc8fa 397#endif
871371a7 398
0c1a9eda 399static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
de6d9b64 400{
de6d9b64
FB
401 int i;
402
403 /* read the pixels */
de6d9b64 404 for(i=0;i<8;i++) {
c13e1abd
FH
405 block[0] = pixels[0];
406 block[1] = pixels[1];
407 block[2] = pixels[2];
408 block[3] = pixels[3];
409 block[4] = pixels[4];
410 block[5] = pixels[5];
411 block[6] = pixels[6];
412 block[7] = pixels[7];
413 pixels += line_size;
414 block += 8;
de6d9b64
FB
415 }
416}
417
0c1a9eda 418static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
bb270c08 419 const uint8_t *s2, int stride){
9dbcbd92
MN
420 int i;
421
422 /* read the pixels */
9dbcbd92 423 for(i=0;i<8;i++) {
c13e1abd
FH
424 block[0] = s1[0] - s2[0];
425 block[1] = s1[1] - s2[1];
426 block[2] = s1[2] - s2[2];
427 block[3] = s1[3] - s2[3];
428 block[4] = s1[4] - s2[4];
429 block[5] = s1[5] - s2[5];
430 block[6] = s1[6] - s2[6];
431 block[7] = s1[7] - s2[7];
9dbcbd92
MN
432 s1 += stride;
433 s2 += stride;
c13e1abd 434 block += 8;
9dbcbd92
MN
435 }
436}
437
438
0c1a9eda 439static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 440 int line_size)
de6d9b64 441{
de6d9b64 442 int i;
55fde95e 443 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 444
de6d9b64 445 /* read the pixels */
de6d9b64 446 for(i=0;i<8;i++) {
c13e1abd
FH
447 pixels[0] = cm[block[0]];
448 pixels[1] = cm[block[1]];
449 pixels[2] = cm[block[2]];
450 pixels[3] = cm[block[3]];
451 pixels[4] = cm[block[4]];
452 pixels[5] = cm[block[5]];
453 pixels[6] = cm[block[6]];
454 pixels[7] = cm[block[7]];
455
456 pixels += line_size;
457 block += 8;
de6d9b64
FB
458 }
459}
460
178fcca8 461static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 462 int line_size)
178fcca8
MN
463{
464 int i;
55fde95e 465 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 466
178fcca8
MN
467 /* read the pixels */
468 for(i=0;i<4;i++) {
469 pixels[0] = cm[block[0]];
470 pixels[1] = cm[block[1]];
471 pixels[2] = cm[block[2]];
472 pixels[3] = cm[block[3]];
473
474 pixels += line_size;
475 block += 8;
476 }
477}
478
9ca358b9 479static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 480 int line_size)
9ca358b9
MN
481{
482 int i;
55fde95e 483 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 484
9ca358b9
MN
485 /* read the pixels */
486 for(i=0;i<2;i++) {
487 pixels[0] = cm[block[0]];
488 pixels[1] = cm[block[1]];
489
490 pixels += line_size;
491 block += 8;
492 }
493}
494
115329f1 495static void put_signed_pixels_clamped_c(const DCTELEM *block,
f9ed9d85
MM
496 uint8_t *restrict pixels,
497 int line_size)
498{
499 int i, j;
500
501 for (i = 0; i < 8; i++) {
502 for (j = 0; j < 8; j++) {
503 if (*block < -128)
504 *pixels = 0;
505 else if (*block > 127)
506 *pixels = 255;
507 else
508 *pixels = (uint8_t)(*block + 128);
509 block++;
510 pixels++;
511 }
512 pixels += (line_size - 8);
513 }
514}
515
0c1a9eda 516static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
c13e1abd 517 int line_size)
de6d9b64 518{
de6d9b64 519 int i;
55fde95e 520 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 521
de6d9b64 522 /* read the pixels */
de6d9b64 523 for(i=0;i<8;i++) {
c13e1abd
FH
524 pixels[0] = cm[pixels[0] + block[0]];
525 pixels[1] = cm[pixels[1] + block[1]];
526 pixels[2] = cm[pixels[2] + block[2]];
527 pixels[3] = cm[pixels[3] + block[3]];
528 pixels[4] = cm[pixels[4] + block[4]];
529 pixels[5] = cm[pixels[5] + block[5]];
530 pixels[6] = cm[pixels[6] + block[6]];
531 pixels[7] = cm[pixels[7] + block[7]];
532 pixels += line_size;
533 block += 8;
de6d9b64
FB
534 }
535}
178fcca8
MN
536
537static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
538 int line_size)
539{
540 int i;
55fde95e 541 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 542
178fcca8
MN
543 /* read the pixels */
544 for(i=0;i<4;i++) {
545 pixels[0] = cm[pixels[0] + block[0]];
546 pixels[1] = cm[pixels[1] + block[1]];
547 pixels[2] = cm[pixels[2] + block[2]];
548 pixels[3] = cm[pixels[3] + block[3]];
549 pixels += line_size;
550 block += 8;
551 }
552}
9ca358b9
MN
553
554static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
555 int line_size)
556{
557 int i;
55fde95e 558 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 559
9ca358b9
MN
560 /* read the pixels */
561 for(i=0;i<2;i++) {
562 pixels[0] = cm[pixels[0] + block[0]];
563 pixels[1] = cm[pixels[1] + block[1]];
564 pixels += line_size;
565 block += 8;
566 }
567}
36940eca
LM
568
569static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
570{
571 int i;
572 for(i=0;i<8;i++) {
573 pixels[0] += block[0];
574 pixels[1] += block[1];
575 pixels[2] += block[2];
576 pixels[3] += block[3];
577 pixels[4] += block[4];
578 pixels[5] += block[5];
579 pixels[6] += block[6];
580 pixels[7] += block[7];
581 pixels += line_size;
582 block += 8;
583 }
584}
585
586static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
587{
588 int i;
589 for(i=0;i<4;i++) {
590 pixels[0] += block[0];
591 pixels[1] += block[1];
592 pixels[2] += block[2];
593 pixels[3] += block[3];
594 pixels += line_size;
595 block += 4;
596 }
597}
598
1edbfe19
LM
599static int sum_abs_dctelem_c(DCTELEM *block)
600{
601 int sum=0, i;
602 for(i=0; i<64; i++)
603 sum+= FFABS(block[i]);
604 return sum;
605}
606
59fe111e
MN
607#if 0
608
609#define PIXOP2(OPNAME, OP) \
b3184779 610static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
611{\
612 int i;\
613 for(i=0; i<h; i++){\
905694d9 614 OP(*((uint64_t*)block), AV_RN64(pixels));\
59fe111e
MN
615 pixels+=line_size;\
616 block +=line_size;\
617 }\
618}\
619\
45553457 620static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
621{\
622 int i;\
623 for(i=0; i<h; i++){\
905694d9
RS
624 const uint64_t a= AV_RN64(pixels );\
625 const uint64_t b= AV_RN64(pixels+1);\
59fe111e
MN
626 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
627 pixels+=line_size;\
628 block +=line_size;\
629 }\
630}\
631\
45553457 632static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
633{\
634 int i;\
635 for(i=0; i<h; i++){\
905694d9
RS
636 const uint64_t a= AV_RN64(pixels );\
637 const uint64_t b= AV_RN64(pixels+1);\
59fe111e
MN
638 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
639 pixels+=line_size;\
640 block +=line_size;\
641 }\
642}\
643\
45553457 644static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
645{\
646 int i;\
647 for(i=0; i<h; i++){\
905694d9
RS
648 const uint64_t a= AV_RN64(pixels );\
649 const uint64_t b= AV_RN64(pixels+line_size);\
59fe111e
MN
650 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
651 pixels+=line_size;\
652 block +=line_size;\
653 }\
654}\
655\
45553457 656static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
657{\
658 int i;\
659 for(i=0; i<h; i++){\
905694d9
RS
660 const uint64_t a= AV_RN64(pixels );\
661 const uint64_t b= AV_RN64(pixels+line_size);\
59fe111e
MN
662 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
663 pixels+=line_size;\
664 block +=line_size;\
665 }\
666}\
667\
45553457 668static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
669{\
670 int i;\
905694d9
RS
671 const uint64_t a= AV_RN64(pixels );\
672 const uint64_t b= AV_RN64(pixels+1);\
59fe111e
MN
673 uint64_t l0= (a&0x0303030303030303ULL)\
674 + (b&0x0303030303030303ULL)\
675 + 0x0202020202020202ULL;\
676 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
677 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
678 uint64_t l1,h1;\
679\
680 pixels+=line_size;\
681 for(i=0; i<h; i+=2){\
905694d9
RS
682 uint64_t a= AV_RN64(pixels );\
683 uint64_t b= AV_RN64(pixels+1);\
59fe111e
MN
684 l1= (a&0x0303030303030303ULL)\
685 + (b&0x0303030303030303ULL);\
686 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
687 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
688 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
689 pixels+=line_size;\
690 block +=line_size;\
905694d9
RS
691 a= AV_RN64(pixels );\
692 b= AV_RN64(pixels+1);\
59fe111e
MN
693 l0= (a&0x0303030303030303ULL)\
694 + (b&0x0303030303030303ULL)\
695 + 0x0202020202020202ULL;\
696 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
697 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
698 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
699 pixels+=line_size;\
700 block +=line_size;\
701 }\
702}\
703\
45553457 704static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
705{\
706 int i;\
905694d9
RS
707 const uint64_t a= AV_RN64(pixels );\
708 const uint64_t b= AV_RN64(pixels+1);\
59fe111e
MN
709 uint64_t l0= (a&0x0303030303030303ULL)\
710 + (b&0x0303030303030303ULL)\
711 + 0x0101010101010101ULL;\
712 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
713 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
714 uint64_t l1,h1;\
715\
716 pixels+=line_size;\
717 for(i=0; i<h; i+=2){\
905694d9
RS
718 uint64_t a= AV_RN64(pixels );\
719 uint64_t b= AV_RN64(pixels+1);\
59fe111e
MN
720 l1= (a&0x0303030303030303ULL)\
721 + (b&0x0303030303030303ULL);\
722 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
723 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
724 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
725 pixels+=line_size;\
726 block +=line_size;\
905694d9
RS
727 a= AV_RN64(pixels );\
728 b= AV_RN64(pixels+1);\
59fe111e
MN
729 l0= (a&0x0303030303030303ULL)\
730 + (b&0x0303030303030303ULL)\
731 + 0x0101010101010101ULL;\
732 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
733 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
734 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
735 pixels+=line_size;\
736 block +=line_size;\
737 }\
738}\
739\
45553457
ZK
740CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
741CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
742CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
743CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
744CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
745CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
746CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
59fe111e
MN
747
748#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
749#else // 64 bit variant
750
751#define PIXOP2(OPNAME, OP) \
669ac79c
MN
752static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
753 int i;\
754 for(i=0; i<h; i++){\
905694d9 755 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
669ac79c
MN
756 pixels+=line_size;\
757 block +=line_size;\
758 }\
759}\
0da71265
MN
760static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
761 int i;\
762 for(i=0; i<h; i++){\
905694d9 763 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
0da71265
MN
764 pixels+=line_size;\
765 block +=line_size;\
766 }\
767}\
45553457 768static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
59fe111e
MN
769 int i;\
770 for(i=0; i<h; i++){\
905694d9
RS
771 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
772 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
59fe111e
MN
773 pixels+=line_size;\
774 block +=line_size;\
775 }\
776}\
45553457
ZK
777static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
778 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
b3184779 779}\
59fe111e 780\
b3184779
MN
781static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
782 int src_stride1, int src_stride2, int h){\
59fe111e
MN
783 int i;\
784 for(i=0; i<h; i++){\
b3184779 785 uint32_t a,b;\
905694d9
RS
786 a= AV_RN32(&src1[i*src_stride1 ]);\
787 b= AV_RN32(&src2[i*src_stride2 ]);\
d8085ea7 788 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
905694d9
RS
789 a= AV_RN32(&src1[i*src_stride1+4]);\
790 b= AV_RN32(&src2[i*src_stride2+4]);\
d8085ea7 791 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
59fe111e
MN
792 }\
793}\
794\
b3184779
MN
795static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
796 int src_stride1, int src_stride2, int h){\
59fe111e
MN
797 int i;\
798 for(i=0; i<h; i++){\
b3184779 799 uint32_t a,b;\
905694d9
RS
800 a= AV_RN32(&src1[i*src_stride1 ]);\
801 b= AV_RN32(&src2[i*src_stride2 ]);\
d8085ea7 802 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
905694d9
RS
803 a= AV_RN32(&src1[i*src_stride1+4]);\
804 b= AV_RN32(&src2[i*src_stride2+4]);\
d8085ea7 805 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
59fe111e
MN
806 }\
807}\
808\
0da71265
MN
809static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
810 int src_stride1, int src_stride2, int h){\
811 int i;\
812 for(i=0; i<h; i++){\
813 uint32_t a,b;\
905694d9
RS
814 a= AV_RN32(&src1[i*src_stride1 ]);\
815 b= AV_RN32(&src2[i*src_stride2 ]);\
d8085ea7 816 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
0da71265
MN
817 }\
818}\
819\
669ac79c
MN
820static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
821 int src_stride1, int src_stride2, int h){\
822 int i;\
823 for(i=0; i<h; i++){\
824 uint32_t a,b;\
905694d9
RS
825 a= AV_RN16(&src1[i*src_stride1 ]);\
826 b= AV_RN16(&src2[i*src_stride2 ]);\
669ac79c
MN
827 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
828 }\
829}\
830\
b3184779
MN
831static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
832 int src_stride1, int src_stride2, int h){\
833 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
834 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
835}\
836\
837static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
838 int src_stride1, int src_stride2, int h){\
839 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
840 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
841}\
842\
45553457 843static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
844 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
845}\
846\
45553457 847static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
848 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
849}\
850\
45553457 851static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
852 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
853}\
854\
45553457 855static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
856 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
857}\
858\
859static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
860 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
861 int i;\
862 for(i=0; i<h; i++){\
b3184779 863 uint32_t a, b, c, d, l0, l1, h0, h1;\
905694d9
RS
864 a= AV_RN32(&src1[i*src_stride1]);\
865 b= AV_RN32(&src2[i*src_stride2]);\
866 c= AV_RN32(&src3[i*src_stride3]);\
867 d= AV_RN32(&src4[i*src_stride4]);\
b3184779
MN
868 l0= (a&0x03030303UL)\
869 + (b&0x03030303UL)\
870 + 0x02020202UL;\
871 h0= ((a&0xFCFCFCFCUL)>>2)\
872 + ((b&0xFCFCFCFCUL)>>2);\
873 l1= (c&0x03030303UL)\
874 + (d&0x03030303UL);\
875 h1= ((c&0xFCFCFCFCUL)>>2)\
876 + ((d&0xFCFCFCFCUL)>>2);\
877 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
905694d9
RS
878 a= AV_RN32(&src1[i*src_stride1+4]);\
879 b= AV_RN32(&src2[i*src_stride2+4]);\
880 c= AV_RN32(&src3[i*src_stride3+4]);\
881 d= AV_RN32(&src4[i*src_stride4+4]);\
b3184779
MN
882 l0= (a&0x03030303UL)\
883 + (b&0x03030303UL)\
884 + 0x02020202UL;\
885 h0= ((a&0xFCFCFCFCUL)>>2)\
886 + ((b&0xFCFCFCFCUL)>>2);\
887 l1= (c&0x03030303UL)\
888 + (d&0x03030303UL);\
889 h1= ((c&0xFCFCFCFCUL)>>2)\
890 + ((d&0xFCFCFCFCUL)>>2);\
891 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
892 }\
893}\
669ac79c
MN
894\
895static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
896 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
897}\
898\
899static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
900 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
901}\
902\
903static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
904 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
905}\
906\
907static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
908 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
909}\
910\
b3184779
MN
911static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
912 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
913 int i;\
914 for(i=0; i<h; i++){\
b3184779 915 uint32_t a, b, c, d, l0, l1, h0, h1;\
905694d9
RS
916 a= AV_RN32(&src1[i*src_stride1]);\
917 b= AV_RN32(&src2[i*src_stride2]);\
918 c= AV_RN32(&src3[i*src_stride3]);\
919 d= AV_RN32(&src4[i*src_stride4]);\
b3184779
MN
920 l0= (a&0x03030303UL)\
921 + (b&0x03030303UL)\
922 + 0x01010101UL;\
923 h0= ((a&0xFCFCFCFCUL)>>2)\
924 + ((b&0xFCFCFCFCUL)>>2);\
925 l1= (c&0x03030303UL)\
926 + (d&0x03030303UL);\
927 h1= ((c&0xFCFCFCFCUL)>>2)\
928 + ((d&0xFCFCFCFCUL)>>2);\
929 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
905694d9
RS
930 a= AV_RN32(&src1[i*src_stride1+4]);\
931 b= AV_RN32(&src2[i*src_stride2+4]);\
932 c= AV_RN32(&src3[i*src_stride3+4]);\
933 d= AV_RN32(&src4[i*src_stride4+4]);\
b3184779
MN
934 l0= (a&0x03030303UL)\
935 + (b&0x03030303UL)\
936 + 0x01010101UL;\
937 h0= ((a&0xFCFCFCFCUL)>>2)\
938 + ((b&0xFCFCFCFCUL)>>2);\
939 l1= (c&0x03030303UL)\
940 + (d&0x03030303UL);\
941 h1= ((c&0xFCFCFCFCUL)>>2)\
942 + ((d&0xFCFCFCFCUL)>>2);\
943 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
944 }\
945}\
b3184779
MN
946static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
947 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
948 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
949 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
950}\
951static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
952 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
953 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
954 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
955}\
59fe111e 956\
669ac79c
MN
957static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
958{\
959 int i, a0, b0, a1, b1;\
960 a0= pixels[0];\
961 b0= pixels[1] + 2;\
962 a0 += b0;\
963 b0 += pixels[2];\
964\
965 pixels+=line_size;\
966 for(i=0; i<h; i+=2){\
967 a1= pixels[0];\
968 b1= pixels[1];\
969 a1 += b1;\
970 b1 += pixels[2];\
971\
972 block[0]= (a1+a0)>>2; /* FIXME non put */\
973 block[1]= (b1+b0)>>2;\
974\
975 pixels+=line_size;\
976 block +=line_size;\
977\
978 a0= pixels[0];\
979 b0= pixels[1] + 2;\
980 a0 += b0;\
981 b0 += pixels[2];\
982\
983 block[0]= (a1+a0)>>2;\
984 block[1]= (b1+b0)>>2;\
985 pixels+=line_size;\
986 block +=line_size;\
987 }\
988}\
989\
990static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
991{\
992 int i;\
905694d9
RS
993 const uint32_t a= AV_RN32(pixels );\
994 const uint32_t b= AV_RN32(pixels+1);\
669ac79c
MN
995 uint32_t l0= (a&0x03030303UL)\
996 + (b&0x03030303UL)\
997 + 0x02020202UL;\
998 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
999 + ((b&0xFCFCFCFCUL)>>2);\
1000 uint32_t l1,h1;\
1001\
1002 pixels+=line_size;\
1003 for(i=0; i<h; i+=2){\
905694d9
RS
1004 uint32_t a= AV_RN32(pixels );\
1005 uint32_t b= AV_RN32(pixels+1);\
669ac79c
MN
1006 l1= (a&0x03030303UL)\
1007 + (b&0x03030303UL);\
1008 h1= ((a&0xFCFCFCFCUL)>>2)\
1009 + ((b&0xFCFCFCFCUL)>>2);\
1010 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1011 pixels+=line_size;\
1012 block +=line_size;\
905694d9
RS
1013 a= AV_RN32(pixels );\
1014 b= AV_RN32(pixels+1);\
669ac79c
MN
1015 l0= (a&0x03030303UL)\
1016 + (b&0x03030303UL)\
1017 + 0x02020202UL;\
1018 h0= ((a&0xFCFCFCFCUL)>>2)\
1019 + ((b&0xFCFCFCFCUL)>>2);\
1020 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1021 pixels+=line_size;\
1022 block +=line_size;\
1023 }\
1024}\
1025\
45553457 1026static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
1027{\
1028 int j;\
1029 for(j=0; j<2; j++){\
1030 int i;\
905694d9
RS
1031 const uint32_t a= AV_RN32(pixels );\
1032 const uint32_t b= AV_RN32(pixels+1);\
59fe111e
MN
1033 uint32_t l0= (a&0x03030303UL)\
1034 + (b&0x03030303UL)\
1035 + 0x02020202UL;\
1036 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1037 + ((b&0xFCFCFCFCUL)>>2);\
1038 uint32_t l1,h1;\
1039\
1040 pixels+=line_size;\
1041 for(i=0; i<h; i+=2){\
905694d9
RS
1042 uint32_t a= AV_RN32(pixels );\
1043 uint32_t b= AV_RN32(pixels+1);\
59fe111e
MN
1044 l1= (a&0x03030303UL)\
1045 + (b&0x03030303UL);\
1046 h1= ((a&0xFCFCFCFCUL)>>2)\
1047 + ((b&0xFCFCFCFCUL)>>2);\
1048 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1049 pixels+=line_size;\
1050 block +=line_size;\
905694d9
RS
1051 a= AV_RN32(pixels );\
1052 b= AV_RN32(pixels+1);\
59fe111e
MN
1053 l0= (a&0x03030303UL)\
1054 + (b&0x03030303UL)\
1055 + 0x02020202UL;\
1056 h0= ((a&0xFCFCFCFCUL)>>2)\
1057 + ((b&0xFCFCFCFCUL)>>2);\
1058 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1059 pixels+=line_size;\
1060 block +=line_size;\
1061 }\
1062 pixels+=4-line_size*(h+1);\
1063 block +=4-line_size*h;\
1064 }\
1065}\
1066\
45553457 1067static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
1068{\
1069 int j;\
1070 for(j=0; j<2; j++){\
1071 int i;\
905694d9
RS
1072 const uint32_t a= AV_RN32(pixels );\
1073 const uint32_t b= AV_RN32(pixels+1);\
59fe111e
MN
1074 uint32_t l0= (a&0x03030303UL)\
1075 + (b&0x03030303UL)\
1076 + 0x01010101UL;\
1077 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1078 + ((b&0xFCFCFCFCUL)>>2);\
1079 uint32_t l1,h1;\
1080\
1081 pixels+=line_size;\
1082 for(i=0; i<h; i+=2){\
905694d9
RS
1083 uint32_t a= AV_RN32(pixels );\
1084 uint32_t b= AV_RN32(pixels+1);\
59fe111e
MN
1085 l1= (a&0x03030303UL)\
1086 + (b&0x03030303UL);\
1087 h1= ((a&0xFCFCFCFCUL)>>2)\
1088 + ((b&0xFCFCFCFCUL)>>2);\
1089 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1090 pixels+=line_size;\
1091 block +=line_size;\
905694d9
RS
1092 a= AV_RN32(pixels );\
1093 b= AV_RN32(pixels+1);\
59fe111e
MN
1094 l0= (a&0x03030303UL)\
1095 + (b&0x03030303UL)\
1096 + 0x01010101UL;\
1097 h0= ((a&0xFCFCFCFCUL)>>2)\
1098 + ((b&0xFCFCFCFCUL)>>2);\
1099 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1100 pixels+=line_size;\
1101 block +=line_size;\
1102 }\
1103 pixels+=4-line_size*(h+1);\
1104 block +=4-line_size*h;\
1105 }\
1106}\
1107\
45553457
ZK
1108CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1109CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1110CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1111CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1112CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1113CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1114CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1115CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
b3184779 1116
d8085ea7 1117#define op_avg(a, b) a = rnd_avg32(a, b)
59fe111e 1118#endif
59fe111e
MN
1119#define op_put(a, b) a = b
1120
1121PIXOP2(avg, op_avg)
1122PIXOP2(put, op_put)
1123#undef op_avg
1124#undef op_put
1125
de6d9b64
FB
/*
 * Rounded averages of two and of four values.
 * Every argument is parenthesized so that an argument containing an operator
 * of lower precedence than '+' (shift, bitwise, conditional, ...) is averaged
 * as a whole instead of being torn apart by the expansion.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1128
c0a0170c
MN
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the one stride given
 * here is forwarded as each of the callee's three stride arguments.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b,
                                     int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride, stride, stride,
                           h);
}
1132
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the one stride given
 * here is forwarded as each of the callee's three stride arguments.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b,
                                    int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride, stride, stride,
                          h);
}
073b013d 1136
/**
 * Bilinear interpolation of an 8-pixel-wide, h-row block.
 *
 * The four tap weights are built from x16 and y16 and always sum to 256
 * ((16-x16 + x16) * (16-y16 + y16)), which is why the result is scaled back
 * with >>8 after adding the caller-supplied rounder.
 *
 * Each output row reads 9 pixels from the current source row and 9 from the
 * row below it, so the source must provide h+1 rows.
 *
 * @param dst     destination, advanced by stride per row
 * @param src     top-left source pixel, advanced by stride per row
 * @param stride  line size shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand taps
 * @param y16     vertical weight of the lower taps
 * @param rounder value added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_top_left  = (16 - x16) * (16 - y16);
    const int w_top_right =       x16  * (16 - y16);
    const int w_bot_left  = (16 - x16) *       y16;
    const int w_bot_right =       x16  *       y16;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_top_left * src[col]   + w_top_right * src[col + 1] +
                        w_bot_left * below[col] + w_bot_right * below[col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1159
/**
 * Warp an 8-pixel-wide, h-row block out of src using an affine mapping.
 *
 * The source position of the first pixel is (ox, oy). It advances by
 * (dxx, dyx) for every output column and by (dxy, dyy) for every output row.
 * Positions are shifted right by 16; the low `shift` bits of the result are
 * the fractional part and the remaining bits the integer pixel position.
 *
 * For each output pixel:
 *  - if both integer coordinates are inside [0, width-1) x [0, height-1),
 *    the four neighbours are blended bilinearly;
 *  - if only one coordinate is inside, the other is clamped with av_clip()
 *    and the blend is done along the inside axis only;
 *  - if neither is inside, the clamped source pixel is copied.
 *
 * @param r              value added before the final >> (2*shift)
 * @param width, height  source dimensions; the comparisons use width-1 and
 *                       height-1, presumably so the +1 neighbour of an
 *                       "inside" position is still valid (confirm with caller)
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1 << shift;
    const int frac_mask = s - 1;
    const int max_x     = width  - 1;
    const int max_y     = height - 1;
    int row;

    for (row = 0; row < h; row++) {
        uint8_t *out = dst + row * stride;
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++) {
            int src_x        = vx >> 16;
            int src_y        = vy >> 16;
            const int frac_x = src_x & frac_mask;
            const int frac_y = src_y & frac_mask;
            int x_inside, y_inside, index;

            src_x >>= shift;
            src_y >>= shift;

            /* the unsigned compare rejects negative coordinates as well */
            x_inside = (unsigned)src_x < max_x;
            y_inside = (unsigned)src_y < max_y;

            if (x_inside && y_inside) {
                const int top    = src[index = src_x + src_y * stride] * (s - frac_x)
                                 + src[index + 1]                      *      frac_x;
                const int bottom = src[index + stride]                 * (s - frac_x)
                                 + src[index + stride + 1]             *      frac_x;

                out[col] = (top * (s - frac_y) + bottom * frac_y + r) >> (shift * 2);
            } else if (x_inside) {
                /* vertically outside: clamp the row, blend horizontally only */
                index    = src_x + av_clip(src_y, 0, max_y) * stride;
                out[col] = ((src[index]     * (s - frac_x)
                           + src[index + 1] *      frac_x) * s
                           + r) >> (shift * 2);
            } else if (y_inside) {
                /* horizontally outside: clamp the column, blend vertically only */
                index    = av_clip(src_x, 0, max_x) + src_y * stride;
                out[col] = ((src[index]          * (s - frac_y)
                           + src[index + stride] *      frac_y) * s
                           + r) >> (shift * 2);
            } else {
                /* outside on both axes: nearest clamped pixel */
                index    = av_clip(src_x, 0, max_x) + av_clip(src_y, 0, max_y) * stride;
                out[col] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
669ac79c
MN
1217
/**
 * Full-pel copy: dispatch on block width to the matching put_pixelsN_c().
 * Widths other than 2, 4, 8 and 16 are ignored and nothing is written.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1226
/**
 * Third-pel interpolation, weights 2:1 on (current, right neighbour).
 * The weighted sum (+1) is divided by 3 via 683/2048.
 * Reads width+1 pixels per source row.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 2 * in[col] + in[col + 1] + 1;
            out[col] = (683 * sum) >> 11;
        }
    }
}
1237
/**
 * Third-pel interpolation, weights 1:2 on (current, right neighbour).
 * The weighted sum (+1) is divided by 3 via 683/2048.
 * Reads width+1 pixels per source row.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = in[col] + 2 * in[col + 1] + 1;
            out[col] = (683 * sum) >> 11;
        }
    }
}
115329f1 1248
669ac79c
MN
/**
 * Third-pel interpolation, weights 2:1 on (current, pixel one row below).
 * The weighted sum (+1) is divided by 3 via 683/2048.
 * Reads height+1 source rows.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 2 * in[col] + in[col + stride] + 1;
            out[col] = (683 * sum) >> 11;
        }
    }
}
115329f1 1259
669ac79c
MN
/**
 * Third-pel interpolation over a 2x2 neighbourhood with weights
 *   4 3
 *   3 2
 * (sum 12). The weighted sum (+6) is divided by 12 via 2731/32768.
 * Reads width+1 pixels from each of height+1 source rows.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 4 * in[col]          + 3 * in[col + 1]
                          + 3 * in[col + stride] + 2 * in[col + stride + 1]
                          + 6;
            out[col] = (2731 * sum) >> 15;
        }
    }
}
1270
/**
 * Third-pel interpolation over a 2x2 neighbourhood with weights
 *   3 2
 *   4 3
 * (sum 12). The weighted sum (+6) is divided by 12 via 2731/32768.
 * Reads width+1 pixels from each of height+1 source rows.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 3 * in[col]          + 2 * in[col + 1]
                          + 4 * in[col + stride] + 3 * in[col + stride + 1]
                          + 6;
            out[col] = (2731 * sum) >> 15;
        }
    }
}
1281
/**
 * Third-pel interpolation, weights 1:2 on (current, pixel one row below).
 * The weighted sum (+1) is divided by 3 via 683/2048.
 * Reads height+1 source rows.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = in[col] + 2 * in[col + stride] + 1;
            out[col] = (683 * sum) >> 11;
        }
    }
}
1292
/**
 * Third-pel interpolation over a 2x2 neighbourhood with weights
 *   3 4
 *   2 3
 * (sum 12). The weighted sum (+6) is divided by 12 via 2731/32768.
 * Reads width+1 pixels from each of height+1 source rows.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 3 * in[col]          + 4 * in[col + 1]
                          + 2 * in[col + stride] + 3 * in[col + stride + 1]
                          + 6;
            out[col] = (2731 * sum) >> 15;
        }
    }
}
1303
/**
 * Third-pel interpolation over a 2x2 neighbourhood with weights
 *   2 3
 *   3 4
 * (sum 12). The weighted sum (+6) is divided by 12 via 2731/32768.
 * Reads width+1 pixels from each of height+1 source rows.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++) {
        const uint8_t *in  = src + row * stride;
        uint8_t       *out = dst + row * stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 2 * in[col]          + 3 * in[col + 1]
                          + 3 * in[col + stride] + 4 * in[col + stride + 1]
                          + 6;
            out[col] = (2731 * sum) >> 15;
        }
    }
}
da3b9756
MM
1314
/**
 * Full-pel averaging: dispatch on block width to the matching avg_pixelsN_c().
 * Widths other than 2, 4, 8 and 16 are ignored and dst is left untouched.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1323
/**
 * Third-pel interpolation (weights 2:1 on current / right neighbour, divided
 * by 3 via 683/2048), then rounded average with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (2 * src[col] + src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1334
/**
 * Third-pel interpolation (weights 1:2 on current / right neighbour, divided
 * by 3 via 683/2048), then rounded average with the pixel already in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (src[col] + 2 * src[col + 1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
115329f1 1345
da3b9756
MM
/**
 * Third-pel interpolation (weights 2:1 on current / pixel one row below,
 * divided by 3 via 683/2048), then rounded average with the pixel already
 * in dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (2 * src[col] + src[col + stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
115329f1 1356
da3b9756
MM
/**
 * Third-pel interpolation over a 2x2 neighbourhood (weights 4 3 / 3 2,
 * divided by 12 via 2731/32768), then rounded average with the pixel already
 * in dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 4 * src[col]          + 3 * src[col + 1]
                          + 3 * src[col + stride] + 2 * src[col + stride + 1]
                          + 6;
            const int interp = (2731 * sum) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1367
/**
 * Third-pel interpolation over a 2x2 neighbourhood (weights 3 2 / 4 3,
 * divided by 12 via 2731/32768), then rounded average with the pixel already
 * in dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3 * src[col]          + 2 * src[col + 1]
                          + 4 * src[col + stride] + 3 * src[col + stride + 1]
                          + 6;
            const int interp = (2731 * sum) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1378
/**
 * Third-pel interpolation (weights 1:2 on current / pixel one row below,
 * divided by 3 via 683/2048), then rounded average with the pixel already
 * in dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (src[col] + 2 * src[col + stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1389
/**
 * Third-pel interpolation over a 2x2 neighbourhood (weights 3 4 / 2 3,
 * divided by 12 via 2731/32768), then rounded average with the pixel already
 * in dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 3 * src[col]          + 4 * src[col + 1]
                          + 2 * src[col + stride] + 3 * src[col + stride + 1]
                          + 6;
            const int interp = (2731 * sum) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1400
/**
 * Third-pel interpolation over a 2x2 neighbourhood (weights 2 3 / 3 4,
 * divided by 12 via 2731/32768), then rounded average with the pixel already
 * in dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int sum = 2 * src[col]          + 3 * src[col + 1]
                          + 3 * src[col + stride] + 4 * src[col + stride + 1]
                          + 6;
            const int interp = (2731 * sum) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
669ac79c
MN
#if 0
/*
 * Disabled generator for fixed-width wrappers around the put_tpel_pixels_mcXY_c()
 * helpers: TPEL_WIDTH(8) would define put_tpel_pixels8_mcXY_c(dst, src, stride,
 * height), each forwarding to the generic helper with width fixed to 8.
 *
 * The forwarding statements used to start with a stray "void", which made them
 * malformed declarations rather than calls; they are plain calls now so the
 * macro compiles if this block is ever enabled.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1432
0da71265
MN
/*
 * Generates one bilinear chroma interpolation function,
 *   <OPNAME>h264_chroma_mc<W>_c(dst, src, stride, h, x, y),
 * producing a W-pixel-wide, h-row block.
 *
 * x and y must be in 0..7 (asserted). The four tap weights built from them
 * sum to 64; the unscaled weighted sum is handed to OP, which is responsible
 * for rounding, scaling back and storing into the destination pixel.
 * Each row reads W+1 pixels from the current source row and from the row below.
 */
#define H264_CHROMA_MC_W(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++){\
        for(col=0; col<W; col++){\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}

/* Instantiates the 2-, 4- and 8-pixel-wide variants for one store operation. */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_W(OPNAME, OP, 2)\
H264_CHROMA_MC_W(OPNAME, OP, 4)\
H264_CHROMA_MC_W(OPNAME, OP, 8)

/* op_put: round the weighted sum and scale it back by 64.
 * op_avg: same, followed by a rounded average with the pixel already stored. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1503
e34350a3
KS
/**
 * 8-pixel-wide bilinear chroma interpolation with a reduced rounding bias.
 *
 * Same tap weights as the H264_CHROMA_MC functions (built from x and y, sum
 * 64), but 28 (= 32 - 4) rather than 32 is added before the >>6, so exact
 * halves round down instead of up.
 *
 * x and y must be in 0..7 (asserted). Each row reads 9 pixels from the
 * current source row and 9 from the row below it.
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int w_top_left  = (8 - x) * (8 - y);
    const int w_top_right =      x  * (8 - y);
    const int w_bot_left  = (8 - x) *      y;
    const int w_bot_right =      x  *      y;
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_top_left * src[col]   + w_top_right * src[col + 1] +
                        w_bot_left * below[col] + w_bot_right * below[col + 1] +
                        32 - 4) >> 6;
        }
        dst += stride;
        src += stride;
    }
}
1527
b3184779 1528#define QPEL_MC(r, OPNAME, RND, OP) \
0c1a9eda 1529static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
55fde95e 1530 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
b3184779
MN
1531 int i;\
1532 for(i=0; i<h; i++)\
1533 {\
1534 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1535 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1536 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1537 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1538 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1539 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1540 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1541 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1542 dst+=dstStride;\
1543 src+=srcStride;\
1544 }\
44eb4951
MN
1545}\
1546\
0c1a9eda 1547static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
db794953 1548 const int w=8;\
55fde95e 1549 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
b3184779
MN
1550 int i;\
1551 for(i=0; i<w; i++)\
1552 {\
1553 const int src0= src[0*srcStride];\
1554 const int src1= src[1*srcStride];\
1555 const int src2= src[2*srcStride];\
1556 const int src3= src[3*srcStride];\
1557 const int src4= src[4*srcStride];\
1558 const int src5= src[5*srcStride];\
1559 const int src6= src[6*srcStride];\
1560 const int src7= src[7*srcStride];\
1561 const int src8= src[8*srcStride];\
1562 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1563 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1564 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1565 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1566 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1567 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1568 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1569 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1570 dst++;\
1571 src++;\
1572 }\
1573}\
1574\
0c1a9eda 1575static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
55fde95e 1576 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
b3184779 1577 int i;\
826f429a 1578 \
b3184779
MN
1579 for(i=0; i<h; i++)\
1580 {\
1581 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1582 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1583 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1584 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1585 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1586 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1587 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1588 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1589 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1590 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1591 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1592 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1593 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1594 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1595 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1596 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1597 dst+=dstStride;\
1598 src+=srcStride;\
1599 }\
1600}\
1601\
0c1a9eda 1602static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
55fde95e 1603 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
b3184779 1604 int i;\
826f429a 1605 const int w=16;\
b3184779
MN
1606 for(i=0; i<w; i++)\
1607 {\
1608 const int src0= src[0*srcStride];\
1609 const int src1= src[1*srcStride];\
1610 const int src2= src[2*srcStride];\
1611 const int src3= src[3*srcStride];\
1612 const int src4= src[4*srcStride];\
1613 const int src5= src[5*srcStride];\
1614 const int src6= src[6*srcStride];\
1615 const int src7= src[7*srcStride];\
1616 const int src8= src[8*srcStride];\
1617 const int src9= src[9*srcStride];\
1618 const int src10= src[10*srcStride];\
1619 const int src11= src[11*srcStride];\
1620 const int src12= src[12*srcStride];\
1621 const int src13= src[13*srcStride];\
1622 const int src14= src[14*srcStride];\
1623 const int src15= src[15*srcStride];\
1624 const int src16= src[16*srcStride];\
1625 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1626 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1627 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1628 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1629 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1630 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1631 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1632 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1633 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1634 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1635 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1636 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1637 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1638 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1639 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1640 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1641 dst++;\
1642 src++;\
1643 }\
1644}\
1645\
0c1a9eda 1646static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1647 OPNAME ## pixels8_c(dst, src, stride, 8);\
b3184779
MN
1648}\
1649\
0c1a9eda
ZK
1650static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1651 uint8_t half[64];\
b3184779
MN
1652 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1653 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
44eb4951
MN
1654}\
1655\
0c1a9eda 1656static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1657 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
44eb4951
MN
1658}\
1659\
0c1a9eda
ZK
1660static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1661 uint8_t half[64];\
b3184779
MN
1662 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1663 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
44eb4951
MN
1664}\
1665\
0c1a9eda
ZK
1666static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1667 uint8_t full[16*9];\
1668 uint8_t half[64];\
b3184779 1669 copy_block9(full, src, 16, stride, 9);\
db794953 1670 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1671 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
44eb4951
MN
1672}\
1673\
0c1a9eda
ZK
1674static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1675 uint8_t full[16*9];\
b3184779 1676 copy_block9(full, src, 16, stride, 9);\
db794953 1677 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
44eb4951
MN
1678}\
1679\
0c1a9eda
ZK
1680static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1681 uint8_t full[16*9];\
1682 uint8_t half[64];\
b3184779 1683 copy_block9(full, src, 16, stride, 9);\
db794953 1684 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1685 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
44eb4951 1686}\
0c1a9eda
ZK
1687void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1688 uint8_t full[16*9];\
1689 uint8_t halfH[72];\
1690 uint8_t halfV[64];\
1691 uint8_t halfHV[64];\
b3184779
MN
1692 copy_block9(full, src, 16, stride, 9);\
1693 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1694 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1695 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1696 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1697}\
0c1a9eda
ZK
1698static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1699 uint8_t full[16*9];\
1700 uint8_t halfH[72];\
1701 uint8_t halfHV[64];\
db794953
MN
1702 copy_block9(full, src, 16, stride, 9);\
1703 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1704 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1705 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1706 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1707}\
0c1a9eda
ZK
1708void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1709 uint8_t full[16*9];\
1710 uint8_t halfH[72];\
1711 uint8_t halfV[64];\
1712 uint8_t halfHV[64];\
b3184779
MN
1713 copy_block9(full, src, 16, stride, 9);\
1714 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1715 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1716 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1717 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1718}\
0c1a9eda
ZK
1719static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1720 uint8_t full[16*9];\
1721 uint8_t halfH[72];\
1722 uint8_t halfHV[64];\
db794953
MN
1723 copy_block9(full, src, 16, stride, 9);\
1724 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1725 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1726 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1727 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1728}\
0c1a9eda
ZK
1729void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1730 uint8_t full[16*9];\
1731 uint8_t halfH[72];\
1732 uint8_t halfV[64];\
1733 uint8_t halfHV[64];\
b3184779
MN
1734 copy_block9(full, src, 16, stride, 9);\
1735 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1736 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1737 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1738 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1739}\
0c1a9eda
ZK
1740static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1741 uint8_t full[16*9];\
1742 uint8_t halfH[72];\
1743 uint8_t halfHV[64];\
db794953
MN
1744 copy_block9(full, src, 16, stride, 9);\
1745 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1746 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1747 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1748 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1749}\
0c1a9eda
ZK
1750void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1751 uint8_t full[16*9];\
1752 uint8_t halfH[72];\
1753 uint8_t halfV[64];\
1754 uint8_t halfHV[64];\
b3184779
MN
1755 copy_block9(full, src, 16, stride, 9);\
1756 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
db794953
MN
1757 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1758 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1759 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1760}\
0c1a9eda
ZK
1761static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1762 uint8_t full[16*9];\
1763 uint8_t halfH[72];\
1764 uint8_t halfHV[64];\
db794953
MN
1765 copy_block9(full, src, 16, stride, 9);\
1766 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1767 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1768 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1769 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1770}\
0c1a9eda
ZK
1771static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1772 uint8_t halfH[72];\
1773 uint8_t halfHV[64];\
b3184779 1774 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1775 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1776 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
44eb4951 1777}\
0c1a9eda
ZK
1778static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1779 uint8_t halfH[72];\
1780 uint8_t halfHV[64];\
b3184779 1781 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1782 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1783 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
44eb4951 1784}\
0c1a9eda
ZK
1785void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1786 uint8_t full[16*9];\
1787 uint8_t halfH[72];\
1788 uint8_t halfV[64];\
1789 uint8_t halfHV[64];\
b3184779
MN
1790 copy_block9(full, src, 16, stride, 9);\
1791 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1792 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1793 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1794 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1795}\
0c1a9eda
ZK
1796static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1797 uint8_t full[16*9];\
1798 uint8_t halfH[72];\
db794953
MN
1799 copy_block9(full, src, 16, stride, 9);\
1800 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1801 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1802 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1803}\
0c1a9eda
ZK
1804void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1805 uint8_t full[16*9];\
1806 uint8_t halfH[72];\
1807 uint8_t halfV[64];\
1808 uint8_t halfHV[64];\
b3184779
MN
1809 copy_block9(full, src, 16, stride, 9);\
1810 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1811 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1812 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1813 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1814}\
0c1a9eda
ZK
1815static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1816 uint8_t full[16*9];\
1817 uint8_t halfH[72];\
db794953
MN
1818 copy_block9(full, src, 16, stride, 9);\
1819 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1820 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1821 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1822}\
0c1a9eda
ZK
1823static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1824 uint8_t halfH[72];\
b3184779 1825 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1826 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
b3184779 1827}\
0c1a9eda 1828static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1829 OPNAME ## pixels16_c(dst, src, stride, 16);\
b3184779
MN
1830}\
1831\
0c1a9eda
ZK
1832static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1833 uint8_t half[256];\
b3184779
MN
1834 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1835 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1836}\
1837\
0c1a9eda 1838static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1839 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
44eb4951 1840}\
b3184779 1841\
0c1a9eda
ZK
1842static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1843 uint8_t half[256];\
b3184779
MN
1844 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1845 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1846}\
1847\
0c1a9eda
ZK
1848static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1849 uint8_t full[24*17];\
1850 uint8_t half[256];\
b3184779 1851 copy_block17(full, src, 24, stride, 17);\
826f429a 1852 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1853 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1854}\
1855\
0c1a9eda
ZK
1856static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1857 uint8_t full[24*17];\
b3184779 1858 copy_block17(full, src, 24, stride, 17);\
826f429a 1859 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
b3184779
MN
1860}\
1861\
0c1a9eda
ZK
1862static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1863 uint8_t full[24*17];\
1864 uint8_t half[256];\
b3184779 1865 copy_block17(full, src, 24, stride, 17);\
826f429a 1866 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1867 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1868}\
0c1a9eda
ZK
1869void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1870 uint8_t full[24*17];\
1871 uint8_t halfH[272];\
1872 uint8_t halfV[256];\
1873 uint8_t halfHV[256];\
b3184779
MN
1874 copy_block17(full, src, 24, stride, 17);\
1875 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1876 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1877 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1878 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1879}\
0c1a9eda
ZK
1880static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1881 uint8_t full[24*17];\
1882 uint8_t halfH[272];\
1883 uint8_t halfHV[256];\
db794953
MN
1884 copy_block17(full, src, 24, stride, 17);\
1885 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1886 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1887 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1888 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1889}\
0c1a9eda
ZK
1890void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1891 uint8_t full[24*17];\
1892 uint8_t halfH[272];\
1893 uint8_t halfV[256];\
1894 uint8_t halfHV[256];\
b3184779
MN
1895 copy_block17(full, src, 24, stride, 17);\
1896 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1897 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1898 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1899 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1900}\
0c1a9eda
ZK
1901static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1902 uint8_t full[24*17];\
1903 uint8_t halfH[272];\
1904 uint8_t halfHV[256];\
db794953
MN
1905 copy_block17(full, src, 24, stride, 17);\
1906 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1907 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1908 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1909 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1910}\
0c1a9eda
ZK
1911void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1912 uint8_t full[24*17];\
1913 uint8_t halfH[272];\
1914 uint8_t halfV[256];\
1915 uint8_t halfHV[256];\
b3184779
MN
1916 copy_block17(full, src, 24, stride, 17);\
1917 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1918 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1919 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1920 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1921}\
0c1a9eda
ZK
1922static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1923 uint8_t full[24*17];\
1924 uint8_t halfH[272];\
1925 uint8_t halfHV[256];\
db794953
MN
1926 copy_block17(full, src, 24, stride, 17);\
1927 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1928 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1929 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1930 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1931}\
0c1a9eda
ZK
1932void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1933 uint8_t full[24*17];\
1934 uint8_t halfH[272];\
1935 uint8_t halfV[256];\
1936 uint8_t halfHV[256];\
b3184779
MN
1937 copy_block17(full, src, 24, stride, 17);\
1938 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
826f429a
MN
1939 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1940 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1941 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1942}\
0c1a9eda
ZK
1943static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1944 uint8_t full[24*17];\
1945 uint8_t halfH[272];\
1946 uint8_t halfHV[256];\
db794953
MN
1947 copy_block17(full, src, 24, stride, 17);\
1948 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1949 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1950 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1951 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1952}\
0c1a9eda
ZK
1953static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1954 uint8_t halfH[272];\
1955 uint8_t halfHV[256];\
b3184779 1956 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1957 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1958 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1959}\
0c1a9eda
ZK
1960static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1961 uint8_t halfH[272];\
1962 uint8_t halfHV[256];\
b3184779 1963 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1964 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1965 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1966}\
0c1a9eda
ZK
1967void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1968 uint8_t full[24*17];\
1969 uint8_t halfH[272];\
1970 uint8_t halfV[256];\
1971 uint8_t halfHV[256];\
b3184779
MN
1972 copy_block17(full, src, 24, stride, 17);\
1973 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1974 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1975 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1976 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1977}\
0c1a9eda
ZK
1978static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1979 uint8_t full[24*17];\
1980 uint8_t halfH[272];\
db794953
MN
1981 copy_block17(full, src, 24, stride, 17);\
1982 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1983 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1984 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1985}\
0c1a9eda
ZK
1986void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1987 uint8_t full[24*17];\
1988 uint8_t halfH[272];\
1989 uint8_t halfV[256];\
1990 uint8_t halfHV[256];\
b3184779
MN
1991 copy_block17(full, src, 24, stride, 17);\
1992 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1993 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1994 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1995 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1996}\
0c1a9eda
ZK
1997static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1998 uint8_t full[24*17];\
1999 uint8_t halfH[272];\
db794953
MN
2000 copy_block17(full, src, 24, stride, 17);\
2001 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2002 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2003 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2004}\
0c1a9eda
ZK
2005static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2006 uint8_t halfH[272];\
b3184779 2007 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 2008 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
45553457 2009}
44eb4951 2010
b3184779
MN
/* Output operators handed to QPEL_MC as its last argument (see the
 * instantiations below).  Each one expands to an assignment into a and looks
 * its value up through a pointer called cm, so a variable of that name has to
 * be in scope wherever the operator is expanded.
 *   - ((b) + 16) >> 5 adds 16 to the filter sum b and divides it by 32;
 *     the no_rnd forms add 15 instead of 16.
 *   - the avg forms combine the looked-up value with the current contents of
 *     a: (a + value + 1) >> 1 for op_avg, (a + value) >> 1 for op_avg_no_rnd.
 * NOTE(review): cm is presumably the ff_cropTbl-based table, as in the H.264
 * helpers later in this file -- confirm against the QPEL_MC lowpass bodies. */
2011#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2012#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2013#define op_put(a, b) a = cm[((b) + 16)>>5]
2014#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2015
/* Expand QPEL_MC three times.  Judging by how the macro body uses its
 * parameters, the second argument is the OPNAME prefix of the generated
 * functions and the third is the RND infix of the put helpers they call;
 * the meaning of the leading 0/1 is not visible here -- TODO confirm. */
2016QPEL_MC(0, put_ , _ , op_put)
2017QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2018QPEL_MC(0, avg_ , _ , op_avg)
/* The avg_no_rnd expansion is commented out, so op_avg_no_rnd is not used by
 * any active QPEL_MC line in this group. */
2019//QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* Remove the operator names again (presumably so later sections can define
 * their own versions). */
2020#undef op_avg
2021#undef op_avg_no_rnd
2022#undef op_put
2023#undef op_put_no_rnd
44eb4951 2024
0da71265
MN
2025#if 1
#define H264_LOWPASS(OPNAME, OP, OP2) \
/* Generates 6-tap lowpass helpers (weights 1, -5, 20, 20, -5, 1) for 2-, 4-, */\
/* 8- and 16-wide blocks.  OP is applied to single-pass sums and OP2 to the */\
/* sums of the two-pass (hv) helpers; both reference the local named cm. */\
/* All loop bounds are compile-time constants. */\
\
/* 2x2 horizontal pass: reads src[-2..4] on each of 2 rows. */\
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, y;\
    for(y=0; y<h; y++){\
        for(x=0; x<w; x++){\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
        }\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* 2x2 vertical pass: for each of 2 columns, loads rows -2..4 into tap[] */\
/* before any output of that column is written. */\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, k;\
    for(x=0; x<w; x++){\
        int tap[2+5];\
        for(k=0; k<h+5; k++){\
            tap[k]= src[(k-2)*srcStride];\
        }\
        for(k=0; k<h; k++){\
            OP(dst[k*dstStride], (tap[k+2]+tap[k+3])*20 - (tap[k+1]+tap[k+4])*5 + (tap[k]+tap[k+5]));\
        }\
        dst++;\
        src++;\
    }\
}\
\
/* 2x2 two-pass filter: horizontal sums for 2+5 rows (starting 2 rows above */\
/* src) are stored unscaled in tmp, then filtered vertically through OP2. */\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int w=2;\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, k;\
    src -= 2*srcStride;\
    for(k=0; k<h+5; k++){\
        for(x=0; x<w; x++){\
            tmp[x]= (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]);\
        }\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* step back so that tmp points at the third stored row */\
    tmp -= tmpStride*(h+5-2);\
    for(x=0; x<w; x++){\
        int tap[2+5];\
        for(k=0; k<h+5; k++){\
            tap[k]= tmp[(k-2)*tmpStride];\
        }\
        for(k=0; k<h; k++){\
            OP2(dst[k*dstStride], (tap[k+2]+tap[k+3])*20 - (tap[k+1]+tap[k+4])*5 + (tap[k]+tap[k+5]));\
        }\
        dst++;\
        tmp++;\
    }\
}\
\
/* 4x4 horizontal pass: reads src[-2..6] on each of 4 rows. */\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, y;\
    for(y=0; y<h; y++){\
        for(x=0; x<w; x++){\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
        }\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* 4x4 vertical pass: for each of 4 columns, loads rows -2..6 into tap[] */\
/* before any output of that column is written. */\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, k;\
    for(x=0; x<w; x++){\
        int tap[4+5];\
        for(k=0; k<h+5; k++){\
            tap[k]= src[(k-2)*srcStride];\
        }\
        for(k=0; k<h; k++){\
            OP(dst[k*dstStride], (tap[k+2]+tap[k+3])*20 - (tap[k+1]+tap[k+4])*5 + (tap[k]+tap[k+5]));\
        }\
        dst++;\
        src++;\
    }\
}\
\
/* 4x4 two-pass filter: horizontal sums for 4+5 rows (starting 2 rows above */\
/* src) are stored unscaled in tmp, then filtered vertically through OP2. */\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int w=4;\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, k;\
    src -= 2*srcStride;\
    for(k=0; k<h+5; k++){\
        for(x=0; x<w; x++){\
            tmp[x]= (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]);\
        }\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* step back so that tmp points at the third stored row */\
    tmp -= tmpStride*(h+5-2);\
    for(x=0; x<w; x++){\
        int tap[4+5];\
        for(k=0; k<h+5; k++){\
            tap[k]= tmp[(k-2)*tmpStride];\
        }\
        for(k=0; k<h; k++){\
            OP2(dst[k*dstStride], (tap[k+2]+tap[k+3])*20 - (tap[k+1]+tap[k+4])*5 + (tap[k]+tap[k+5]));\
        }\
        dst++;\
        tmp++;\
    }\
}\
\
/* 8x8 horizontal pass: reads src[-2..10] on each of 8 rows. */\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, y;\
    for(y=0; y<h; y++){\
        for(x=0; x<w; x++){\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
        }\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
/* 8x8 vertical pass: for each of 8 columns, loads rows -2..10 into tap[] */\
/* before any output of that column is written. */\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, k;\
    for(x=0; x<w; x++){\
        int tap[8+5];\
        for(k=0; k<h+5; k++){\
            tap[k]= src[(k-2)*srcStride];\
        }\
        for(k=0; k<h; k++){\
            OP(dst[k*dstStride], (tap[k+2]+tap[k+3])*20 - (tap[k+1]+tap[k+4])*5 + (tap[k]+tap[k+5]));\
        }\
        dst++;\
        src++;\
    }\
}\
\
/* 8x8 two-pass filter: horizontal sums for 8+5 rows (starting 2 rows above */\
/* src) are stored unscaled in tmp, then filtered vertically through OP2. */\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int w=8;\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int x, k;\
    src -= 2*srcStride;\
    for(k=0; k<h+5; k++){\
        for(x=0; x<w; x++){\
            tmp[x]= (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]);\
        }\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* step back so that tmp points at the third stored row */\
    tmp -= tmpStride*(h+5-2);\
    for(x=0; x<w; x++){\
        int tap[8+5];\
        for(k=0; k<h+5; k++){\
            tap[k]= tmp[(k-2)*tmpStride];\
        }\
        for(k=0; k<h; k++){\
            OP2(dst[k*dstStride], (tap[k+2]+tap[k+3])*20 - (tap[k+1]+tap[k+4])*5 + (tap[k]+tap[k+5]));\
        }\
        dst++;\
        tmp++;\
    }\
}\
\
/* 16x16 vertical pass assembled from four 8x8 calls: top-left, top-right, */\
/* bottom-left, bottom-right. */\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst, src, dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8*dstStride, src+8*srcStride, dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8*dstStride+8, src+8*srcStride+8, dstStride, srcStride);\
}\
\
/* 16x16 horizontal pass assembled from four 8x8 calls in the same order. */\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst, src, dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8*dstStride, src+8*srcStride, dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8*dstStride+8, src+8*srcStride+8, dstStride, srcStride);\
}\
\
/* 16x16 two-pass filter assembled from four 8x8 calls.  tmp is not advanced */\
/* between the upper and lower halves, so the lower calls reuse the same */\
/* tmp and tmp+8 areas as the upper ones. */\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst, tmp, src, dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8*dstStride, tmp, src+8*srcStride, dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8*dstStride+8, tmp+8, src+8*srcStride+8, dstStride, tmpStride, srcStride);\
}\
2289
2290#define H264_MC(OPNAME, SIZE) \
2291static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2292 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2293}\
2294\
2295static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2296 uint8_t half[SIZE*SIZE];\
2297 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2298 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2299}\
2300\
2301static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2302 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2303}\
2304\
2305static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2306 uint8_t half[SIZE*SIZE];\
2307 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2308 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2309}\
2310\
2311static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2312 uint8_t full[SIZE*(SIZE+5)];\
2313 uint8_t * const full_mid= full + SIZE*2;\
2314 uint8_t half[SIZE*SIZE];\
2315 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2316 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2317 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2318}\
2319\
2320static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2321 uint8_t full[SIZE*(SIZE+5)];\
2322 uint8_t * const full_mid= full + SIZE*2;\
2323 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2324 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2325}\
2326\
2327static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2328 uint8_t full[SIZE*(SIZE+5)];\
2329 uint8_t * const full_mid= full + SIZE*2;\
2330 uint8_t half[SIZE*SIZE];\
2331 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2332 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2333 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2334}\
2335\
2336static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2337 uint8_t full[SIZE*(SIZE+5)];\
2338 uint8_t * const full_mid= full + SIZE*2;\
2339 uint8_t halfH[SIZE*SIZE];\
2340 uint8_t halfV[SIZE*SIZE];\
2341 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2342 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2343 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2344 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2345}\
2346\
2347static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2348 uint8_t full[SIZE*(SIZE+5)];\
2349 uint8_t * const full_mid= full + SIZE*2;\
2350 uint8_t halfH[SIZE*SIZE];\
2351 uint8_t halfV[SIZE*SIZE];\
2352 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2353 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2354 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2355 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2356}\
2357\
2358static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2359 uint8_t full[SIZE*(SIZE+5)];\
2360 uint8_t * const full_mid= full + SIZE*2;\
2361 uint8_t halfH[SIZE*SIZE];\
2362 uint8_t halfV[SIZE*SIZE];\
2363 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2364 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2365 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2366 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2367}\
2368\
2369static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2370 uint8_t full[SIZE*(SIZE+5)];\
2371 uint8_t * const full_mid= full + SIZE*2;\
2372 uint8_t halfH[SIZE*SIZE];\
2373 uint8_t halfV[SIZE*SIZE];\
2374 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2375 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2376 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2377 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2378}\
2379\
2380static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2381 int16_t tmp[SIZE*(SIZE+5)];\
2382 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2383}\
2384\
2385static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2386 int16_t tmp[SIZE*(SIZE+5)];\
2387 uint8_t halfH[SIZE*SIZE];\
2388 uint8_t halfHV[SIZE*SIZE];\
2389 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2390 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2391 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2392}\
2393\
2394static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2395 int16_t tmp[SIZE*(SIZE+5)];\
2396 uint8_t halfH[SIZE*SIZE];\
2397 uint8_t halfHV[SIZE*SIZE];\
2398 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2399 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2400 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2401}\
2402\
2403static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2404 uint8_t full[SIZE*(SIZE+5)];\
2405 uint8_t * const full_mid= full + SIZE*2;\
2406 int16_t tmp[SIZE*(SIZE+5)];\
2407 uint8_t halfV[SIZE*SIZE];\
2408 uint8_t halfHV[SIZE*SIZE];\
2409 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2410 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2411 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2412 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2413}\
2414\
2415static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2416 uint8_t full[SIZE*(SIZE+5)];\
2417 uint8_t * const full_mid= full + SIZE*2;\
2418 int16_t tmp[SIZE*(SIZE+5)];\
2419 uint8_t halfV[SIZE*SIZE];\
2420 uint8_t halfHV[SIZE*SIZE];\
2421 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2422 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2423 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2424 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2425}\
2426
2427#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2428//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2429#define op_put(a, b) a = cm[((b) + 16)>>5]
2430#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2431#define op2_put(a, b) a = cm[((b) + 512)>>10]
2432
2433H264_LOWPASS(put_ , op_put, op2_put)
2434H264_LOWPASS(avg_ , op_avg, op2_avg)
80e44bc3 2435H264_MC(put_, 2)
0da71265
MN
2436H264_MC(put_, 4)
2437H264_MC(put_, 8)
2438H264_MC(put_, 16)
2439H264_MC(avg_, 4)
2440H264_MC(avg_, 8)
2441H264_MC(avg_, 16)
2442
2443#undef op_avg
2444#undef op_put
2445#undef op2_avg
2446#undef op2_put
2447#endif
2448
f66e4f5f
RD
/*
 * H.264 weighted prediction helpers.
 * op_scale1()/op_scale2() operate on the variables of the function that
 * expands them (block, weight, src, dst, weights, weightd, offset and
 * log2_denom) and clip every result to 0..255 with av_clip_uint8().
 */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/*
 * H264_WEIGHT(W,H) defines two functions for a WxH pixel block:
 *  - weight_h264_pixelsWxH_c():   scales block[] in place by weight, adds the
 *                                 (pre-scaled, rounded) offset and shifts
 *                                 right by log2_denom.
 *  - biweight_h264_pixelsWxH_c(): blends src (times weights) into dst
 *                                 (times weightd) and shifts right by
 *                                 log2_denom+1.
 * The offset is pre-scaled through an unsigned shift because it may be
 * negative and left-shifting a negative int is undefined behaviour.
 * The "if(W==n) continue;" tests are compile-time constants that cut the
 * unrolled row short for narrow blocks.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset = (unsigned)offset << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = (unsigned)((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2518
1457ab52 2519static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
55fde95e 2520 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1457ab52
MN
2521 int i;
2522
2523 for(i=0; i<h; i++){
2524 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2525 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2526 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2527 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2528 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2529 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2530 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2531 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2532 dst+=dstStride;
115329f1 2533 src+=srcStride;
1457ab52
MN
2534 }
2535}
2536
#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * mc00 entry points for 8- and 16-pixel blocks: each one forwards to the
 * matching put_/avg_ pixels helper with the block height.
 */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}

void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_c(dst, src, stride, 8);
}

void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_c(dst, src, stride, 16);
}

void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
b482e2d1 2554
64db55ae
KS
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/**
 * mc00 entry point: forwards to put_pixels8_c() with a height of 8.
 * The rnd argument is not used by this implementation.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd)
{
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2563
c6b237da
PI
#ifdef CONFIG_H264_ENCODER
/* H264 specific: prototype only, the definition is not in this file section */
void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_H264_ENCODER */
2568
1457ab52 2569static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
55fde95e 2570 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1457ab52
MN
2571 int i;
2572
2573 for(i=0; i<w; i++){
2574 const int src_1= src[ -srcStride];
2575 const int src0 = src[0 ];
2576 const int src1 = src[ srcStride];
2577 const int src2 = src[2*srcStride];
2578 const int src3 = src[3*srcStride];
2579 const int src4 = src[4*srcStride];
2580 const int src5 = src[5*srcStride];
2581 const int src6 = src[6*srcStride];
2582 const int src7 = src[7*srcStride];
2583 const int src8 = src[8*srcStride];
2584 const int src9 = src[9*srcStride];
2585 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2586 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2587 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2588 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2589 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2590 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2591 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2592 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2593 src++;
2594 dst++;
2595 }
2596}
2597
/** mc00: forwards to put_pixels8_c() with a height of 8. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
2601
/**
 * mc10: horizontally filters the 8x8 block into a temporary buffer and
 * combines it with the unshifted source through put_pixels8_l2().
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2607
/** mc20: writes the horizontally filtered 8x8 block straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2611
/**
 * mc30: horizontally filters the 8x8 block into a temporary buffer and
 * combines it with the source shifted right by one pixel (src+1) through
 * put_pixels8_l2().
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2617
/** mc02: writes the vertically filtered 8x8 block straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2621
/**
 * mc12: combines the vertically filtered source with the block that was
 * filtered horizontally and then vertically.
 * The horizontal pass covers 11 rows starting one row above src, because the
 * vertical filter reads rows -1 .. 9; rows+8 therefore addresses the row that
 * corresponds to src itself.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows[8*11];
    uint8_t vert[8*8];
    uint8_t both[8*8];

    wmv2_mspel8_h_lowpass(rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, rows+8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mc32: like mc12, but the vertically filtered input is taken one pixel to
 * the right (src+1) before it is combined with the block that was filtered
 * horizontally and then vertically.
 * The horizontal pass covers 11 rows starting one row above src; rows+8
 * addresses the row that corresponds to src itself.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows[8*11];
    uint8_t vert[8*8];
    uint8_t both[8*8];

    wmv2_mspel8_h_lowpass(rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, rows+8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mc22: filters horizontally into an 8x11 temporary (starting one row above
 * src), then filters that temporary vertically straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows[8*11];

    wmv2_mspel8_h_lowpass(rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rows+8, stride, 8, 8);
}
2645
332f9ac4 2646static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
73f51a4d 2647 if(ENABLE_ANY_H263) {
332f9ac4
MN
2648 int x;
2649 const int strength= ff_h263_loop_filter_strength[qscale];
115329f1 2650
332f9ac4
MN
2651 for(x=0; x<8; x++){
2652 int d1, d2, ad1;
2653 int p0= src[x-2*stride];
2654 int p1= src[x-1*stride];
2655 int p2= src[x+0*stride];
2656 int p3= src[x+1*stride];
2657 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2658
2659 if (d<-2*strength) d1= 0;
2660 else if(d<- strength) d1=-2*strength - d;
2661 else if(d< strength) d1= d;
2662 else if(d< 2*strength) d1= 2*strength - d;
2663 else d1= 0;
115329f1 2664
332f9ac4
MN
2665 p1 += d1;
2666 p2 -= d1;
2667 if(p1&256) p1= ~(p1>>31);
2668 if(p2&256) p2= ~(p2>>31);
115329f1 2669
332f9ac4
MN
2670 src[x-1*stride] = p1;
2671 src[x+0*stride] = p2;
2672
c26abfa5 2673 ad1= FFABS(d1)>>1;
115329f1 2674
f66e4f5f 2675 d2= av_clip((p0-p3)/4, -ad1, ad1);
115329f1 2676
332f9ac4
MN
2677 src[x-2*stride] = p0 - d2;
2678 src[x+ stride] = p3 + d2;
2679 }
73f51a4d 2680 }
332f9ac4
MN
2681}
2682
2683static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
73f51a4d 2684 if(ENABLE_ANY_H263) {
332f9ac4
MN
2685 int y;
2686 const int strength= ff_h263_loop_filter_strength[qscale];
115329f1 2687
332f9ac4
MN
2688 for(y=0; y<8; y++){
2689 int d1, d2, ad1;
2690 int p0= src[y*stride-2];
2691 int p1= src[y*stride-1];
2692 int p2= src[y*stride+0];
2693 int p3= src[y*stride+1];
2694 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2695
2696 if (d<-2*strength) d1= 0;
2697 else if(d<- strength) d1=-2*strength - d;
2698 else if(d< strength) d1= d;
2699 else if(d< 2*strength) d1= 2*strength - d;
2700 else d1= 0;
115329f1 2701
332f9ac4
MN
2702 p1 += d1;
2703 p2 -= d1;
2704 if(p1&256) p1= ~(p1>>31);
2705 if(p2&256) p2= ~(p2>>31);
115329f1 2706
332f9ac4
MN
2707 src[y*stride-1] = p1;
2708 src[y*stride+0] = p2;
2709
c26abfa5 2710 ad1= FFABS(d1)>>1;
115329f1 2711
f66e4f5f 2712 d2= av_clip((p0-p3)/4, -ad1, ad1);
115329f1 2713
332f9ac4
MN
2714 src[y*stride-2] = p0 - d2;
2715 src[y*stride+1] = p3 + d2;
2716 }
73f51a4d 2717 }
332f9ac4 2718}
1457ab52 2719
fdbbf2e0
MN
/**
 * In-place separable (1, 2, 1) smoothing of an 8x8 block.
 * The first and last row are only scaled in the vertical pass, and the first
 * and last column are only rescaled in the horizontal pass, so the block
 * border is never mixed with samples outside the block.
 * @param src    top-left sample of the block
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8*8];
    int row, col;

    /* vertical pass, results carry a gain of 4 */
    for(col=0; col<8; col++){
        vert[col      ] = 4*src[col           ];
        vert[col + 7*8] = 4*src[col + 7*stride];
    }
    for(row=1; row<7; row++){
        const uint8_t *line= src + row*stride;
        for(col=0; col<8; col++)
            vert[row*8 + col] = line[col - stride] + 2*line[col] + line[col + stride];
    }

    /* horizontal pass, removes the gain again (4 at the border, 16 inside) */
    for(row=0; row<8; row++){
        const int *in = vert + row*8;
        uint8_t *out  = src  + row*stride;

        out[0] = (in[0] + 2)>>2;
        out[7] = (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            out[col] = (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2746
/**
 * Filters 16 sample positions along one luma edge, in 4 groups of 4.
 * A group whose tc0[] entry is negative is skipped entirely.
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge (p samples at negative multiples)
 * @param ystride step along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 * @param tc0     4 clipping values, one per group of 4 positions
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* p1/q1 are only touched when the side is smooth; each such side widens the p0/q0 clip by 1 */
                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                /* "* 4" instead of "<< 2": q0 - p0 may be negative and shifting a negative int left is undefined */
                i_delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/** Luma filter with the step across the edge equal to stride and a step of 1 along it. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/** Luma filter with a step of 1 across the edge and a step of stride along it. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2795
/**
 * Filters 8 sample positions along one chroma edge, in 4 groups of 2.
 * A group whose tc0[] entry is zero or negative is skipped entirely.
 * Only p0 and q0 are modified.
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge (p samples at negative multiples)
 * @param ystride step along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 * @param tc0     4 clipping values, one per group of 2 positions
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                /* "* 4" instead of "<< 2": q0 - p0 may be negative and shifting a negative int left is undefined */
                int delta = av_clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/** Chroma filter with the step across the edge equal to stride and a step of 1 along it. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/** Chroma filter with a step of 1 across the edge and a step of stride along it. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2832
5cf08f23
LM
/**
 * Filters 8 sample positions along one chroma edge without a clipping table:
 * where the alpha/beta tests pass, p0 and q0 are replaced by fixed weighted
 * averages of their neighbours.
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge (p samples at negative multiples)
 * @param ystride step along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++, pix += ystride ) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        /* leave the position untouched unless all three differences are below their thresholds */
        if( FFABS( p0 - q0 ) >= alpha ||
            FFABS( p1 - p0 ) >= beta  ||
            FFABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/** Intra chroma filter with the step across the edge equal to stride and a step of 1 along it. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/** Intra chroma filter with a step of 1 across the edge and a step of stride along it. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
2860
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows of both blocks
 * @param h         number of rows
 * @return the accumulated absolute difference
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int x, y;

    for(y=0; y<h; y++) {
        for(x=0; x<16; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2888
/**
 * Sum of absolute differences between a 16-pixel-wide block and the avg2()
 * of each pix2 sample with its right-hand neighbour (reads pix2[0..16]).
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows of both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int x, y;

    for(y=0; y<h; y++) {
        for(x=0; x<16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2916
/**
 * Sum of absolute differences between a 16-pixel-wide block and the avg2()
 * of each pix2 sample with the sample one row below it (reads h+1 rows of
 * pix2).
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows of both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int x, y;

    for(y=0; y<h; y++) {
        for(x=0; x<16; x++)
            sum += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2946
/**
 * Sum of absolute differences between a 16-pixel-wide block and the avg4()
 * of each 2x2 neighbourhood of pix2 (reads pix2[0..16] on h+1 rows).
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows of both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int x, y;

    for(y=0; y<h; y++) {
        for(x=0; x<16; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2976
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows of both blocks
 * @param h         number of rows
 * @return the accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int x, y;

    for(y=0; y<h; y++) {
        for(x=0; x<8; x++)
            sum += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
2996
/**
 * Sum of absolute differences between an 8-pixel-wide block and the avg2()
 * of each pix2 sample with its right-hand neighbour (reads pix2[0..8]).
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows of both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int x, y;

    for(y=0; y<h; y++) {
        for(x=0; x<8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
3016
/**
 * Sum of absolute differences between an 8-pixel-wide block and the avg2()
 * of each pix2 sample with the sample one row below it (reads h+1 rows of
 * pix2).
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows of both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int x, y;

    for(y=0; y<h; y++) {
        for(x=0; x<8; x++)
            sum += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3038
/**
 * Sum of absolute differences between an 8-pixel-wide block and the avg4()
 * of each 2x2 neighbourhood of pix2 (reads pix2[0..8] on h+1 rows).
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows of both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int x, y;

    for(y=0; y<h; y++) {
        for(x=0; x<8; x++)
            sum += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
3060
bf4e3bd2
MR
3061static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3062 MpegEncContext *c = v;
e6a2ac34
MN
3063 int score1=0;
3064 int score2=0;
3065 int x,y;
d4c5d2ad 3066
e6a2ac34
MN
3067 for(y=0; y<h; y++){
3068 for(x=0; x<16; x++){
3069 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3070 }
3071 if(y+1<h){
3072 for(x=0; x<15; x++){
c26abfa5 3073 score2+= FFABS( s1[x ] - s1[x +stride]
e6a2ac34 3074 - s1[x+1] + s1[x+1+stride])
c26abfa5 3075 -FFABS( s2[x ] - s2[x +stride]
e6a2ac34
MN
3076 - s2[x+1] + s2[x+1+stride]);
3077 }
3078 }
3079 s1+= stride;
3080 s2+= stride;
3081 }
d4c5d2ad 3082
c26abfa5
DB
3083 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3084 else return score1 + FFABS(score2)*8;
e6a2ac34
MN
3085}
3086
bf4e3bd2
MR
3087static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3088 MpegEncContext *c = v;
e6a2ac34
MN
3089 int score1=0;
3090 int score2=0;
3091 int x,y;
115329f1 3092
e6a2ac34
MN
3093 for(y=0; y<h; y++){
3094 for(x=0; x<8; x++){
3095 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3096 }
3097 if(y+1<h){
3098 for(x=0; x<7; x++){
c26abfa5 3099 score2+= FFABS( s1[x ] - s1[x +stride]
e6a2ac34 3100 - s1[x+1] + s1[x+1+stride])
c26abfa5 3101 -FFABS( s2[x ] - s2[x +stride]
e6a2ac34
MN
3102 - s2[x+1] + s2[x+1+stride]);
3103 }
3104 }
3105 s1+= stride;
3106 s2+= stride;
3107 }
115329f1 3108
c26abfa5
DB
3109 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3110 else return score1 + FFABS(score2)*8;
e6a2ac34
MN
3111}
3112
364a1797
MN
3113static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3114 int i;
3115 unsigned int sum=0;
3116
3117 for(i=0; i<8*8; i++){
3118 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3119 int w= weight[i];
3120 b>>= RECON_SHIFT;
3121 assert(-512<b && b<512);
3122
3123 sum += (w*b)*(w*b)>>4;
3124 }
3125 return sum>>2;
3126}
3127
3128static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3129 int i;
3130
3131 for(i=0; i<8*8; i++){
3132 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
115329f1 3133 }
364a1797
MN
3134}
3135
a9badb51
MN
3136/**
3137 * permutes an 8x8 block.
2a5700de 3138 * @param block the block which will be permuted according to the given permutation vector
a9badb51
MN
3139 * @param permutation the permutation vector
3140 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
115329f1 3141 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2a5700de 3142 * (inverse) permutated to scantable order!
a9badb51 3143 */
0c1a9eda 3144void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
d962f6fd 3145{
7801d21d 3146 int i;
477ab036 3147 DCTELEM temp[64];
115329f1 3148
7801d21d 3149 if(last<=0) return;
90b5b51e 3150 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
d962f6fd 3151
7801d21d
MN
3152 for(i=0; i<=last; i++){
3153 const int j= scantable[i];
3154 temp[j]= block[j];
3155 block[j]=0;
3156 }
115329f1 3157
7801d21d
MN
3158 for(i=0; i<=last; i++){
3159 const int j= scantable[i];
3160 const int perm_j= permutation[j];
3161 block[perm_j]= temp[j];
3162 }
d962f6fd 3163}
e0eac44e 3164
622348f9
MN
/** Comparison function that ignores all of its arguments and always returns 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
3168
3169void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3170 int i;
115329f1 3171
622348f9 3172 memset(cmp, 0, sizeof(void*)*5);
115329f1 3173
622348f9
MN
3174 for(i=0; i<5; i++){
3175 switch(type&0xFF){
3176 case FF_CMP_SAD:
3177 cmp[i]= c->sad[i];
3178 break;
3179 case FF_CMP_SATD:
3180 cmp[i]= c->hadamard8_diff[i];
3181 break;
3182 case FF_CMP_SSE:
3183 cmp[i]= c->sse[i];
3184 break;
3185 case FF_CMP_DCT:
3186 cmp[i]= c->dct_sad[i];
3187 break;
27c61ac5
MN
3188 case FF_CMP_DCT264:
3189 cmp[i]= c->dct264_sad[i];
3190 break;
0fd6aea1
MN
3191 case FF_CMP_DCTMAX:
3192 cmp[i]= c->dct_max[i];
3193 break;
622348f9
MN
3194 case FF_CMP_PSNR:
3195 cmp[i]= c->quant_psnr[i];
3196 break;
3197 case FF_CMP_BIT:
3198 cmp[i]= c->bit[i];
3199 break;
3200 case FF_CMP_RD:
3201 cmp[i]= c->rd[i];
3202 break;
3203 case FF_CMP_VSAD:
3204 cmp[i]= c->vsad[i];
3205 break;
3206 case FF_CMP_VSSE:
3207 cmp[i]= c->vsse[i];
3208 break;
3209 case FF_CMP_ZERO:
3210 cmp[i]= zero_cmp;
3211 break;
e6a2ac34
MN
3212 case FF_CMP_NSSE:
3213 cmp[i]= c->nsse[i];
3214 break;
3a6fc8fa 3215#ifdef CONFIG_SNOW_ENCODER
26efc54e
MN
3216 case FF_CMP_W53:
3217 cmp[i]= c->w53[i];
3218 break;
3219 case FF_CMP_W97:
3220 cmp[i]= c->w97[i];
3221 break;
3a6fc8fa 3222#endif
622348f9
MN
3223 default:
3224 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3225 }
3226 }
3227}
3228
2a5700de
MN
3229/**
3230 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3231 */
eb4b3dd3 3232static void clear_blocks_c(DCTELEM *blocks)
649c00c9
MN
3233{
3234 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3235}
3236
11f18faf
MN
/**
 * Adds src[] to dst[] byte by byte; each sum wraps modulo 256.
 * @param dst bytes to update in place
 * @param src bytes to add
 * @param w   number of bytes
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i] += src[i];
}
3252
/**
 * Stores src1[] - src2[] into dst[] byte by byte; each difference wraps
 * modulo 256.
 * @param dst  destination
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i] = src1[i]-src2[i];
}
3268
84705403
MN
/**
 * Writes to dst[] the difference between each src2[] sample and its
 * predictor mid_pred(left, top, (left + top - topleft) & 0xFF), where top is
 * src1[i], left is the previous src2[] sample and topleft the previous
 * src1[] sample.
 * @param dst      receives the residuals
 * @param src1     samples used as the top neighbours
 * @param src2     samples being predicted
 * @param w        number of samples
 * @param left     in: initial left value, out: last src2[] sample
 * @param left_top in: initial top-left value, out: last src1[] sample
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left= *left;
    uint8_t cur_topleft= *left_top;
    int x;

    for(x=0; x<w; x++){
        const int pred= mid_pred(cur_left, src1[x], (cur_left + src1[x] - cur_topleft)&0xFF);
        cur_topleft= src1[x];
        cur_left= src2[x];
        dst[x]= cur_left - pred;
    }

    *left= cur_left;
    *left_top= cur_topleft;
}
3286
1457ab52
MN
/*
 * Butterfly helpers (sum/difference pairs).
 *
 * BUTTERFLY2 and BUTTERFLY1 expand to a single do { } while(0) statement, so
 * they must be followed by a semicolon and are safe inside unbraced
 * if/else bodies.
 */

/**
 * o1 = i1 + i2, o2 = i1 - i2.
 * i1 and i2 are each evaluated twice, so they must be free of side effects.
 * o1 is written before o2 is computed, so o1 must not alias i1 or i2.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
} while(0)

/**
 * In-place butterfly: x becomes x + y and y becomes x - y (old values).
 * Both operands are read into int temporaries before either is written.
 */
#define BUTTERFLY1(x,y) \
do {\
    const int bfly_a= (x);\
    const int bfly_b= (y);\
    (x)= bfly_a+bfly_b;\
    (y)= bfly_a-bfly_b;\
} while(0)

/**
 * |x + y| + |x - y| as an expression; nothing is written back.
 * x and y are each evaluated more than once.
 */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
1457ab52 3301
/**
 * Butterfly-transform the 8x8 difference src - dst and return the sum of
 * the absolute values of the result.
 *
 * Each row of the difference goes through three butterfly stages, then each
 * column goes through three stages; the last column stage is folded into
 * the absolute-value accumulation (BUTTERFLYA) and is not written back.
 *
 * @param s      unused
 * @param dst    first 8x8 block
 * @param src    second 8x8 block
 * @param stride distance in bytes between rows, used for both blocks
 * @param h      must be 8 (checked with assert)
 * @return accumulated sum of absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int coef[64];
    int total=0;
    int n;

    assert(h==8);

    // horizontal pass, one row of the difference block per iteration
    for(n=0; n<8; n++){
        const uint8_t * const sp= src + stride*n;
        const uint8_t * const dp= dst + stride*n;
        int * const row= coef + 8*n;

        BUTTERFLY2(row[0], row[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    // vertical pass, one column per iteration; col[8*k] is row k of it
    for(n=0; n<8; n++){
        int * const col= coef + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        // final stage: only the magnitudes are needed
        total += BUTTERFLYA(col[8*0], col[8*4])
                +BUTTERFLYA(col[8*1], col[8*5])
                +BUTTERFLYA(col[8*2], col[8*6])
                +BUTTERFLYA(col[8*3], col[8*7]);
    }

    return total;
}
3353
/**
 * Butterfly-transform an 8x8 pixel block and return the sum of the absolute
 * values of the result, with the magnitude of the first (row 0, column 0)
 * output subtracted again.
 *
 * Each row goes through three butterfly stages, then each column goes
 * through three stages; the last column stage is folded into the
 * absolute-value accumulation (BUTTERFLYA) and is not written back.
 *
 * @param s      unused
 * @param src    8x8 input block
 * @param dummy  unused
 * @param stride distance in bytes between rows of src
 * @param h      must be 8 (checked with assert)
 * @return accumulated sum of absolute transformed values minus
 *         |coef[0] + coef[32]|
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int coef[64];
    int total=0;
    int n;

    assert(h==8);

    // horizontal pass, one pixel row per iteration
    for(n=0; n<8; n++){
        const uint8_t * const sp= src + stride*n;
        int * const row= coef + 8*n;

        BUTTERFLY2(row[0], row[1], sp[0], sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2], sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4], sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6], sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    // vertical pass, one column per iteration; col[8*k] is row k of it
    for(n=0; n<8; n++){
        int * const col= coef + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        // final stage: only the magnitudes are needed
        total += BUTTERFLYA(col[8*0], col[8*4])
                +BUTTERFLYA(col[8*1], col[8*5])
                +BUTTERFLYA(col[8*2], col[8*6])
                +BUTTERFLYA(col[8*3], col[8*7]);
    }

    total -= FFABS(coef[8*0] + coef[8*4]); // -mean

    return total;
}
3401
bb198e19 3402static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 3403 MpegEncContext * const s= (MpegEncContext *)c;
1edbfe19 3404 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
76fbb024 3405 DCTELEM * const temp= (DCTELEM*)aligned_temp;
115329f1 3406
bb198e19 3407 assert(h==8);
1457ab52
MN
3408
3409 s->dsp.diff_pixels(temp, src1, src2, stride);
b0368839 3410 s->dsp.fdct(temp);
1edbfe19 3411 return s->dsp.sum_abs_dctelem(temp);
1457ab52
MN
3412}
3413
27c61ac5
MN
#ifdef CONFIG_GPL
/*
 * One 8-point integer transform pass, expanded as a braced block.
 *
 * The user must define SRC(n) (read input n) and DST(n, value) (emit
 * output n) before expanding it. All eight inputs are read into locals
 * before the first DST, so DST may overwrite the storage SRC reads from.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Compare two 8x8 blocks using the DCT8_1D integer transform.
 *
 * The block difference from dsp.diff_pixels is transformed along each
 * row in place; a second pass along each column adds the absolute value of
 * every output to the result instead of storing it.
 *
 * @param c      MpegEncContext, passed as void*
 * @param src1   first block, handed to dsp.diff_pixels
 * @param src2   second block, handed to dsp.diff_pixels
 * @param stride row stride handed to dsp.diff_pixels
 * @param h      not used by this function
 * @return sum of the absolute values of the second-pass outputs
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const enc= (MpegEncContext *)c;
    DCTELEM coef[8][8];
    int line;
    int total=0;

    enc->dsp.diff_pixels(coef[0], src1, src2, stride);

    // first pass: transform every row, results stored back into coef
#define SRC(x) coef[line][x]
#define DST(x,v) coef[line][x]= v
    for( line = 0; line < 8; line++ )
        DCT8_1D
#undef SRC
#undef DST

    // second pass: transform every column, accumulate magnitudes only
#define SRC(x) coef[x][line]
#define DST(x,v) total += FFABS(v)
    for( line = 0; line < 8; line++ )
        DCT8_1D
#undef SRC
#undef DST
    return total;
}
#endif
3466
0fd6aea1
MN
3467static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3468 MpegEncContext * const s= (MpegEncContext *)c;
68b51e58 3469 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
0fd6aea1
MN
3470 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3471 int sum=0, i;
115329f1 3472
0fd6aea1
MN
3473 assert(h==8);
3474
3475 s->dsp.diff_pixels(temp, src1, src2, stride);
3476 s->dsp.fdct(temp);
3477
3478 for(i=0; i<64; i++)
c26abfa5 3479 sum= FFMAX(sum, FFABS(temp[i]));
115329f1 3480
0fd6aea1
MN
3481 return sum;
3482}
3483
bb198e19 3484static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 3485 MpegEncContext * const s= (MpegEncContext *)c;
68b51e58 3486 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
76fbb024
MN
3487 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3488 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
1457ab52
MN
3489 int sum=0, i;
3490
bb198e19 3491 assert(h==8);
1457ab52 3492 s->mb_intra=0;
115329f1 3493
1457ab52 3494 s->dsp.diff_pixels(temp, src1, src2, stride);
115329f1 3495
1457ab52 3496 memcpy(bak, temp, 64*sizeof(DCTELEM));
115329f1 3497
67725183 3498 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
d50635cd 3499 s->dct_unquantize_inter(s, temp, 0, s->qscale);
115329f1
DB
3500 simple_idct(temp); //FIXME
3501
1457ab52
MN
3502 for(i=0; i<64; i++)
3503 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
115329f1 3504
1457ab52
MN
3505 return sum;
3506}
3507
bb198e19 3508static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 3509 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 3510 const uint8_t *scantable= s->intra_scantable.permutated;
68b51e58
SH
3511 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3512 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
76fbb024
MN
3513 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3514 uint8_t * const bak= (uint8_t*)aligned_bak;
3a87ac94
MN
3515 int i, last, run, bits, level, distoration, start_i;
3516 const int esc_length= s->ac_esc_length;
3517 uint8_t * length;
3518 uint8_t * last_length;
115329f1 3519
bb198e19
MN
3520 assert(h==8);
3521
67725183
MN
3522 for(i=0; i<8; i++){
3523 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3524 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3525 }
3a87ac94 3526