remove mpegvideo.c img resample dependency
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
8f2ab833 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
de6d9b64 5 *
ff4ec49e
FB
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
de6d9b64 10 *
ff4ec49e 11 * This library is distributed in the hope that it will be useful,
de6d9b64 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
de6d9b64 15 *
ff4ec49e
FB
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
5509bffa 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
7ff037e9 19 *
59fe111e 20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
de6d9b64 21 */
115329f1 22
983e3246
MN
23/**
24 * @file dsputil.c
25 * DSP utils
26 */
115329f1 27
de6d9b64
FB
28#include "avcodec.h"
29#include "dsputil.h"
1457ab52 30#include "mpegvideo.h"
b0368839 31#include "simple_idct.h"
65e4c8c9 32#include "faandct.h"
059715a4 33#include "snow.h"
5596c60c 34
88730be6
MR
/* Implemented in snow.c; used by the wavelet comparison helper in this file. */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
37
8b69867f
MN
38uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
39uint32_t squareTbl[512] = {0, };
de6d9b64 40
/*
 * Zigzag scan order for an 8x8 block: entry i is the raster position
 * (row * 8 + column) of the i-th coefficient in scan order.
 */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
51
10acc479
RS
/*
 * Zigzag scan specific to the 2-4-8 IDCT. NOTE: unlike the specification,
 * the two fields are interleaved in this table.
 */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
64
2f349de2 65/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
68b51e58 66DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
2f349de2 67
/*
 * Alternate horizontal scan order for an 8x8 block: entry i is the raster
 * position (row * 8 + column) of the i-th coefficient in scan order.
 */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
78
/*
 * Alternate vertical scan order for an 8x8 block: entry i is the raster
 * position (row * 8 + column) of the i-th coefficient in scan order.
 */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
89
/*
 * Reciprocal table for division by multiplication:
 *   ((uint64_t)a * inverse[b]) >> 32 == a / b
 * for all 0 <= a <= 65536 and 2 <= b <= 255.
 * The product must be formed in 64 bits; a 32-bit multiply would overflow.
 */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
125
b0368839
MN
/* Input coefficient permutation expected by simple_idct_mmx. */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
137
/**
 * Sum of all pixel values of a 16x16 block.
 * @param pix       first pixel of the top row of the block
 * @param line_size distance in bytes between the starts of consecutive rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += pix[j];
        pix += line_size;
    }
    return s;
}
159
0c1a9eda 160static int pix_norm1_c(uint8_t * pix, int line_size)
3aa102be
MN
161{
162 int s, i, j;
0c1a9eda 163 uint32_t *sq = squareTbl + 256;
3aa102be
MN
164
165 s = 0;
166 for (i = 0; i < 16; i++) {
bb270c08 167 for (j = 0; j < 16; j += 8) {
2a006cd3 168#if 0
bb270c08
DB
169 s += sq[pix[0]];
170 s += sq[pix[1]];
171 s += sq[pix[2]];
172 s += sq[pix[3]];
173 s += sq[pix[4]];
174 s += sq[pix[5]];
175 s += sq[pix[6]];
176 s += sq[pix[7]];
2a006cd3
FL
177#else
178#if LONG_MAX > 2147483647
bb270c08
DB
179 register uint64_t x=*(uint64_t*)pix;
180 s += sq[x&0xff];
181 s += sq[(x>>8)&0xff];
182 s += sq[(x>>16)&0xff];
183 s += sq[(x>>24)&0xff];
2a006cd3
FL
184 s += sq[(x>>32)&0xff];
185 s += sq[(x>>40)&0xff];
186 s += sq[(x>>48)&0xff];
187 s += sq[(x>>56)&0xff];
188#else
bb270c08
DB
189 register uint32_t x=*(uint32_t*)pix;
190 s += sq[x&0xff];
191 s += sq[(x>>8)&0xff];
192 s += sq[(x>>16)&0xff];
193 s += sq[(x>>24)&0xff];
2a006cd3
FL
194 x=*(uint32_t*)(pix+4);
195 s += sq[x&0xff];
196 s += sq[(x>>8)&0xff];
197 s += sq[(x>>16)&0xff];
198 s += sq[(x>>24)&0xff];
199#endif
200#endif
bb270c08
DB
201 pix += 8;
202 }
203 pix += line_size - 16;
3aa102be
MN
204 }
205 return s;
206}
207
3d2e8cce
MN
/**
 * Byte-swaps w 32-bit words from src into dst using bswap_32().
 * @param dst destination array, at least w words
 * @param src source array, at least w words
 * @param w   number of 32-bit words to process; nothing is written if w <= 0
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for(i=0; i<w; i++){
        dst[i]= bswap_32(src[i]);
    }
}
3aa102be 225
26efc54e
MN
226static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
227{
228 int s, i;
229 uint32_t *sq = squareTbl + 256;
230
231 s = 0;
232 for (i = 0; i < h; i++) {
233 s += sq[pix1[0] - pix2[0]];
234 s += sq[pix1[1] - pix2[1]];
235 s += sq[pix1[2] - pix2[2]];
236 s += sq[pix1[3] - pix2[3]];
237 pix1 += line_size;
238 pix2 += line_size;
239 }
240 return s;
241}
242
bb198e19 243static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
1457ab52
MN
244{
245 int s, i;
0c1a9eda 246 uint32_t *sq = squareTbl + 256;
1457ab52
MN
247
248 s = 0;
bb198e19 249 for (i = 0; i < h; i++) {
1457ab52
MN
250 s += sq[pix1[0] - pix2[0]];
251 s += sq[pix1[1] - pix2[1]];
252 s += sq[pix1[2] - pix2[2]];
253 s += sq[pix1[3] - pix2[3]];
254 s += sq[pix1[4] - pix2[4]];
255 s += sq[pix1[5] - pix2[5]];
256 s += sq[pix1[6] - pix2[6]];
257 s += sq[pix1[7] - pix2[7]];
258 pix1 += line_size;
259 pix2 += line_size;
260 }
261 return s;
262}
263
bb198e19 264static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
9c76bd48 265{
6b026927
FH
266 int s, i;
267 uint32_t *sq = squareTbl + 256;
9c76bd48
BF
268
269 s = 0;
bb198e19 270 for (i = 0; i < h; i++) {
6b026927
FH
271 s += sq[pix1[ 0] - pix2[ 0]];
272 s += sq[pix1[ 1] - pix2[ 1]];
273 s += sq[pix1[ 2] - pix2[ 2]];
274 s += sq[pix1[ 3] - pix2[ 3]];
275 s += sq[pix1[ 4] - pix2[ 4]];
276 s += sq[pix1[ 5] - pix2[ 5]];
277 s += sq[pix1[ 6] - pix2[ 6]];
278 s += sq[pix1[ 7] - pix2[ 7]];
279 s += sq[pix1[ 8] - pix2[ 8]];
280 s += sq[pix1[ 9] - pix2[ 9]];
281 s += sq[pix1[10] - pix2[10]];
282 s += sq[pix1[11] - pix2[11]];
283 s += sq[pix1[12] - pix2[12]];
284 s += sq[pix1[13] - pix2[13]];
285 s += sq[pix1[14] - pix2[14]];
286 s += sq[pix1[15] - pix2[15]];
2a006cd3 287
6b026927
FH
288 pix1 += line_size;
289 pix2 += line_size;
9c76bd48
BF
290 }
291 return s;
292}
293
26efc54e
MN
294
/**
 * Wavelet-domain difference between two blocks.
 *
 * The pixel differences, scaled by 16, are written to a 16x16 scratch
 * buffer, transformed in place with ff_spatial_dwt(), and the absolute
 * values of all resulting coefficients are summed with equal weight.
 * The sum divided by 4 is returned.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size row stride in bytes, applied to both blocks
 * @param w         block width; 8 selects 3 decomposition levels, anything
 *                  else 4 (the wrappers in this file pass 8 or 16)
 * @param h         block height; the scratch buffer only has room for 16 rows
 * @param type      forwarded to ff_spatial_dwt(); the wrappers in this file
 *                  pass 1 for the w53 variants and 0 for the w97 variants
 * @return the metric, or 0 when built without CONFIG_SNOW_ENCODER (the
 *         transform lives in snow.c and is not available in that case)
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
#ifdef CONFIG_SNOW_ENCODER //idwt is in snow.c
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[16*16];

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++) {
            /* multiply rather than shift: the difference may be negative */
            tmp[16*i+j] = (pix1[j] - pix2[j]) * 16;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);

    s=0;
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++) {
            s+= ABS(tmp[16*i+j]);
        }
    }
    assert(s>=0);

    return s>>2;
#else
    /* Always return a defined value; falling off the end of a non-void
     * function whose result is used is undefined behaviour. */
    return 0;
#endif
}
377
/** w_c() comparison for 8-pixel-wide blocks with transform type 1. */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}
381
/** w_c() comparison for 8-pixel-wide blocks with transform type 0. */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}
385
/** w_c() comparison for 16-pixel-wide blocks with transform type 1. */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
389
/** w_c() comparison for 16-pixel-wide blocks with transform type 0. */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
393
0c1a9eda 394static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
de6d9b64 395{
de6d9b64
FB
396 int i;
397
398 /* read the pixels */
de6d9b64 399 for(i=0;i<8;i++) {
c13e1abd
FH
400 block[0] = pixels[0];
401 block[1] = pixels[1];
402 block[2] = pixels[2];
403 block[3] = pixels[3];
404 block[4] = pixels[4];
405 block[5] = pixels[5];
406 block[6] = pixels[6];
407 block[7] = pixels[7];
408 pixels += line_size;
409 block += 8;
de6d9b64
FB
410 }
411}
412
0c1a9eda 413static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
bb270c08 414 const uint8_t *s2, int stride){
9dbcbd92
MN
415 int i;
416
417 /* read the pixels */
9dbcbd92 418 for(i=0;i<8;i++) {
c13e1abd
FH
419 block[0] = s1[0] - s2[0];
420 block[1] = s1[1] - s2[1];
421 block[2] = s1[2] - s2[2];
422 block[3] = s1[3] - s2[3];
423 block[4] = s1[4] - s2[4];
424 block[5] = s1[5] - s2[5];
425 block[6] = s1[6] - s2[6];
426 block[7] = s1[7] - s2[7];
9dbcbd92
MN
427 s1 += stride;
428 s2 += stride;
c13e1abd 429 block += 8;
9dbcbd92
MN
430 }
431}
432
433
0c1a9eda 434static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 435 int line_size)
de6d9b64 436{
de6d9b64 437 int i;
0c1a9eda 438 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 439
de6d9b64 440 /* read the pixels */
de6d9b64 441 for(i=0;i<8;i++) {
c13e1abd
FH
442 pixels[0] = cm[block[0]];
443 pixels[1] = cm[block[1]];
444 pixels[2] = cm[block[2]];
445 pixels[3] = cm[block[3]];
446 pixels[4] = cm[block[4]];
447 pixels[5] = cm[block[5]];
448 pixels[6] = cm[block[6]];
449 pixels[7] = cm[block[7]];
450
451 pixels += line_size;
452 block += 8;
de6d9b64
FB
453 }
454}
455
178fcca8 456static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 457 int line_size)
178fcca8
MN
458{
459 int i;
460 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 461
178fcca8
MN
462 /* read the pixels */
463 for(i=0;i<4;i++) {
464 pixels[0] = cm[block[0]];
465 pixels[1] = cm[block[1]];
466 pixels[2] = cm[block[2]];
467 pixels[3] = cm[block[3]];
468
469 pixels += line_size;
470 block += 8;
471 }
472}
473
9ca358b9 474static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 475 int line_size)
9ca358b9
MN
476{
477 int i;
478 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 479
9ca358b9
MN
480 /* read the pixels */
481 for(i=0;i<2;i++) {
482 pixels[0] = cm[block[0]];
483 pixels[1] = cm[block[1]];
484
485 pixels += line_size;
486 block += 8;
487 }
488}
489
115329f1 490static void put_signed_pixels_clamped_c(const DCTELEM *block,
f9ed9d85
MM
491 uint8_t *restrict pixels,
492 int line_size)
493{
494 int i, j;
495
496 for (i = 0; i < 8; i++) {
497 for (j = 0; j < 8; j++) {
498 if (*block < -128)
499 *pixels = 0;
500 else if (*block > 127)
501 *pixels = 255;
502 else
503 *pixels = (uint8_t)(*block + 128);
504 block++;
505 pixels++;
506 }
507 pixels += (line_size - 8);
508 }
509}
510
0c1a9eda 511static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
c13e1abd 512 int line_size)
de6d9b64 513{
de6d9b64 514 int i;
0c1a9eda 515 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 516
de6d9b64 517 /* read the pixels */
de6d9b64 518 for(i=0;i<8;i++) {
c13e1abd
FH
519 pixels[0] = cm[pixels[0] + block[0]];
520 pixels[1] = cm[pixels[1] + block[1]];
521 pixels[2] = cm[pixels[2] + block[2]];
522 pixels[3] = cm[pixels[3] + block[3]];
523 pixels[4] = cm[pixels[4] + block[4]];
524 pixels[5] = cm[pixels[5] + block[5]];
525 pixels[6] = cm[pixels[6] + block[6]];
526 pixels[7] = cm[pixels[7] + block[7]];
527 pixels += line_size;
528 block += 8;
de6d9b64
FB
529 }
530}
178fcca8
MN
531
532static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
533 int line_size)
534{
535 int i;
536 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 537
178fcca8
MN
538 /* read the pixels */
539 for(i=0;i<4;i++) {
540 pixels[0] = cm[pixels[0] + block[0]];
541 pixels[1] = cm[pixels[1] + block[1]];
542 pixels[2] = cm[pixels[2] + block[2]];
543 pixels[3] = cm[pixels[3] + block[3]];
544 pixels += line_size;
545 block += 8;
546 }
547}
9ca358b9
MN
548
549static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
550 int line_size)
551{
552 int i;
553 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 554
9ca358b9
MN
555 /* read the pixels */
556 for(i=0;i<2;i++) {
557 pixels[0] = cm[pixels[0] + block[0]];
558 pixels[1] = cm[pixels[1] + block[1]];
559 pixels += line_size;
560 block += 8;
561 }
562}
36940eca
LM
563
564static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
565{
566 int i;
567 for(i=0;i<8;i++) {
568 pixels[0] += block[0];
569 pixels[1] += block[1];
570 pixels[2] += block[2];
571 pixels[3] += block[3];
572 pixels[4] += block[4];
573 pixels[5] += block[5];
574 pixels[6] += block[6];
575 pixels[7] += block[7];
576 pixels += line_size;
577 block += 8;
578 }
579}
580
581static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
582{
583 int i;
584 for(i=0;i<4;i++) {
585 pixels[0] += block[0];
586 pixels[1] += block[1];
587 pixels[2] += block[2];
588 pixels[3] += block[3];
589 pixels += line_size;
590 block += 4;
591 }
592}
593
59fe111e
MN
594#if 0
595
596#define PIXOP2(OPNAME, OP) \
b3184779 597static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
598{\
599 int i;\
600 for(i=0; i<h; i++){\
601 OP(*((uint64_t*)block), LD64(pixels));\
602 pixels+=line_size;\
603 block +=line_size;\
604 }\
605}\
606\
45553457 607static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
608{\
609 int i;\
610 for(i=0; i<h; i++){\
611 const uint64_t a= LD64(pixels );\
612 const uint64_t b= LD64(pixels+1);\
613 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
614 pixels+=line_size;\
615 block +=line_size;\
616 }\
617}\
618\
45553457 619static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
620{\
621 int i;\
622 for(i=0; i<h; i++){\
623 const uint64_t a= LD64(pixels );\
624 const uint64_t b= LD64(pixels+1);\
625 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
626 pixels+=line_size;\
627 block +=line_size;\
628 }\
629}\
630\
45553457 631static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
632{\
633 int i;\
634 for(i=0; i<h; i++){\
635 const uint64_t a= LD64(pixels );\
636 const uint64_t b= LD64(pixels+line_size);\
637 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
638 pixels+=line_size;\
639 block +=line_size;\
640 }\
641}\
642\
45553457 643static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
644{\
645 int i;\
646 for(i=0; i<h; i++){\
647 const uint64_t a= LD64(pixels );\
648 const uint64_t b= LD64(pixels+line_size);\
649 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
650 pixels+=line_size;\
651 block +=line_size;\
652 }\
653}\
654\
45553457 655static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
656{\
657 int i;\
658 const uint64_t a= LD64(pixels );\
659 const uint64_t b= LD64(pixels+1);\
660 uint64_t l0= (a&0x0303030303030303ULL)\
661 + (b&0x0303030303030303ULL)\
662 + 0x0202020202020202ULL;\
663 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
664 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
665 uint64_t l1,h1;\
666\
667 pixels+=line_size;\
668 for(i=0; i<h; i+=2){\
669 uint64_t a= LD64(pixels );\
670 uint64_t b= LD64(pixels+1);\
671 l1= (a&0x0303030303030303ULL)\
672 + (b&0x0303030303030303ULL);\
673 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
674 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
675 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
676 pixels+=line_size;\
677 block +=line_size;\
678 a= LD64(pixels );\
679 b= LD64(pixels+1);\
680 l0= (a&0x0303030303030303ULL)\
681 + (b&0x0303030303030303ULL)\
682 + 0x0202020202020202ULL;\
683 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
684 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
685 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
686 pixels+=line_size;\
687 block +=line_size;\
688 }\
689}\
690\
45553457 691static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
692{\
693 int i;\
694 const uint64_t a= LD64(pixels );\
695 const uint64_t b= LD64(pixels+1);\
696 uint64_t l0= (a&0x0303030303030303ULL)\
697 + (b&0x0303030303030303ULL)\
698 + 0x0101010101010101ULL;\
699 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
700 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
701 uint64_t l1,h1;\
702\
703 pixels+=line_size;\
704 for(i=0; i<h; i+=2){\
705 uint64_t a= LD64(pixels );\
706 uint64_t b= LD64(pixels+1);\
707 l1= (a&0x0303030303030303ULL)\
708 + (b&0x0303030303030303ULL);\
709 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
710 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
711 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
712 pixels+=line_size;\
713 block +=line_size;\
714 a= LD64(pixels );\
715 b= LD64(pixels+1);\
716 l0= (a&0x0303030303030303ULL)\
717 + (b&0x0303030303030303ULL)\
718 + 0x0101010101010101ULL;\
719 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
720 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
721 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
722 pixels+=line_size;\
723 block +=line_size;\
724 }\
725}\
726\
45553457
ZK
727CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
728CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
729CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
730CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
731CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
732CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
733CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
59fe111e
MN
734
735#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
736#else // 64 bit variant
737
738#define PIXOP2(OPNAME, OP) \
669ac79c
MN
739static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
740 int i;\
741 for(i=0; i<h; i++){\
742 OP(*((uint16_t*)(block )), LD16(pixels ));\
743 pixels+=line_size;\
744 block +=line_size;\
745 }\
746}\
0da71265
MN
747static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
748 int i;\
749 for(i=0; i<h; i++){\
750 OP(*((uint32_t*)(block )), LD32(pixels ));\
751 pixels+=line_size;\
752 block +=line_size;\
753 }\
754}\
45553457 755static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
59fe111e
MN
756 int i;\
757 for(i=0; i<h; i++){\
758 OP(*((uint32_t*)(block )), LD32(pixels ));\
759 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
760 pixels+=line_size;\
761 block +=line_size;\
762 }\
763}\
45553457
ZK
764static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
765 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
b3184779 766}\
59fe111e 767\
b3184779
MN
768static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
769 int src_stride1, int src_stride2, int h){\
59fe111e
MN
770 int i;\
771 for(i=0; i<h; i++){\
b3184779
MN
772 uint32_t a,b;\
773 a= LD32(&src1[i*src_stride1 ]);\
774 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 775 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
b3184779
MN
776 a= LD32(&src1[i*src_stride1+4]);\
777 b= LD32(&src2[i*src_stride2+4]);\
d8085ea7 778 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
59fe111e
MN
779 }\
780}\
781\
b3184779
MN
782static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
783 int src_stride1, int src_stride2, int h){\
59fe111e
MN
784 int i;\
785 for(i=0; i<h; i++){\
b3184779
MN
786 uint32_t a,b;\
787 a= LD32(&src1[i*src_stride1 ]);\
788 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 789 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
b3184779
MN
790 a= LD32(&src1[i*src_stride1+4]);\
791 b= LD32(&src2[i*src_stride2+4]);\
d8085ea7 792 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
59fe111e
MN
793 }\
794}\
795\
0da71265
MN
796static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
797 int src_stride1, int src_stride2, int h){\
798 int i;\
799 for(i=0; i<h; i++){\
800 uint32_t a,b;\
801 a= LD32(&src1[i*src_stride1 ]);\
802 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 803 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
0da71265
MN
804 }\
805}\
806\
669ac79c
MN
807static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
808 int src_stride1, int src_stride2, int h){\
809 int i;\
810 for(i=0; i<h; i++){\
811 uint32_t a,b;\
812 a= LD16(&src1[i*src_stride1 ]);\
813 b= LD16(&src2[i*src_stride2 ]);\
814 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
815 }\
816}\
817\
b3184779
MN
818static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
819 int src_stride1, int src_stride2, int h){\
820 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
821 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
822}\
823\
824static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
825 int src_stride1, int src_stride2, int h){\
826 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
827 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
828}\
829\
45553457 830static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
831 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
832}\
833\
45553457 834static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
835 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
836}\
837\
45553457 838static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
839 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
840}\
841\
45553457 842static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
843 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
844}\
845\
846static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
847 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
848 int i;\
849 for(i=0; i<h; i++){\
b3184779
MN
850 uint32_t a, b, c, d, l0, l1, h0, h1;\
851 a= LD32(&src1[i*src_stride1]);\
852 b= LD32(&src2[i*src_stride2]);\
853 c= LD32(&src3[i*src_stride3]);\
854 d= LD32(&src4[i*src_stride4]);\
855 l0= (a&0x03030303UL)\
856 + (b&0x03030303UL)\
857 + 0x02020202UL;\
858 h0= ((a&0xFCFCFCFCUL)>>2)\
859 + ((b&0xFCFCFCFCUL)>>2);\
860 l1= (c&0x03030303UL)\
861 + (d&0x03030303UL);\
862 h1= ((c&0xFCFCFCFCUL)>>2)\
863 + ((d&0xFCFCFCFCUL)>>2);\
864 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
865 a= LD32(&src1[i*src_stride1+4]);\
866 b= LD32(&src2[i*src_stride2+4]);\
867 c= LD32(&src3[i*src_stride3+4]);\
868 d= LD32(&src4[i*src_stride4+4]);\
869 l0= (a&0x03030303UL)\
870 + (b&0x03030303UL)\
871 + 0x02020202UL;\
872 h0= ((a&0xFCFCFCFCUL)>>2)\
873 + ((b&0xFCFCFCFCUL)>>2);\
874 l1= (c&0x03030303UL)\
875 + (d&0x03030303UL);\
876 h1= ((c&0xFCFCFCFCUL)>>2)\
877 + ((d&0xFCFCFCFCUL)>>2);\
878 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
879 }\
880}\
669ac79c
MN
881\
882static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
883 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
884}\
885\
886static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
887 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
888}\
889\
890static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
891 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
892}\
893\
894static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
895 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
896}\
897\
b3184779
MN
898static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
899 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
900 int i;\
901 for(i=0; i<h; i++){\
b3184779
MN
902 uint32_t a, b, c, d, l0, l1, h0, h1;\
903 a= LD32(&src1[i*src_stride1]);\
904 b= LD32(&src2[i*src_stride2]);\
905 c= LD32(&src3[i*src_stride3]);\
906 d= LD32(&src4[i*src_stride4]);\
907 l0= (a&0x03030303UL)\
908 + (b&0x03030303UL)\
909 + 0x01010101UL;\
910 h0= ((a&0xFCFCFCFCUL)>>2)\
911 + ((b&0xFCFCFCFCUL)>>2);\
912 l1= (c&0x03030303UL)\
913 + (d&0x03030303UL);\
914 h1= ((c&0xFCFCFCFCUL)>>2)\
915 + ((d&0xFCFCFCFCUL)>>2);\
916 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
917 a= LD32(&src1[i*src_stride1+4]);\
918 b= LD32(&src2[i*src_stride2+4]);\
919 c= LD32(&src3[i*src_stride3+4]);\
920 d= LD32(&src4[i*src_stride4+4]);\
921 l0= (a&0x03030303UL)\
922 + (b&0x03030303UL)\
923 + 0x01010101UL;\
924 h0= ((a&0xFCFCFCFCUL)>>2)\
925 + ((b&0xFCFCFCFCUL)>>2);\
926 l1= (c&0x03030303UL)\
927 + (d&0x03030303UL);\
928 h1= ((c&0xFCFCFCFCUL)>>2)\
929 + ((d&0xFCFCFCFCUL)>>2);\
930 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
931 }\
932}\
b3184779
MN
933static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
934 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
935 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
936 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
937}\
938static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
939 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
940 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
941 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
942}\
59fe111e 943\
669ac79c
MN
944static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
945{\
946 int i, a0, b0, a1, b1;\
947 a0= pixels[0];\
948 b0= pixels[1] + 2;\
949 a0 += b0;\
950 b0 += pixels[2];\
951\
952 pixels+=line_size;\
953 for(i=0; i<h; i+=2){\
954 a1= pixels[0];\
955 b1= pixels[1];\
956 a1 += b1;\
957 b1 += pixels[2];\
958\
959 block[0]= (a1+a0)>>2; /* FIXME non put */\
960 block[1]= (b1+b0)>>2;\
961\
962 pixels+=line_size;\
963 block +=line_size;\
964\
965 a0= pixels[0];\
966 b0= pixels[1] + 2;\
967 a0 += b0;\
968 b0 += pixels[2];\
969\
970 block[0]= (a1+a0)>>2;\
971 block[1]= (b1+b0)>>2;\
972 pixels+=line_size;\
973 block +=line_size;\
974 }\
975}\
976\
977static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
978{\
979 int i;\
980 const uint32_t a= LD32(pixels );\
981 const uint32_t b= LD32(pixels+1);\
982 uint32_t l0= (a&0x03030303UL)\
983 + (b&0x03030303UL)\
984 + 0x02020202UL;\
985 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
986 + ((b&0xFCFCFCFCUL)>>2);\
987 uint32_t l1,h1;\
988\
989 pixels+=line_size;\
990 for(i=0; i<h; i+=2){\
991 uint32_t a= LD32(pixels );\
992 uint32_t b= LD32(pixels+1);\
993 l1= (a&0x03030303UL)\
994 + (b&0x03030303UL);\
995 h1= ((a&0xFCFCFCFCUL)>>2)\
996 + ((b&0xFCFCFCFCUL)>>2);\
997 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
998 pixels+=line_size;\
999 block +=line_size;\
1000 a= LD32(pixels );\
1001 b= LD32(pixels+1);\
1002 l0= (a&0x03030303UL)\
1003 + (b&0x03030303UL)\
1004 + 0x02020202UL;\
1005 h0= ((a&0xFCFCFCFCUL)>>2)\
1006 + ((b&0xFCFCFCFCUL)>>2);\
1007 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1008 pixels+=line_size;\
1009 block +=line_size;\
1010 }\
1011}\
1012\
45553457 1013static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
1014{\
1015 int j;\
1016 for(j=0; j<2; j++){\
1017 int i;\
1018 const uint32_t a= LD32(pixels );\
1019 const uint32_t b= LD32(pixels+1);\
1020 uint32_t l0= (a&0x03030303UL)\
1021 + (b&0x03030303UL)\
1022 + 0x02020202UL;\
1023 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1024 + ((b&0xFCFCFCFCUL)>>2);\
1025 uint32_t l1,h1;\
1026\
1027 pixels+=line_size;\
1028 for(i=0; i<h; i+=2){\
1029 uint32_t a= LD32(pixels );\
1030 uint32_t b= LD32(pixels+1);\
1031 l1= (a&0x03030303UL)\
1032 + (b&0x03030303UL);\
1033 h1= ((a&0xFCFCFCFCUL)>>2)\
1034 + ((b&0xFCFCFCFCUL)>>2);\
1035 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1036 pixels+=line_size;\
1037 block +=line_size;\
1038 a= LD32(pixels );\
1039 b= LD32(pixels+1);\
1040 l0= (a&0x03030303UL)\
1041 + (b&0x03030303UL)\
1042 + 0x02020202UL;\
1043 h0= ((a&0xFCFCFCFCUL)>>2)\
1044 + ((b&0xFCFCFCFCUL)>>2);\
1045 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1046 pixels+=line_size;\
1047 block +=line_size;\
1048 }\
1049 pixels+=4-line_size*(h+1);\
1050 block +=4-line_size*h;\
1051 }\
1052}\
1053\
45553457 1054static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
1055{\
1056 int j;\
1057 for(j=0; j<2; j++){\
1058 int i;\
1059 const uint32_t a= LD32(pixels );\
1060 const uint32_t b= LD32(pixels+1);\
1061 uint32_t l0= (a&0x03030303UL)\
1062 + (b&0x03030303UL)\
1063 + 0x01010101UL;\
1064 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1065 + ((b&0xFCFCFCFCUL)>>2);\
1066 uint32_t l1,h1;\
1067\
1068 pixels+=line_size;\
1069 for(i=0; i<h; i+=2){\
1070 uint32_t a= LD32(pixels );\
1071 uint32_t b= LD32(pixels+1);\
1072 l1= (a&0x03030303UL)\
1073 + (b&0x03030303UL);\
1074 h1= ((a&0xFCFCFCFCUL)>>2)\
1075 + ((b&0xFCFCFCFCUL)>>2);\
1076 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1077 pixels+=line_size;\
1078 block +=line_size;\
1079 a= LD32(pixels );\
1080 b= LD32(pixels+1);\
1081 l0= (a&0x03030303UL)\
1082 + (b&0x03030303UL)\
1083 + 0x01010101UL;\
1084 h0= ((a&0xFCFCFCFCUL)>>2)\
1085 + ((b&0xFCFCFCFCUL)>>2);\
1086 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1087 pixels+=line_size;\
1088 block +=line_size;\
1089 }\
1090 pixels+=4-line_size*(h+1);\
1091 block +=4-line_size*h;\
1092 }\
1093}\
1094\
45553457
ZK
1095CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1096CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1097CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1098CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1099CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1100CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1101CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1102CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
b3184779 1103
d8085ea7 1104#define op_avg(a, b) a = rnd_avg32(a, b)
59fe111e 1105#endif
59fe111e
MN
1106#define op_put(a, b) a = b
1107
1108PIXOP2(avg, op_avg)
1109PIXOP2(put, op_put)
1110#undef op_avg
1111#undef op_put
1112
de6d9b64
FB
/* Rounded average of two / four values.
 * Every argument is parenthesized so that expression arguments built from
 * lower-precedence operators (e.g. a|b, c?x:y) group the way the caller
 * wrote them. Each argument is still evaluated exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1115
c0a0170c
MN
/**
 * Single-stride front end for put_no_rnd_pixels16_l2().
 * Forwards dst and the two sources a and b, using the same stride for the
 * destination and for both sources.
 * @param h number of rows, passed through unchanged
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1119
/**
 * Single-stride front end for put_no_rnd_pixels8_l2().
 * Forwards dst and the two sources a and b, using the same stride for the
 * destination and for both sources.
 * @param h number of rows, passed through unchanged
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
073b013d 1123
/**
 * Bilinear interpolation of an 8 pixel wide block with 1/16 sample weights.
 *
 * Each output pixel blends the source pixel at the same position with its
 * right, lower and lower-right neighbours. The four weights always sum to
 * 256, hence the final shift by 8.
 *
 * @param dst     output block, 8 bytes written per row
 * @param src     input block; 9 bytes are read from each of h+1 rows
 * @param stride  distance in bytes between rows, shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbours (out of 16)
 * @param y16     vertical weight of the lower neighbours (out of 16)
 * @param rounder value added to the weighted sum before the shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i, x;

    for(i=0; i<h; i++)
    {
        const uint8_t *below= src + stride; /* row under the current one */

        for(x=0; x<8; x++)
            dst[x]= (A*src[x] + B*src[x+1] + C*below[x] + D*below[x+1] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
1146
/**
 * Resample an 8 pixel wide block along an affine path through src, with
 * bilinear interpolation and clamping at the picture borders.
 *
 * The source position of an output pixel is a fixed-point value: it is
 * first shifted right by 16, then its low `shift` bits are the fractional
 * part and the remaining bits are the integer sample coordinate.
 *
 * @param dst    output block, addressed as dst[y*stride + x]
 * @param src    source picture, addressed as src[x + y*stride]
 * @param stride distance in bytes between rows, shared by dst and src
 * @param h      number of output rows (8 pixels are produced per row)
 * @param ox     source x position of the first pixel of the first row
 * @param oy     source y position of the first pixel of the first row
 * @param dxx    x position step per output pixel
 * @param dxy    x position step per output row
 * @param dyx    y position step per output pixel
 * @param dyy    y position step per output row
 * @param shift  number of fractional bits of the sample position
 * @param r      value added to the weighted sum before the final shift
 * @param width  width of the readable source area
 * @param height height of the readable source area
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* From here on width/height hold the last valid column/row, so the
     * in-range tests below also guarantee that the +1 neighbour exists. */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* The unsigned compare rejects negative coordinates as well. */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: blend all four neighbours */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* row out of range: clamp the row, blend horizontally only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* column out of range: clamp the column, blend vertically only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both out of range: copy the clamped sample unchanged */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
669ac79c
MN
1204
/**
 * Width dispatcher for the unfiltered "put" case: forwards to the
 * put_pixelsN_c routine matching width (2, 4, 8 or 16).
 * Any other width leaves dst untouched.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    default: break; /* unsupported width: nothing is written */
    }
}
1213
/**
 * Horizontal 2:1 blend: dst[j] = (2*src[j] + src[j+1] + 1) / 3.
 * The division by 3 is a multiply by 683 followed by a shift by 11
 * (683/2048 ~= 1/3).
 * Reads one byte to the right of the width columns in every row.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1224
/**
 * Horizontal 1:2 blend: dst[j] = (src[j] + 2*src[j+1] + 1) / 3.
 * The division by 3 is a multiply by 683 followed by a shift by 11
 * (683/2048 ~= 1/3).
 * Reads one byte to the right of the width columns in every row.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
115329f1 1235
669ac79c
MN
/**
 * Vertical 2:1 blend: dst[j] = (2*src[j] + src[j+stride] + 1) / 3.
 * The division by 3 is a multiply by 683 followed by a shift by 11
 * (683/2048 ~= 1/3).
 * Reads one row below the height rows being produced.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
115329f1 1246
669ac79c
MN
/**
 * Two-dimensional blend of a 2x2 neighbourhood with weights
 *   4 3
 *   3 2
 * (top-left weighted most). The weights sum to 12; the division by 12 is a
 * multiply by 2731 followed by a shift by 15 (2731/32768 ~= 1/12), with 6
 * added first for rounding.
 * Reads one column to the right and one row below the produced block.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1257
/**
 * Two-dimensional blend of a 2x2 neighbourhood with weights
 *   3 2
 *   4 3
 * (bottom-left weighted most). The weights sum to 12; the division by 12 is
 * a multiply by 2731 followed by a shift by 15 (2731/32768 ~= 1/12), with 6
 * added first for rounding.
 * Reads one column to the right and one row below the produced block.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1268
/**
 * Vertical 1:2 blend: dst[j] = (src[j] + 2*src[j+stride] + 1) / 3.
 * The division by 3 is a multiply by 683 followed by a shift by 11
 * (683/2048 ~= 1/3).
 * Reads one row below the height rows being produced.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1279
/**
 * Two-dimensional blend of a 2x2 neighbourhood with weights
 *   3 4
 *   2 3
 * (top-right weighted most). The weights sum to 12; the division by 12 is a
 * multiply by 2731 followed by a shift by 15 (2731/32768 ~= 1/12), with 6
 * added first for rounding.
 * Reads one column to the right and one row below the produced block.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1290
/**
 * Two-dimensional blend of a 2x2 neighbourhood with weights
 *   2 3
 *   3 4
 * (bottom-right weighted most). The weights sum to 12; the division by 12
 * is a multiply by 2731 followed by a shift by 15 (2731/32768 ~= 1/12),
 * with 6 added first for rounding.
 * Reads one column to the right and one row below the produced block.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
da3b9756
MM
1301
/**
 * Width dispatcher for the unfiltered "avg" case: forwards to the
 * avg_pixelsN_c routine matching width (2, 4, 8 or 16).
 * Any other width leaves dst untouched.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    default: break; /* unsupported width: nothing is written */
    }
}
1310
/**
 * Horizontal 2:1 blend, (2*src[j] + src[j+1] + 1) / 3, then averaged with
 * the value already in dst, rounding up: dst = (dst + blend + 1) >> 1.
 * The division by 3 is a multiply by 683 followed by a shift by 11.
 * Reads one byte to the right of the width columns in every row.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1321
/**
 * Horizontal 1:2 blend, (src[j] + 2*src[j+1] + 1) / 3, then averaged with
 * the value already in dst, rounding up: dst = (dst + blend + 1) >> 1.
 * The division by 3 is a multiply by 683 followed by a shift by 11.
 * Reads one byte to the right of the width columns in every row.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
115329f1 1332
da3b9756
MM
/**
 * Vertical 2:1 blend, (2*src[j] + src[j+stride] + 1) / 3, then averaged
 * with the value already in dst, rounding up: dst = (dst + blend + 1) >> 1.
 * The division by 3 is a multiply by 683 followed by a shift by 11.
 * Reads one row below the height rows being produced.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
115329f1 1343
da3b9756
MM
/**
 * Two-dimensional blend of a 2x2 neighbourhood with weights
 *   4 3
 *   3 2
 * divided by 12 (multiply by 2731, shift by 15, 6 added for rounding), then
 * averaged with the value already in dst: dst = (dst + blend + 1) >> 1.
 * Reads one column to the right and one row below the produced block.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1354
/**
 * Two-dimensional blend of a 2x2 neighbourhood with weights
 *   3 2
 *   4 3
 * divided by 12 (multiply by 2731, shift by 15, 6 added for rounding), then
 * averaged with the value already in dst: dst = (dst + blend + 1) >> 1.
 * Reads one column to the right and one row below the produced block.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1365
/**
 * Vertical 1:2 blend, (src[j] + 2*src[j+stride] + 1) / 3, then averaged
 * with the value already in dst, rounding up: dst = (dst + blend + 1) >> 1.
 * The division by 3 is a multiply by 683 followed by a shift by 11.
 * Reads one row below the height rows being produced.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1376
/**
 * Two-dimensional blend of a 2x2 neighbourhood with weights
 *   3 4
 *   2 3
 * divided by 12 (multiply by 2731, shift by 15, 6 added for rounding), then
 * averaged with the value already in dst: dst = (dst + blend + 1) >> 1.
 * Reads one column to the right and one row below the produced block.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1387
/**
 * Two-dimensional blend of a 2x2 neighbourhood with weights
 *   2 3
 *   3 4
 * divided by 12 (multiply by 2731, shift by 15, 6 added for rounding), then
 * averaged with the value already in dst: dst = (dst + blend + 1) >> 1.
 * Reads one column to the right and one row below the produced block.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
669ac79c
MN
#if 0
/* Disabled. When enabled, TPEL_WIDTH(width) generates one fixed-width
 * wrapper per put_tpel_pixels_mcXY_c routine, each passing `width` as the
 * width argument. The wrapper bodies are plain calls; a leading `void` in
 * front of a call would turn it into an invalid declaration. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1419
0da71265
MN
/**
 * Generates OPNAME##h264_chroma_mc<W>_c: bilinear interpolation of a block
 * W pixels wide and h rows high with 1/8 sample weights.
 *
 * x and y (each 0..7, checked by assert) select the weights of the right
 * and lower neighbours. The four weights A..D always sum to 64; OP receives
 * the unscaled weighted sum and is responsible for rounding and scaling.
 * For every row W+1 source bytes are read, from the current row and from
 * the row below it.
 */
#define H264_CHROMA_MC_FUNC(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        for(j=0; j<W; j++)\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        dst+= stride;\
        src+= stride;\
    }\
}

/* Emits the 2, 4 and 8 pixel wide variants for one store operation. */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 2)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 4)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 8)

/* b is the weighted sum scaled by 64: (b + 32)>>6 rounds it back to pixel
 * range. op_avg additionally averages with the old destination value,
 * rounding up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1490
80e44bc3
MN
/**
 * Copy a block 2 bytes wide and h rows high.
 * Each row is moved as one 16 bit LD16/ST16 pair; dst and src advance by
 * their own strides.
 */
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST16(dst   , LD16(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1501
0da71265
MN
/**
 * Copy a block 4 bytes wide and h rows high.
 * Each row is moved as one 32 bit LD32/ST32 pair; dst and src advance by
 * their own strides.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1512
/**
 * Copy a block 8 bytes wide and h rows high.
 * Each row is moved as two 32 bit LD32/ST32 pairs; dst and src advance by
 * their own strides.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1524
/**
 * Copy a block 16 bytes wide and h rows high.
 * Each row is moved as four 32 bit LD32/ST32 pairs; dst and src advance by
 * their own strides.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}
073b013d 1538
/**
 * Copy a block 17 bytes wide and h rows high.
 * Each row is moved as four 32 bit LD32/ST32 pairs plus one trailing byte;
 * dst and src advance by their own strides.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16]; /* the 17th column */
        dst+=dstStride;
        src+=srcStride;
    }
}
1553
/**
 * Copy a block 9 bytes wide and h rows high.
 * Each row is moved as two 32 bit LD32/ST32 pairs plus one trailing byte;
 * dst and src advance by their own strides.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8]; /* the 9th column */
        dst+=dstStride;
        src+=srcStride;
    }
}
1566
826f429a 1567
b3184779 1568#define QPEL_MC(r, OPNAME, RND, OP) \
0c1a9eda
ZK
1569static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1570 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779
MN
1571 int i;\
1572 for(i=0; i<h; i++)\
1573 {\
1574 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1575 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1576 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1577 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1578 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1579 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1580 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1581 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1582 dst+=dstStride;\
1583 src+=srcStride;\
1584 }\
44eb4951
MN
1585}\
1586\
0c1a9eda 1587static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
db794953 1588 const int w=8;\
0c1a9eda 1589 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779
MN
1590 int i;\
1591 for(i=0; i<w; i++)\
1592 {\
1593 const int src0= src[0*srcStride];\
1594 const int src1= src[1*srcStride];\
1595 const int src2= src[2*srcStride];\
1596 const int src3= src[3*srcStride];\
1597 const int src4= src[4*srcStride];\
1598 const int src5= src[5*srcStride];\
1599 const int src6= src[6*srcStride];\
1600 const int src7= src[7*srcStride];\
1601 const int src8= src[8*srcStride];\
1602 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1603 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1604 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1605 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1606 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1607 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1608 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1609 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1610 dst++;\
1611 src++;\
1612 }\
1613}\
1614\
0c1a9eda
ZK
1615static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1616 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779 1617 int i;\
826f429a 1618 \
b3184779
MN
1619 for(i=0; i<h; i++)\
1620 {\
1621 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1622 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1623 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1624 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1625 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1626 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1627 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1628 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1629 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1630 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1631 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1632 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1633 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1634 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1635 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1636 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1637 dst+=dstStride;\
1638 src+=srcStride;\
1639 }\
1640}\
1641\
0c1a9eda
ZK
1642static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1643 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779 1644 int i;\
826f429a 1645 const int w=16;\
b3184779
MN
1646 for(i=0; i<w; i++)\
1647 {\
1648 const int src0= src[0*srcStride];\
1649 const int src1= src[1*srcStride];\
1650 const int src2= src[2*srcStride];\
1651 const int src3= src[3*srcStride];\
1652 const int src4= src[4*srcStride];\
1653 const int src5= src[5*srcStride];\
1654 const int src6= src[6*srcStride];\
1655 const int src7= src[7*srcStride];\
1656 const int src8= src[8*srcStride];\
1657 const int src9= src[9*srcStride];\
1658 const int src10= src[10*srcStride];\
1659 const int src11= src[11*srcStride];\
1660 const int src12= src[12*srcStride];\
1661 const int src13= src[13*srcStride];\
1662 const int src14= src[14*srcStride];\
1663 const int src15= src[15*srcStride];\
1664 const int src16= src[16*srcStride];\
1665 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1666 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1667 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1668 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1669 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1670 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1671 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1672 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1673 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1674 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1675 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1676 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1677 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1678 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1679 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1680 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1681 dst++;\
1682 src++;\
1683 }\
1684}\
1685\
0c1a9eda 1686static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1687 OPNAME ## pixels8_c(dst, src, stride, 8);\
b3184779
MN
1688}\
1689\
0c1a9eda
ZK
1690static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1691 uint8_t half[64];\
b3184779
MN
1692 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1693 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
44eb4951
MN
1694}\
1695\
0c1a9eda 1696static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1697 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
44eb4951
MN
1698}\
1699\
0c1a9eda
ZK
1700static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1701 uint8_t half[64];\
b3184779
MN
1702 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1703 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
44eb4951
MN
1704}\
1705\
0c1a9eda
ZK
1706static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1707 uint8_t full[16*9];\
1708 uint8_t half[64];\
b3184779 1709 copy_block9(full, src, 16, stride, 9);\
db794953 1710 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1711 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
44eb4951
MN
1712}\
1713\
0c1a9eda
ZK
1714static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1715 uint8_t full[16*9];\
b3184779 1716 copy_block9(full, src, 16, stride, 9);\
db794953 1717 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
44eb4951
MN
1718}\
1719\
0c1a9eda
ZK
1720static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1721 uint8_t full[16*9];\
1722 uint8_t half[64];\
b3184779 1723 copy_block9(full, src, 16, stride, 9);\
db794953 1724 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1725 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
44eb4951 1726}\
0c1a9eda
ZK
1727void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1728 uint8_t full[16*9];\
1729 uint8_t halfH[72];\
1730 uint8_t halfV[64];\
1731 uint8_t halfHV[64];\
b3184779
MN
1732 copy_block9(full, src, 16, stride, 9);\
1733 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1734 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1735 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1736 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1737}\
0c1a9eda
ZK
1738static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1739 uint8_t full[16*9];\
1740 uint8_t halfH[72];\
1741 uint8_t halfHV[64];\
db794953
MN
1742 copy_block9(full, src, 16, stride, 9);\
1743 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1744 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1745 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1746 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1747}\
0c1a9eda
ZK
1748void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1749 uint8_t full[16*9];\
1750 uint8_t halfH[72];\
1751 uint8_t halfV[64];\
1752 uint8_t halfHV[64];\
b3184779
MN
1753 copy_block9(full, src, 16, stride, 9);\
1754 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1755 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1756 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1757 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1758}\
0c1a9eda
ZK
1759static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1760 uint8_t full[16*9];\
1761 uint8_t halfH[72];\
1762 uint8_t halfHV[64];\
db794953
MN
1763 copy_block9(full, src, 16, stride, 9);\
1764 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1765 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1766 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1767 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1768}\
0c1a9eda
ZK
1769void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1770 uint8_t full[16*9];\
1771 uint8_t halfH[72];\
1772 uint8_t halfV[64];\
1773 uint8_t halfHV[64];\
b3184779
MN
1774 copy_block9(full, src, 16, stride, 9);\
1775 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1776 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1777 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1778 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1779}\
0c1a9eda
ZK
1780static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1781 uint8_t full[16*9];\
1782 uint8_t halfH[72];\
1783 uint8_t halfHV[64];\
db794953
MN
1784 copy_block9(full, src, 16, stride, 9);\
1785 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1786 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1787 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1788 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1789}\
0c1a9eda
ZK
1790void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1791 uint8_t full[16*9];\
1792 uint8_t halfH[72];\
1793 uint8_t halfV[64];\
1794 uint8_t halfHV[64];\
b3184779
MN
1795 copy_block9(full, src, 16, stride, 9);\
1796 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
db794953
MN
1797 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1798 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1799 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1800}\
0c1a9eda
ZK
1801static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1802 uint8_t full[16*9];\
1803 uint8_t halfH[72];\
1804 uint8_t halfHV[64];\
db794953
MN
1805 copy_block9(full, src, 16, stride, 9);\
1806 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1807 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1808 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1809 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1810}\
0c1a9eda
ZK
1811static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1812 uint8_t halfH[72];\
1813 uint8_t halfHV[64];\
b3184779 1814 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1815 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1816 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
44eb4951 1817}\
0c1a9eda
ZK
1818static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1819 uint8_t halfH[72];\
1820 uint8_t halfHV[64];\
b3184779 1821 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1822 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1823 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
44eb4951 1824}\
0c1a9eda
ZK
1825void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1826 uint8_t full[16*9];\
1827 uint8_t halfH[72];\
1828 uint8_t halfV[64];\
1829 uint8_t halfHV[64];\
b3184779
MN
1830 copy_block9(full, src, 16, stride, 9);\
1831 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1832 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1833 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1834 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1835}\
0c1a9eda
ZK
1836static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1837 uint8_t full[16*9];\
1838 uint8_t halfH[72];\
db794953
MN
1839 copy_block9(full, src, 16, stride, 9);\
1840 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1841 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1842 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1843}\
0c1a9eda
ZK
1844void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1845 uint8_t full[16*9];\
1846 uint8_t halfH[72];\
1847 uint8_t halfV[64];\
1848 uint8_t halfHV[64];\
b3184779
MN
1849 copy_block9(full, src, 16, stride, 9);\
1850 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1851 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1852 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1853 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1854}\
0c1a9eda
ZK
1855static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1856 uint8_t full[16*9];\
1857 uint8_t halfH[72];\
db794953
MN
1858 copy_block9(full, src, 16, stride, 9);\
1859 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1860 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1861 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1862}\
0c1a9eda
ZK
1863static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1864 uint8_t halfH[72];\
b3184779 1865 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1866 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
b3184779 1867}\
0c1a9eda 1868static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1869 OPNAME ## pixels16_c(dst, src, stride, 16);\
b3184779
MN
1870}\
1871\
0c1a9eda
ZK
1872static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1873 uint8_t half[256];\
b3184779
MN
1874 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1875 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1876}\
1877\
0c1a9eda 1878static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1879 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
44eb4951 1880}\
b3184779 1881\
0c1a9eda
ZK
1882static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1883 uint8_t half[256];\
b3184779
MN
1884 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1885 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1886}\
1887\
0c1a9eda
ZK
1888static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1889 uint8_t full[24*17];\
1890 uint8_t half[256];\
b3184779 1891 copy_block17(full, src, 24, stride, 17);\
826f429a 1892 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1893 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1894}\
1895\
0c1a9eda
ZK
1896static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1897 uint8_t full[24*17];\
b3184779 1898 copy_block17(full, src, 24, stride, 17);\
826f429a 1899 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
b3184779
MN
1900}\
1901\
0c1a9eda
ZK
1902static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1903 uint8_t full[24*17];\
1904 uint8_t half[256];\
b3184779 1905 copy_block17(full, src, 24, stride, 17);\
826f429a 1906 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1907 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1908}\
0c1a9eda
ZK
1909void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1910 uint8_t full[24*17];\
1911 uint8_t halfH[272];\
1912 uint8_t halfV[256];\
1913 uint8_t halfHV[256];\
b3184779
MN
1914 copy_block17(full, src, 24, stride, 17);\
1915 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1916 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1917 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1918 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1919}\
0c1a9eda
ZK
1920static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1921 uint8_t full[24*17];\
1922 uint8_t halfH[272];\
1923 uint8_t halfHV[256];\
db794953
MN
1924 copy_block17(full, src, 24, stride, 17);\
1925 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1926 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1927 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1928 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1929}\
0c1a9eda
ZK
1930void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1931 uint8_t full[24*17];\
1932 uint8_t halfH[272];\
1933 uint8_t halfV[256];\
1934 uint8_t halfHV[256];\
b3184779
MN
1935 copy_block17(full, src, 24, stride, 17);\
1936 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1937 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1938 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1939 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1940}\
0c1a9eda
ZK
1941static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1942 uint8_t full[24*17];\
1943 uint8_t halfH[272];\
1944 uint8_t halfHV[256];\
db794953
MN
1945 copy_block17(full, src, 24, stride, 17);\
1946 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1947 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1948 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1949 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1950}\
0c1a9eda
ZK
1951void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1952 uint8_t full[24*17];\
1953 uint8_t halfH[272];\
1954 uint8_t halfV[256];\
1955 uint8_t halfHV[256];\
b3184779
MN
1956 copy_block17(full, src, 24, stride, 17);\
1957 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1958 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1959 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1960 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1961}\
0c1a9eda
ZK
1962static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1963 uint8_t full[24*17];\
1964 uint8_t halfH[272];\
1965 uint8_t halfHV[256];\
db794953
MN
1966 copy_block17(full, src, 24, stride, 17);\
1967 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1968 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1969 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1970 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1971}\
0c1a9eda
ZK
1972void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1973 uint8_t full[24*17];\
1974 uint8_t halfH[272];\
1975 uint8_t halfV[256];\
1976 uint8_t halfHV[256];\
b3184779
MN
1977 copy_block17(full, src, 24, stride, 17);\
1978 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
826f429a
MN
1979 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1980 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1981 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1982}\
0c1a9eda
ZK
1983static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1984 uint8_t full[24*17];\
1985 uint8_t halfH[272];\
1986 uint8_t halfHV[256];\
db794953
MN
1987 copy_block17(full, src, 24, stride, 17);\
1988 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1989 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1990 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1991 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1992}\
0c1a9eda
ZK
1993static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1994 uint8_t halfH[272];\
1995 uint8_t halfHV[256];\
b3184779 1996 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1997 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1998 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1999}\
0c1a9eda
ZK
2000static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2001 uint8_t halfH[272];\
2002 uint8_t halfHV[256];\
b3184779 2003 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 2004 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
2005 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2006}\
0c1a9eda
ZK
2007void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2008 uint8_t full[24*17];\
2009 uint8_t halfH[272];\
2010 uint8_t halfV[256];\
2011 uint8_t halfHV[256];\
b3184779
MN
2012 copy_block17(full, src, 24, stride, 17);\
2013 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
2014 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2015 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
2016 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2017}\
0c1a9eda
ZK
2018static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2019 uint8_t full[24*17];\
2020 uint8_t halfH[272];\
db794953
MN
2021 copy_block17(full, src, 24, stride, 17);\
2022 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2023 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2024 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2025}\
0c1a9eda
ZK
2026void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2027 uint8_t full[24*17];\
2028 uint8_t halfH[272];\
2029 uint8_t halfV[256];\
2030 uint8_t halfHV[256];\
b3184779
MN
2031 copy_block17(full, src, 24, stride, 17);\
2032 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
2033 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2034 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
2035 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2036}\
0c1a9eda
ZK
2037static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2038 uint8_t full[24*17];\
2039 uint8_t halfH[272];\
db794953
MN
2040 copy_block17(full, src, 24, stride, 17);\
2041 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2042 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2043 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2044}\
0c1a9eda
ZK
2045static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2046 uint8_t halfH[272];\
b3184779 2047 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 2048 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
45553457 2049}
44eb4951 2050
b3184779
MN
2051#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2052#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2053#define op_put(a, b) a = cm[((b) + 16)>>5]
2054#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2055
2056QPEL_MC(0, put_ , _ , op_put)
2057QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2058QPEL_MC(0, avg_ , _ , op_avg)
2059//QPEL_MC(1, avg_no_rnd , _ , op_avg)
2060#undef op_avg
2061#undef op_avg_no_rnd
2062#undef op_put
2063#undef op_put_no_rnd
44eb4951 2064
0da71265
MN
2065#if 1
/**
 * Horizontal 6-tap lowpass kernel for an N x N block.
 *
 * Every output sample is built from the taps (1, -5, 20, 20, -5, 1) applied
 * to src[x-2] .. src[x+3]; the raw sum is handed to OP together with the
 * destination lvalue, so normalisation/clipping/storing is up to OP.
 * 'cm' is declared for the benefit of OP and must keep that name.
 */
#define H264_LOWPASS_H(OPNAME, OP, N) \
static void OPNAME ## h264_qpel ## N ## _h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int x, y;\
    for(y=0; y<N; y++){\
        for(x=0; x<N; x++){\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
        }\
        dst += dstStride;\
        src += srcStride;\
    }\
}

/**
 * Vertical 6-tap lowpass kernel for an N x N block.
 *
 * Works column by column: the N+5 source samples of a column (rows -2 to
 * N+2 relative to src) are fetched first, then the N outputs of that column
 * are produced from them with the taps (1, -5, 20, 20, -5, 1) and OP.
 */
#define H264_LOWPASS_V(OPNAME, OP, N) \
static void OPNAME ## h264_qpel ## N ## _v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int x, y;\
    for(x=0; x<N; x++){\
        int tap[N+5]; /* tap[k] holds the sample of row k-2 */\
        for(y=0; y<N+5; y++){\
            tap[y] = src[(y-2)*srcStride];\
        }\
        for(y=0; y<N; y++){\
            OP(dst[y*dstStride], (tap[y+2]+tap[y+3])*20 - (tap[y+1]+tap[y+4])*5 + (tap[y]+tap[y+5]));\
        }\
        dst++;\
        src++;\
    }\
}

/**
 * Combined horizontal + vertical 6-tap lowpass kernel for an N x N block.
 *
 * Pass 1 filters N+5 source rows horizontally, starting two rows above src,
 * and stores the raw (unnormalised) sums in tmp, which therefore needs room
 * for N+5 rows of tmpStride elements.  Pass 2 filters tmp vertically, column
 * by column, and hands the raw sums to OP2.
 */
#define H264_LOWPASS_HV(OPNAME, OP2, N) \
static void OPNAME ## h264_qpel ## N ## _hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int x, y;\
    src -= 2*srcStride;\
    for(y=0; y<N+5; y++){\
        for(x=0; x<N; x++){\
            tmp[x] = (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]);\
        }\
        tmp += tmpStride;\
        src += srcStride;\
    }\
    /* rewind to the intermediate row that corresponds to output row 0 */\
    tmp -= tmpStride*(N+5-2);\
    for(x=0; x<N; x++){\
        int tap[N+5]; /* tap[k] holds the intermediate value of row k-2 */\
        for(y=0; y<N+5; y++){\
            tap[y] = tmp[(y-2)*tmpStride];\
        }\
        for(y=0; y<N; y++){\
            OP2(dst[y*dstStride], (tap[y+2]+tap[y+3])*20 - (tap[y+1]+tap[y+4])*5 + (tap[y]+tap[y+5]));\
        }\
        dst++;\
        tmp++;\
    }\
}

/**
 * Emits the static C lowpass kernels
 *   OPNAME h264_qpel{2,4,8,16}_{h,v,hv}_lowpass
 * for one store flavour.
 *
 * @param OPNAME prefix pasted in front of every generated function name
 * @param OP     store operator used by the h and v kernels
 * @param OP2    store operator used by the second pass of the hv kernels
 *
 * The 16x16 variants are assembled from four 8x8 calls (top-left, top-right,
 * bottom-left, bottom-right); the hv variant reuses the same tmp area for
 * the upper and the lower half.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
H264_LOWPASS_H(OPNAME, OP, 2)\
H264_LOWPASS_V(OPNAME, OP, 2)\
H264_LOWPASS_HV(OPNAME, OP2, 2)\
H264_LOWPASS_H(OPNAME, OP, 4)\
H264_LOWPASS_V(OPNAME, OP, 4)\
H264_LOWPASS_HV(OPNAME, OP2, 4)\
H264_LOWPASS_H(OPNAME, OP, 8)\
H264_LOWPASS_V(OPNAME, OP, 8)\
H264_LOWPASS_HV(OPNAME, OP2, 8)\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *dstLower = dst + 8*dstStride;\
    uint8_t *srcLower = src + 8*srcStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst       , src       , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8     , src+8     , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dstLower  , srcLower  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dstLower+8, srcLower+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *dstLower = dst + 8*dstStride;\
    uint8_t *srcLower = src + 8*srcStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst       , src       , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8     , src+8     , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dstLower  , srcLower  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dstLower+8, srcLower+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    uint8_t *dstLower = dst + 8*dstStride;\
    uint8_t *srcLower = src + 8*srcStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst       , tmp  , src       , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8     , tmp+8, src+8     , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dstLower  , tmp  , srcLower  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dstLower+8, tmp+8, srcLower+8, dstStride, tmpStride, srcStride);\
}

2330#define H264_MC(OPNAME, SIZE) \
2331static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2332 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2333}\
2334\
2335static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2336 uint8_t half[SIZE*SIZE];\
2337 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2338 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2339}\
2340\
2341static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2342 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2343}\
2344\
2345static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2346 uint8_t half[SIZE*SIZE];\
2347 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2348 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2349}\
2350\
2351static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2352 uint8_t full[SIZE*(SIZE+5)];\
2353 uint8_t * const full_mid= full + SIZE*2;\
2354 uint8_t half[SIZE*SIZE];\
2355 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2356 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2357 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2358}\
2359\
2360static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2361 uint8_t full[SIZE*(SIZE+5)];\
2362 uint8_t * const full_mid= full + SIZE*2;\
2363 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2364 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2365}\
2366\
2367static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2368 uint8_t full[SIZE*(SIZE+5)];\
2369 uint8_t * const full_mid= full + SIZE*2;\
2370 uint8_t half[SIZE*SIZE];\
2371 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2372 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2373 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2374}\
2375\
2376static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2377 uint8_t full[SIZE*(SIZE+5)];\
2378 uint8_t * const full_mid= full + SIZE*2;\
2379 uint8_t halfH[SIZE*SIZE];\
2380 uint8_t halfV[SIZE*SIZE];\
2381 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2382 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2383 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2384 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2385}\
2386\
2387static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2388 uint8_t full[SIZE*(SIZE+5)];\
2389 uint8_t * const full_mid= full + SIZE*2;\
2390 uint8_t halfH[SIZE*SIZE];\
2391 uint8_t halfV[SIZE*SIZE];\
2392 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2393 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2394 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2395 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2396}\
2397\
2398static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2399 uint8_t full[SIZE*(SIZE+5)];\
2400 uint8_t * const full_mid= full + SIZE*2;\
2401 uint8_t halfH[SIZE*SIZE];\
2402 uint8_t halfV[SIZE*SIZE];\
2403 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2404 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2405 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2406 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2407}\
2408\
2409static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2410 uint8_t full[SIZE*(SIZE+5)];\
2411 uint8_t * const full_mid= full + SIZE*2;\
2412 uint8_t halfH[SIZE*SIZE];\
2413 uint8_t halfV[SIZE*SIZE];\
2414 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2415 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2416 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2417 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2418}\
2419\
2420static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2421 int16_t tmp[SIZE*(SIZE+5)];\
2422 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2423}\
2424\
2425static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2426 int16_t tmp[SIZE*(SIZE+5)];\
2427 uint8_t halfH[SIZE*SIZE];\
2428 uint8_t halfHV[SIZE*SIZE];\
2429 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2430 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2431 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2432}\
2433\
2434static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2435 int16_t tmp[SIZE*(SIZE+5)];\
2436 uint8_t halfH[SIZE*SIZE];\
2437 uint8_t halfHV[SIZE*SIZE];\
2438 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2439 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2440 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2441}\
2442\
2443static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2444 uint8_t full[SIZE*(SIZE+5)];\
2445 uint8_t * const full_mid= full + SIZE*2;\
2446 int16_t tmp[SIZE*(SIZE+5)];\
2447 uint8_t halfV[SIZE*SIZE];\
2448 uint8_t halfHV[SIZE*SIZE];\
2449 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2450 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2451 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2452 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2453}\
2454\
2455static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2456 uint8_t full[SIZE*(SIZE+5)];\
2457 uint8_t * const full_mid= full + SIZE*2;\
2458 int16_t tmp[SIZE*(SIZE+5)];\
2459 uint8_t halfV[SIZE*SIZE];\
2460 uint8_t halfHV[SIZE*SIZE];\
2461 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2462 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2463 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2464 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2465}\
2466
/*
 * Store primitives handed to H264_LOWPASS below. Each one takes a destination
 * lvalue "a" and a filter sum "b"; "cm" is not defined here and has to be in
 * scope wherever the macro is expanded.
 *  - op_put / op_avg  : round with +16 and shift right by 5 before the cm lookup
 *  - op2_put / op2_avg: round with +512 and shift right by 10 before the cm lookup
 * The *_avg variants average the looked-up value with the current value of
 * "a", rounding up; the *_put variants simply overwrite "a".
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

/* Instantiate the lowpass helpers and the motion compensation functions for
 * each prefix/size pair. Note that only the put_ variant is instantiated for
 * size 2. */
H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

/* The store primitives are local to this section of the file. */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2487#endif
2488
91c56db6
MN
/* Per-pixel kernels of the generated functions below; they use the parameters
 * of the enclosing generated function (block/weight/offset/log2_denom and
 * dst/src/weightd/weights/offset/log2_denom respectively). */
#define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))

/**
 * Generates weight_h264_pixels<W>x<H>_c() and biweight_h264_pixels<W>x<H>_c().
 *
 * weight:   block[x] = clip_uint8((block[x]*weight + offset') >> log2_denom),
 *           where offset' is offset << log2_denom plus a rounding term of
 *           1 << (log2_denom-1) when log2_denom is non-zero.
 * biweight: dst[x] = clip_uint8((src[x]*weights + dst[x]*weightd + offset')
 *                               >> (log2_denom+1)),
 *           where offset' is ((offset + 1) | 1) << log2_denom.
 * Both process W pixels per row for H rows, rows being stride bytes apart.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int row, col; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(row=0; row<H; row++){ \
        for(col=0; col<W; col++){ \
            op_scale1(col); \
        } \
        block += stride; \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int row, col; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(row=0; row<H; row++){ \
        for(col=0; col<W; col++){ \
            op_scale2(col); \
        } \
        dst += stride; \
        src += stride; \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2558
1457ab52
MN
2559static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2560 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2561 int i;
2562
2563 for(i=0; i<h; i++){
2564 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2565 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2566 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2567 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2568 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2569 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2570 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2571 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2572 dst+=dstStride;
115329f1 2573 src+=srcStride;
1457ab52
MN
2574 }
2575}
2576
2577static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2578 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2579 int i;
2580
2581 for(i=0; i<w; i++){
2582 const int src_1= src[ -srcStride];
2583 const int src0 = src[0 ];
2584 const int src1 = src[ srcStride];
2585 const int src2 = src[2*srcStride];
2586 const int src3 = src[3*srcStride];
2587 const int src4 = src[4*srcStride];
2588 const int src5 = src[5*srcStride];
2589 const int src6 = src[6*srcStride];
2590 const int src7 = src[7*srcStride];
2591 const int src8 = src[8*srcStride];
2592 const int src9 = src[9*srcStride];
2593 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2594 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2595 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2596 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2597 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2598 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2599 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2600 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2601 src++;
2602 dst++;
2603 }
2604}
2605
/** No filtering: forwards to put_pixels8_c() for 8 rows. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    const int rows = 8;

    put_pixels8_c(dst, src, stride, rows);
}
2609
/**
 * Combines the source block with its horizontally lowpassed version through
 * put_pixels8_l2() (presumably a per-pixel average; defined outside this view).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2615
/** Writes the horizontally lowpassed 8x8 block straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
2619
/**
 * Same as put_mspel8_mc10_c() except that the unfiltered input of
 * put_pixels8_l2() is the block one pixel to the right (src+1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2625
/** Writes the vertically lowpassed 8 columns straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    const int columns = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, columns);
}
2629
/**
 * Combines (via put_pixels8_l2()) the vertically filtered block with the block
 * filtered in both directions.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[8*11]; /* 11 horizontally filtered rows, starting one row above src */
    uint8_t v_only[8*8];
    uint8_t hv[8*8];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src, 8, stride, 8);
    /* skip the extra top row so that the vertical filter can reach it as row -1 */
    wmv2_mspel8_v_lowpass(hv, h_rows+8, 8, 8, 8);
    put_pixels8_l2(dst, v_only, hv, stride, 8, 8, 8);
}
/**
 * Same as put_mspel8_mc12_c() except that the vertical-only filter runs on the
 * block one pixel to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[8*11]; /* 11 horizontally filtered rows, starting one row above src */
    uint8_t v_only[8*8];
    uint8_t hv[8*8];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src+1, 8, stride, 8);
    /* skip the extra top row so that the vertical filter can reach it as row -1 */
    wmv2_mspel8_v_lowpass(hv, h_rows+8, 8, 8, 8);
    put_pixels8_l2(dst, v_only, hv, stride, 8, 8, 8);
}
/** Horizontal lowpass into a temporary, then vertical lowpass into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[8*11]; /* 11 horizontally filtered rows, starting one row above src */

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    /* skip the extra top row so that the vertical filter can reach it as row -1 */
    wmv2_mspel8_v_lowpass(dst, h_rows+8, stride, 8, 8);
}
2653
332f9ac4
MN
2654static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2655 int x;
2656 const int strength= ff_h263_loop_filter_strength[qscale];
115329f1 2657
332f9ac4
MN
2658 for(x=0; x<8; x++){
2659 int d1, d2, ad1;
2660 int p0= src[x-2*stride];
2661 int p1= src[x-1*stride];
2662 int p2= src[x+0*stride];
2663 int p3= src[x+1*stride];
2664 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2665
2666 if (d<-2*strength) d1= 0;
2667 else if(d<- strength) d1=-2*strength - d;
2668 else if(d< strength) d1= d;
2669 else if(d< 2*strength) d1= 2*strength - d;
2670 else d1= 0;
115329f1 2671
332f9ac4
MN
2672 p1 += d1;
2673 p2 -= d1;
2674 if(p1&256) p1= ~(p1>>31);
2675 if(p2&256) p2= ~(p2>>31);
115329f1 2676
332f9ac4
MN
2677 src[x-1*stride] = p1;
2678 src[x+0*stride] = p2;
2679
5b5404e3 2680 ad1= ABS(d1)>>1;
115329f1 2681
332f9ac4 2682 d2= clip((p0-p3)/4, -ad1, ad1);
115329f1 2683
332f9ac4
MN
2684 src[x-2*stride] = p0 - d2;
2685 src[x+ stride] = p3 + d2;
2686 }
2687}
2688
2689static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2690 int y;
2691 const int strength= ff_h263_loop_filter_strength[qscale];
115329f1 2692
332f9ac4
MN
2693 for(y=0; y<8; y++){
2694 int d1, d2, ad1;
2695 int p0= src[y*stride-2];
2696 int p1= src[y*stride-1];
2697 int p2= src[y*stride+0];
2698 int p3= src[y*stride+1];
2699 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2700
2701 if (d<-2*strength) d1= 0;
2702 else if(d<- strength) d1=-2*strength - d;
2703 else if(d< strength) d1= d;
2704 else if(d< 2*strength) d1= 2*strength - d;
2705 else d1= 0;
115329f1 2706
332f9ac4
MN
2707 p1 += d1;
2708 p2 -= d1;
2709 if(p1&256) p1= ~(p1>>31);
2710 if(p2&256) p2= ~(p2>>31);
115329f1 2711
332f9ac4
MN
2712 src[y*stride-1] = p1;
2713 src[y*stride+0] = p2;
2714
2715 ad1= ABS(d1)>>1;
115329f1 2716
332f9ac4 2717 d2= clip((p0-p3)/4, -ad1, ad1);
115329f1 2718
332f9ac4
MN
2719 src[y*stride-2] = p0 - d2;
2720 src[y*stride+1] = p3 + d2;
2721 }
2722}
1457ab52 2723
fdbbf2e0
MN
/**
 * In-place separable (1,2,1) smoothing of an 8x8 block.
 * The vertical pass leaves the top and bottom rows unfiltered (scaled by 4),
 * the horizontal pass leaves the left and right columns unfiltered.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8*8]; /* result of the vertical pass, scaled by 4 */
    int row, col;

    /* vertical pass: only reads src */
    for(row=0; row<8; row++){
        const uint8_t * const in = src  + row*stride;
        int           * const out= vert + row*8;

        for(col=0; col<8; col++){
            if(row==0 || row==7)
                out[col] = 4*in[col];
            else
                out[col] = in[col - stride] + 2*in[col] + in[col + stride];
        }
    }

    /* horizontal pass: writes the final pixels */
    for(row=0; row<8; row++){
        const int * const in = vert + row*8;
        uint8_t   * const out= src  + row*stride;

        out[0] = (in[0] + 2)>>2;
        out[7] = (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            out[col] = (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2750
/**
 * Filters 16 positions along an edge, in 4 segments of 4 positions each.
 * @param xstride distance between the taps p2,p1,p0 | q0,q1,q2 around the edge
 * @param ystride distance between successive filtered positions
 * @param alpha   threshold for |p0-q0|
 * @param beta    threshold for |p1-p0|, |q1-q0|, |p2-p0| and |q2-q0|
 * @param tc0     one clipping value per segment, a negative value skips the segment
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg, line;

    for( seg = 0; seg < 4; seg++ ) {
        if( tc0[seg] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( line = 0; line < 4; line++, pix += ystride ) {
            const int p2 = pix[-3*xstride];
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];
            const int tc_base = tc0[seg];
            const int mid = ( p0 + q0 + 1 ) >> 1;
            int tc = tc_base;
            int delta;

            if( ABS( p0 - q0 ) >= alpha ||
                ABS( p1 - p0 ) >= beta ||
                ABS( q1 - q0 ) >= beta )
                continue;

            /* each side that is smooth enough gets p1/q1 adjusted and widens tc by one */
            if( ABS( p2 - p0 ) < beta ) {
                pix[-2*xstride] = p1 + clip( (( p2 + mid ) >> 1) - p1, -tc_base, tc_base );
                tc++;
            }
            if( ABS( q2 - q0 ) < beta ) {
                pix[   xstride] = q1 + clip( (( q2 + mid ) >> 1) - q1, -tc_base, tc_base );
                tc++;
            }

            delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
            pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
            pix[0]        = clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/** Luma filter with taps stride bytes apart and positions 1 byte apart. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int tap_step = stride;
    const int pos_step = 1;

    h264_loop_filter_luma_c(pix, tap_step, pos_step, alpha, beta, tc0);
}
/** Luma filter with taps 1 byte apart and positions stride bytes apart. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int tap_step = 1;
    const int pos_step = stride;

    h264_loop_filter_luma_c(pix, tap_step, pos_step, alpha, beta, tc0);
}
2799
/**
 * Filters 8 positions along an edge, in 4 segments of 2 positions each.
 * Only p0 and q0 are modified.
 * @param xstride distance between the taps p1,p0 | q0,q1 around the edge
 * @param ystride distance between successive filtered positions
 * @param tc0     one clipping value per segment, values <= 0 skip the segment
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg, line;

    for( seg = 0; seg < 4; seg++ ) {
        const int tc = tc0[seg];

        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( line = 0; line < 2; line++, pix += ystride ) {
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            int delta;

            if( ABS( p0 - q0 ) >= alpha ||
                ABS( p1 - p0 ) >= beta ||
                ABS( q1 - q0 ) >= beta )
                continue;

            delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

            pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
            pix[0]        = clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/** Chroma filter with taps stride bytes apart and positions 1 byte apart. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int tap_step = stride;
    const int pos_step = 1;

    h264_loop_filter_chroma_c(pix, tap_step, pos_step, alpha, beta, tc0);
}
/** Chroma filter with taps 1 byte apart and positions stride bytes apart. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int tap_step = 1;
    const int pos_step = stride;

    h264_loop_filter_chroma_c(pix, tap_step, pos_step, alpha, beta, tc0);
}
2836
5cf08f23
LM
/**
 * Filters 8 positions along an edge without a clipping table: where the
 * alpha/beta tests pass, p0 and q0 are replaced by (2*p1 + p0 + q1 + 2) >> 2
 * and (2*q1 + q0 + p1 + 2) >> 2.
 * @param xstride distance between the taps p1,p0 | q0,q1 around the edge
 * @param ystride distance between successive filtered positions
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int line;

    for( line = 0; line < 8; line++, pix += ystride ) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( ABS( p0 - q0 ) >= alpha ||
            ABS( p1 - p0 ) >= beta ||
            ABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/** Intra chroma filter with taps stride bytes apart and positions 1 byte apart. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int tap_step = stride;
    const int pos_step = 1;

    h264_loop_filter_chroma_intra_c(pix, tap_step, pos_step, alpha, beta);
}
/** Intra chroma filter with taps 1 byte apart and positions stride bytes apart. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int tap_step = 1;
    const int pos_step = stride;

    h264_loop_filter_chroma_intra_c(pix, tap_step, pos_step, alpha, beta);
}
2864
/**
 * Sum of absolute differences between two 16 pixel wide blocks.
 * @param v         unused
 * @param line_size distance in bytes between rows, shared by both blocks
 * @param h         number of rows
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2892
/**
 * Sum of absolute differences between a 16 pixel wide block and the avg2() of
 * each reference pixel with its right neighbour (17 reference pixels are read
 * per row).
 * @param v unused
 * @param h number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2920
/**
 * Sum of absolute differences between a 16 pixel wide block and the avg2() of
 * each reference pixel with the pixel one row below it (h+1 reference rows are
 * read).
 * @param v unused
 * @param h number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2950
/**
 * Sum of absolute differences between a 16 pixel wide block and the avg4() of
 * each 2x2 neighbourhood of the reference (17 pixels per row and h+1 rows of
 * the reference are read).
 * @param v unused
 * @param h number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2980
/**
 * Sum of absolute differences between two 8 pixel wide blocks.
 * @param v         unused
 * @param line_size distance in bytes between rows, shared by both blocks
 * @param h         number of rows
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3000
/**
 * Sum of absolute differences between an 8 pixel wide block and the avg2() of
 * each reference pixel with its right neighbour (9 reference pixels are read
 * per row).
 * @param v unused
 * @param h number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3020
/**
 * Sum of absolute differences between an 8 pixel wide block and the avg2() of
 * each reference pixel with the pixel one row below it (h+1 reference rows are
 * read).
 * @param v unused
 * @param h number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3042
/**
 * Sum of absolute differences between an 8 pixel wide block and the avg4() of
 * each 2x2 neighbourhood of the reference (9 pixels per row and h+1 rows of
 * the reference are read).
 * @param v unused
 * @param h number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3064
bf4e3bd2
MR
3065static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3066 MpegEncContext *c = v;
e6a2ac34
MN
3067 int score1=0;
3068 int score2=0;
3069 int x,y;
d4c5d2ad 3070
e6a2ac34
MN
3071 for(y=0; y<h; y++){
3072 for(x=0; x<16; x++){
3073 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3074 }
3075 if(y+1<h){
3076 for(x=0; x<15; x++){
3077 score2+= ABS( s1[x ] - s1[x +stride]
3078 - s1[x+1] + s1[x+1+stride])
3079 -ABS( s2[x ] - s2[x +stride]
3080 - s2[x+1] + s2[x+1+stride]);
3081 }
3082 }
3083 s1+= stride;
3084 s2+= stride;
3085 }
d4c5d2ad
MN
3086
3087 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3088 else return score1 + ABS(score2)*8;
e6a2ac34
MN
3089}
3090
bf4e3bd2
MR
3091static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3092 MpegEncContext *c = v;
e6a2ac34
MN
3093 int score1=0;
3094 int score2=0;
3095 int x,y;
115329f1 3096
e6a2ac34
MN
3097 for(y=0; y<h; y++){
3098 for(x=0; x<8; x++){
3099 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3100 }
3101 if(y+1<h){
3102 for(x=0; x<7; x++){
3103 score2+= ABS( s1[x ] - s1[x +stride]
3104 - s1[x+1] + s1[x+1+stride])
3105 -ABS( s2[x ] - s2[x +stride]
3106 - s2[x+1] + s2[x+1+stride]);
3107 }
3108 }
3109 s1+= stride;
3110 s2+= stride;
3111 }
115329f1 3112
d4c5d2ad
MN
3113 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3114 else return score1 + ABS(score2)*8;
e6a2ac34
MN
3115}
3116
364a1797
MN
3117static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3118 int i;
3119 unsigned int sum=0;
3120
3121 for(i=0; i<8*8; i++){
3122 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3123 int w= weight[i];
3124 b>>= RECON_SHIFT;
3125 assert(-512<b && b<512);
3126
3127 sum += (w*b)*(w*b)>>4;
3128 }
3129 return sum>>2;
3130}
3131
3132static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3133 int i;
3134
3135 for(i=0; i<8*8; i++){
3136 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
115329f1 3137 }
364a1797
MN
3138}
3139
a9badb51
MN
3140/**
3141 * permutes an 8x8 block.
2a5700de 3142 * @param block the block which will be permuted according to the given permutation vector
a9badb51
MN
3143 * @param permutation the permutation vector
3144 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
115329f1 3145 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2a5700de 3146 * (inverse) permutated to scantable order!
a9badb51 3147 */
0c1a9eda 3148void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
d962f6fd 3149{
7801d21d 3150 int i;
477ab036 3151 DCTELEM temp[64];
115329f1 3152
7801d21d 3153 if(last<=0) return;
9a7b310d 3154 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
d962f6fd 3155
7801d21d
MN
3156 for(i=0; i<=last; i++){
3157 const int j= scantable[i];
3158 temp[j]= block[j];
3159 block[j]=0;
3160 }
115329f1 3161
7801d21d
MN
3162 for(i=0; i<=last; i++){
3163 const int j= scantable[i];
3164 const int perm_j= permutation[j];
3165 block[perm_j]= temp[j];
3166 }
d962f6fd 3167}
e0eac44e 3168
622348f9
MN
/** Comparison function that ignores all of its arguments and scores every block as 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;
    return 0;
}
3172
3173void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3174 int i;
115329f1 3175
622348f9 3176 memset(cmp, 0, sizeof(void*)*5);
115329f1 3177
622348f9
MN
3178 for(i=0; i<5; i++){
3179 switch(type&0xFF){
3180 case FF_CMP_SAD:
3181 cmp[i]= c->sad[i];
3182 break;
3183 case FF_CMP_SATD:
3184 cmp[i]= c->hadamard8_diff[i];
3185 break;
3186 case FF_CMP_SSE:
3187 cmp[i]= c->sse[i];
3188 break;
3189 case FF_CMP_DCT:
3190 cmp[i]= c->dct_sad[i];
3191 break;
27c61ac5
MN
3192 case FF_CMP_DCT264:
3193 cmp[i]= c->dct264_sad[i];
3194 break;
0fd6aea1
MN
3195 case FF_CMP_DCTMAX:
3196 cmp[i]= c->dct_max[i];
3197 break;
622348f9
MN
3198 case FF_CMP_PSNR:
3199 cmp[i]= c->quant_psnr[i];
3200 break;
3201 case FF_CMP_BIT:
3202 cmp[i]= c->bit[i];
3203 break;
3204 case FF_CMP_RD:
3205 cmp[i]= c->rd[i];
3206 break;
3207 case FF_CMP_VSAD:
3208 cmp[i]= c->vsad[i];
3209 break;
3210 case FF_CMP_VSSE:
3211 cmp[i]= c->vsse[i];
3212 break;
3213 case FF_CMP_ZERO:
3214 cmp[i]= zero_cmp;
3215 break;
e6a2ac34
MN
3216 case FF_CMP_NSSE:
3217 cmp[i]= c->nsse[i];
3218 break;
26efc54e
MN
3219 case FF_CMP_W53:
3220 cmp[i]= c->w53[i];
3221 break;
3222 case FF_CMP_W97:
3223 cmp[i]= c->w97[i];
3224 break;
622348f9
MN
3225 default:
3226 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3227 }
3228 }
3229}
3230
2a5700de
MN
3231/**
3232 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3233 */
eb4b3dd3 3234static void clear_blocks_c(DCTELEM *blocks)
649c00c9
MN
3235{
3236 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3237}
3238
11f18faf
MN
/**
 * Adds the first w bytes of src to dst, byte by byte and modulo 256.
 * Nothing is done for w <= 0.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i] += src[i];
}
3254
/**
 * Stores src1[i] - src2[i] (modulo 256) into dst[i] for the first w bytes.
 * Nothing is done for w <= 0.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i] = src1[i] - src2[i];
}
3270
84705403
MN
/**
 * Writes to dst the residual of src2 against a median predictor.
 * For each position the prediction is mid_pred() of the previous src2 value,
 * the src1 value at the same position and (previous src2 + src1 - previous
 * src1) & 0xFF.
 * @param left     in: value used as "previous src2" for the first position;
 *                 out: last src2 value processed
 * @param left_top in: value used as "previous src1" for the first position;
 *                 out: last src1 value processed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t l = *left;
    uint8_t lt= *left_top;
    int i;

    for(i=0; i<w; i++){
        const int top = src1[i];
        const int pred= mid_pred(l, top, (l + top - lt)&0xFF);

        lt= top;
        l = src2[i];
        dst[i]= l - pred;
    }

    *left    = l;
    *left_top= lt;
}
3288
1457ab52
MN
/* Stores i1+i2 in o1 and i1-i2 in o2.
 * Expands to two separate statements and evaluates i1 and i2 twice, so it must
 * not be used as the unbraced body of an if/for/while. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place variant: replaces x by x+y and y by x-y.
 * The expansion is a block with its own locals named a and b, so the
 * arguments must not refer to identifiers with those names. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y|: the absolute sum of a final butterfly stage, without storing it. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3303
/**
 * Sum of the absolute values of the 8x8 butterfly (Hadamard-style) transform
 * of the difference src - dst.
 * @param s unused
 * @param h must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int i, k;

    assert(h==8);

    /* three butterfly stages along each row of the difference */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        int * const t= temp + 8*i;
        const uint8_t * const sr= src + stride*i;
        const uint8_t * const dr= dst + stride*i;

        for(k=0; k<8; k+=2){
            BUTTERFLY2(t[k], t[k+1], sr[k]-dr[k], sr[k+1]-dr[k+1]);
        }
        for(k=0; k<8; k+=4){
            BUTTERFLY1(t[k  ], t[k+2]);
            BUTTERFLY1(t[k+1], t[k+3]);
        }
        for(k=0; k<4; k++){
            BUTTERFLY1(t[k], t[k+4]);
        }
    }

    /* two butterfly stages down each column; the third one is only summed */
    for(i=0; i<8; i++){
        int * const col= temp + i;

        for(k=0; k<8; k+=2){
            BUTTERFLY1(col[8*k], col[8*(k+1)]);
        }
        for(k=0; k<8; k+=4){
            BUTTERFLY1(col[8*k    ], col[8*(k+2)]);
            BUTTERFLY1(col[8*(k+1)], col[8*(k+3)]);
        }
        for(k=0; k<4; k++){
            sum += BUTTERFLYA(col[8*k], col[8*(k+4)]);
        }
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3355
/**
 * Sum of the absolute values of the 8x8 butterfly (Hadamard-style) transform
 * of src itself, with the contribution of temp[0] + temp[32] removed at the end.
 * @param s     unused
 * @param dummy unused
 * @param h     must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int i, k;

    assert(h==8);

    /* three butterfly stages along each row of the source */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        int * const t= temp + 8*i;
        const uint8_t * const sr= src + stride*i;

        for(k=0; k<8; k+=2){
            BUTTERFLY2(t[k], t[k+1], sr[k], sr[k+1]);
        }
        for(k=0; k<8; k+=4){
            BUTTERFLY1(t[k  ], t[k+2]);
            BUTTERFLY1(t[k+1], t[k+3]);
        }
        for(k=0; k<4; k++){
            BUTTERFLY1(t[k], t[k+4]);
        }
    }

    /* two butterfly stages down each column; the third one is only summed */
    for(i=0; i<8; i++){
        int * const col= temp + i;

        for(k=0; k<8; k+=2){
            BUTTERFLY1(col[8*k], col[8*(k+1)]);
        }
        for(k=0; k<8; k+=4){
            BUTTERFLY1(col[8*k    ], col[8*(k+2)]);
            BUTTERFLY1(col[8*(k+1)], col[8*(k+3)]);
        }
        for(k=0; k<4; k++){
            sum += BUTTERFLYA(col[8*k], col[8*(k+4)]);
        }
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3403
bb198e19 3404static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 3405 MpegEncContext * const s= (MpegEncContext *)c;
68b51e58 3406 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
76fbb024 3407 DCTELEM * const temp= (DCTELEM*)aligned_temp;
1457ab52 3408 int sum=0, i;
115329f1 3409
bb198e19 3410 assert(h==8);
1457ab52
MN
3411
3412 s->dsp.diff_pixels(temp, src1, src2, stride);
b0368839 3413 s->dsp.fdct(temp);
1457ab52
MN
3414
3415 for(i=0; i<64; i++)
3416 sum+= ABS(temp[i]);
115329f1 3417
1457ab52
MN
3418 return sum;
3419}
3420
27c61ac5
MN
3421#ifdef CONFIG_GPL
/**
 * One 8-point pass of an integer DCT built from adds, subtracts and shifts.
 *
 * Expands to a braced block. Before expanding it the user must define
 *   SRC(x)   - expression yielding input sample x (0..7)
 *   DST(x,v) - statement fragment consuming output x with value v
 * All inputs are read before the first DST(), so SRC and DST may refer to
 * the same storage.
 */
#define DCT8_1D {\
    const int x0 = SRC(0);\
    const int x1 = SRC(1);\
    const int x2 = SRC(2);\
    const int x3 = SRC(3);\
    const int x4 = SRC(4);\
    const int x5 = SRC(5);\
    const int x6 = SRC(6);\
    const int x7 = SRC(7);\
    /* sums and differences of mirrored samples */\
    const int sum07 = x0 + x7;\
    const int sum16 = x1 + x6;\
    const int sum25 = x2 + x5;\
    const int sum34 = x3 + x4;\
    const int dif07 = x0 - x7;\
    const int dif16 = x1 - x6;\
    const int dif25 = x2 - x5;\
    const int dif34 = x3 - x4;\
    /* even half, feeds outputs 0, 2, 4, 6 */\
    const int ev0 = sum07 + sum34;\
    const int ev1 = sum16 + sum25;\
    const int ev2 = sum07 - sum34;\
    const int ev3 = sum16 - sum25;\
    /* odd half, feeds outputs 1, 3, 5, 7 */\
    const int od0 = dif16 + dif25 + (dif07 + (dif07>>1));\
    const int od1 = dif07 - dif34 - (dif25 + (dif25>>1));\
    const int od2 = dif07 + dif34 - (dif16 + (dif16>>1));\
    const int od3 = dif16 - dif25 + (dif34 + (dif34>>1));\
    DST(0,  ev0 + ev1     ) ;\
    DST(1,  od0 + (od3>>2)) ;\
    DST(2,  ev2 + (ev3>>1)) ;\
    DST(3,  od1 + (od2>>2)) ;\
    DST(4,  ev0 - ev1     ) ;\
    DST(5,  od2 - (od1>>2)) ;\
    DST(6, (ev2>>1) - ev3 ) ;\
    DST(7, (od0>>2) - od3 ) ;\
}
3448
3449static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3450 MpegEncContext * const s= (MpegEncContext *)c;
3451 int16_t dct[8][8];
3452 int i;
3453 int sum=0;
3454
3455 s->dsp.diff_pixels(dct, src1, src2, stride);
3456
3457#define SRC(x) dct[i][x]
3458#define DST(x,v) dct[i][x]= v
3459 for( i = 0; i < 8; i++ )
3460 DCT8_1D
3461#undef SRC
3462#undef DST
3463
3464#define SRC(x) dct[x][i]
3465#define DST(x,v) sum += ABS(v)
3466 for( i = 0; i < 8; i++ )
3467 DCT8_1D
3468#undef SRC
3469#undef DST
3470 return sum;
3471}
3472#endif
3473
0fd6aea1
MN
3474static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3475 MpegEncContext * const s= (MpegEncContext *)c;
68b51e58 3476 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
0fd6aea1
MN
3477 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3478 int sum=0, i;
115329f1 3479
0fd6aea1
MN
3480 assert(h==8);
3481
3482 s->dsp.diff_pixels(temp, src1, src2, stride);
3483 s->dsp.fdct(temp);
3484
3485 for(i=0; i<64; i++)
3486 sum= FFMAX(sum, ABS(temp[i]));
115329f1 3487
0fd6aea1
MN
3488 return sum;
3489}
3490
0e15384d 3491void simple_idct(DCTELEM *block); //FIXME
1457ab52 3492
bb198e19 3493static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 3494 MpegEncContext * const s= (MpegEncContext *)c;
68b51e58 3495 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
76fbb024
MN
3496 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3497 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
1457ab52
MN
3498 int sum=0, i;
3499
bb198e19 3500 assert(h==8);
1457ab52 3501 s->mb_intra=0;
115329f1 3502
1457ab52 3503 s->dsp.diff_pixels(temp, src1, src2, stride);
115329f1 3504
1457ab52 3505 memcpy(bak, temp, 64*sizeof(DCTELEM));
115329f1 3506
67725183 3507 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
d50635cd 3508 s->dct_unquantize_inter(s, temp, 0, s->qscale);
115329f1
DB
3509 simple_idct(temp); //FIXME
3510
1457ab52
MN
3511 for(i=0; i<64; i++)
3512 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
115329f1 3513
1457ab52
MN
3514 return sum;
3515}
3516
bb198e19 3517static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 3518 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 3519 const uint8_t *scantable= s->intra_scantable.permutated;
68b51e58
SH
3520 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3521 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
76fbb024
MN
3522 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3523 uint8_t * const bak= (uint8_t*)aligned_bak;
3a87ac94
MN
3524 int i, last, run, bits, level, distoration, start_i;
3525 const int esc_length= s->ac_esc_length;
3526 uint8_t * length;
3527 uint8_t * last_length;
115329f1 3528
bb198e19
MN
3529 assert(h==8);
3530
67725183
MN
3531 for(i=0; i<8; i++){
3532 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3533 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3534 }
3a87ac94 3535
67725183
MN
3536 s->dsp.diff_pixels(temp, src1, src2, stride);
3537
3538 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3539
3540 bits=0;
115329f1 3541
3a87ac94 3542 if (s->mb_intra) {
115329f1 3543 start_i = 1;
3a87ac94
MN
3544 length = s->intra_ac_vlc_length;
3545 last_length= s->intra_ac_vlc_last_length;
67725183 3546 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
MN
3547 } else {
3548 start_i = 0;
3549 length = s->inter_ac_vlc_length;
3550 last_length= s->inter_ac_vlc_last_length;
3551 }
115329f1 3552
67725183 3553 if(last>=start_i){
3a87ac94
MN
3554 run=0;
3555 for(i=start_i; i<last; i++){
3556 int j= scantable[i];
3557 level= temp[j];
115329f1 3558
3a87ac94
MN
3559 if(level){
3560 level+=64;
3561 if((level&(~127)) == 0){
3562 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3563 }else
3564 bits+= esc_length;
3565 run=0;
3566 }else
3567 run++;
3568 }
3569 i= scantable[last];
115329f1 3570
3a87ac94 3571 level= temp[i] + 64;
1d0eab1d
MN
3572
3573 assert(level - 64);
115329f1 3574
3a87ac94
MN
3575 if((level&(~127)) == 0){
3576 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3577 }else
3578 bits+= esc_length;
115329f1 3579
67725183
MN
3580 }
3581
3582 if(last>=0){
d50635cd
MN
3583 if(s->mb_intra)
3584 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3585 else
3586 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3a87ac94 3587 }
115329f1 3588
b0368839 3589 s->dsp.idct_add(bak, stride, temp);
115329f1 3590
bb198e19 3591 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3a87ac94 3592
67725183 3593 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3a87ac94
MN
3594}
3595
bb198e19 3596static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 3597 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 3598 const uint8_t *scantable= s->intra_scantable.permutated;
68b51e58 3599 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
76fbb024 3600 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3a87ac94
MN
3601 int i, last, run, bits, level, start_i;
3602 const int esc_length= s->ac_esc_length;
3603 uint8_t * length;
3604 uint8_t * last_length;
bb198e19
MN
3605
3606 assert(h==8);
115329f1 3607
67725183 3608 s->dsp.diff_pixels(temp, src1, src2, stride);
3a87ac94 3609
67725183
MN
3610 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3611
3612 bits=0;
115329f1 3613
3a87ac94 3614 if (s->mb_intra) {
115329f1 3615 start_i = 1;
3a87ac94
MN
3616 length = s->intra_ac_vlc_length;
3617 last_length= s->intra_ac_vlc_last_length;
67725183 3618 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
MN
3619 } else {
3620 start_i = 0;
3621 length = s->inter_ac_vlc_length;
3622 last_length= s->inter_ac_vlc_last_length;
3623 }
115329f1 3624
67725183 3625 if(last>=start_i){
3a87ac94
MN
3626 run=0;
3627 for(i=start_i; i<last; i++){
3628 int j= scantable[i];
3629 level= temp[j];