Incorrect check removed.
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 */
115329f1 22
983e3246
MN
/**
 * @file dsputil.c
 * DSP utils
 */
115329f1 27
de6d9b64
FB
28#include "avcodec.h"
29#include "dsputil.h"
1457ab52 30#include "mpegvideo.h"
b0368839 31#include "simple_idct.h"
65e4c8c9 32#include "faandct.h"
5596c60c 33
88730be6
MR
34/* snow.c */
35void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
36
8b69867f
MN
37uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
38uint32_t squareTbl[512] = {0, };
de6d9b64 39
/* Zigzag scan order for an 8x8 block: entry i is the index (0..63) of the
 * i-th coefficient visited.  Every index appears exactly once. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
50
10acc479
RS
/* Specific zigzag scan for the 2-4-8 IDCT.  NOTE: unlike the
 * specification, the two fields are interleaved here.
 * Entry i is the index (0..63) of the i-th coefficient visited. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
63
2f349de2 64/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
8b69867f 65uint16_t __align8 inv_zigzag_direct16[64] = {0, };
2f349de2 66
/* Alternate (horizontal) coefficient scan order for an 8x8 block:
 * entry i is the index (0..63) of the i-th coefficient visited. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
77
/* Alternate (vertical) coefficient scan order for an 8x8 block:
 * entry i is the index (0..63) of the i-th coefficient visited. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
88
/* Fixed-point reciprocals: for b >= 2, inverse[b] is 2^32 / b rounded up, so
 * that  ((uint64_t)a * inverse[b]) >> 32 == a / b
 * for all 0 <= a <= 65536 and 2 <= b <= 255.
 * The product must be formed in 64 bits.  Entries 0 and 1 are outside the
 * guaranteed range. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
124
b0368839
MN
/* Input permutation for the simple_idct_mmx.
 * The table is a permutation of 0x00..0x3F. */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
136
/**
 * Sum the samples of a 16x16 block.
 *
 * @param pix       pointer to the top-left sample of the block
 * @param line_size offset in bytes from one row to the next
 * @return sum of the 256 samples
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += pix[col];
        pix += line_size;
    }
    return sum;
}
158
0c1a9eda 159static int pix_norm1_c(uint8_t * pix, int line_size)
3aa102be
MN
160{
161 int s, i, j;
0c1a9eda 162 uint32_t *sq = squareTbl + 256;
3aa102be
MN
163
164 s = 0;
165 for (i = 0; i < 16; i++) {
bb270c08 166 for (j = 0; j < 16; j += 8) {
2a006cd3 167#if 0
bb270c08
DB
168 s += sq[pix[0]];
169 s += sq[pix[1]];
170 s += sq[pix[2]];
171 s += sq[pix[3]];
172 s += sq[pix[4]];
173 s += sq[pix[5]];
174 s += sq[pix[6]];
175 s += sq[pix[7]];
2a006cd3
FL
176#else
177#if LONG_MAX > 2147483647
bb270c08
DB
178 register uint64_t x=*(uint64_t*)pix;
179 s += sq[x&0xff];
180 s += sq[(x>>8)&0xff];
181 s += sq[(x>>16)&0xff];
182 s += sq[(x>>24)&0xff];
2a006cd3
FL
183 s += sq[(x>>32)&0xff];
184 s += sq[(x>>40)&0xff];
185 s += sq[(x>>48)&0xff];
186 s += sq[(x>>56)&0xff];
187#else
bb270c08
DB
188 register uint32_t x=*(uint32_t*)pix;
189 s += sq[x&0xff];
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
2a006cd3
FL
193 x=*(uint32_t*)(pix+4);
194 s += sq[x&0xff];
195 s += sq[(x>>8)&0xff];
196 s += sq[(x>>16)&0xff];
197 s += sq[(x>>24)&0xff];
198#endif
199#endif
bb270c08
DB
200 pix += 8;
201 }
202 pix += line_size - 16;
3aa102be
MN
203 }
204 return s;
205}
206
3d2e8cce
MN
/**
 * Byte-swap w 32-bit words from src into dst using bswap_32().
 * Nothing is written when w <= 0.
 *
 * @param dst destination buffer, at least w words
 * @param src source buffer, at least w words
 * @param w   number of 32-bit words to convert
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for (i = 0; i < w; i++)
        dst[i] = bswap_32(src[i]);
}
3aa102be 224
26efc54e
MN
225static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
226{
227 int s, i;
228 uint32_t *sq = squareTbl + 256;
229
230 s = 0;
231 for (i = 0; i < h; i++) {
232 s += sq[pix1[0] - pix2[0]];
233 s += sq[pix1[1] - pix2[1]];
234 s += sq[pix1[2] - pix2[2]];
235 s += sq[pix1[3] - pix2[3]];
236 pix1 += line_size;
237 pix2 += line_size;
238 }
239 return s;
240}
241
bb198e19 242static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
1457ab52
MN
243{
244 int s, i;
0c1a9eda 245 uint32_t *sq = squareTbl + 256;
1457ab52
MN
246
247 s = 0;
bb198e19 248 for (i = 0; i < h; i++) {
1457ab52
MN
249 s += sq[pix1[0] - pix2[0]];
250 s += sq[pix1[1] - pix2[1]];
251 s += sq[pix1[2] - pix2[2]];
252 s += sq[pix1[3] - pix2[3]];
253 s += sq[pix1[4] - pix2[4]];
254 s += sq[pix1[5] - pix2[5]];
255 s += sq[pix1[6] - pix2[6]];
256 s += sq[pix1[7] - pix2[7]];
257 pix1 += line_size;
258 pix2 += line_size;
259 }
260 return s;
261}
262
bb198e19 263static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
9c76bd48 264{
6b026927
FH
265 int s, i;
266 uint32_t *sq = squareTbl + 256;
9c76bd48
BF
267
268 s = 0;
bb198e19 269 for (i = 0; i < h; i++) {
6b026927
FH
270 s += sq[pix1[ 0] - pix2[ 0]];
271 s += sq[pix1[ 1] - pix2[ 1]];
272 s += sq[pix1[ 2] - pix2[ 2]];
273 s += sq[pix1[ 3] - pix2[ 3]];
274 s += sq[pix1[ 4] - pix2[ 4]];
275 s += sq[pix1[ 5] - pix2[ 5]];
276 s += sq[pix1[ 6] - pix2[ 6]];
277 s += sq[pix1[ 7] - pix2[ 7]];
278 s += sq[pix1[ 8] - pix2[ 8]];
279 s += sq[pix1[ 9] - pix2[ 9]];
280 s += sq[pix1[10] - pix2[10]];
281 s += sq[pix1[11] - pix2[11]];
282 s += sq[pix1[12] - pix2[12]];
283 s += sq[pix1[13] - pix2[13]];
284 s += sq[pix1[14] - pix2[14]];
285 s += sq[pix1[15] - pix2[15]];
2a006cd3 286
6b026927
FH
287 pix1 += line_size;
288 pix2 += line_size;
9c76bd48
BF
289 }
290 return s;
291}
292
26efc54e
MN
293
/**
 * Wavelet-domain comparison of two blocks.
 *
 * The pixel differences, scaled by 16, are transformed in place by
 * ff_spatial_dwt(); the absolute values of all resulting coefficients are
 * summed and the total is returned divided by 4.  No per-subband weighting
 * is applied.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size offset in bytes from one row to the next, for both blocks
 * @param w         block width; every caller in view passes 8 or 16
 * @param h         block height, at most 16 (the scratch buffer is 16x16)
 * @param type      passed through to ff_spatial_dwt(); the wrappers pass
 *                  1 for the "53" variants and 0 for the "97" variants
 * @return the score, or 0 when the snow encoder (which provides
 *         ff_spatial_dwt) is not compiled in
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
#ifdef CONFIG_SNOW_ENCODER //idwt is in snow.c
    const int dec_count= w==8 ? 3 : 4;
    int tmp[16*16];
    int s = 0;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++) {
            /* multiply rather than shift: the difference may be negative */
            tmp[16*i+j] = (pix1[j] - pix2[j]) * 16;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++)
            s += ABS(tmp[16*i+j]);
    }
    assert(s>=0);

    return s>>2;
#else
    /* Without the snow encoder there is no transform to call.  Return a
     * defined value instead of falling off the end of a non-void function. */
    return 0;
#endif
}
376
/** 8-wide wavelet comparison: forwards to w_c() with w = 8, type = 1. */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}
380
/** 8-wide wavelet comparison: forwards to w_c() with w = 8, type = 0. */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}
384
/** 16-wide wavelet comparison: forwards to w_c() with w = 16, type = 1. */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
388
/** 16-wide wavelet comparison: forwards to w_c() with w = 16, type = 0. */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
392
0c1a9eda 393static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
de6d9b64 394{
de6d9b64
FB
395 int i;
396
397 /* read the pixels */
de6d9b64 398 for(i=0;i<8;i++) {
c13e1abd
FH
399 block[0] = pixels[0];
400 block[1] = pixels[1];
401 block[2] = pixels[2];
402 block[3] = pixels[3];
403 block[4] = pixels[4];
404 block[5] = pixels[5];
405 block[6] = pixels[6];
406 block[7] = pixels[7];
407 pixels += line_size;
408 block += 8;
de6d9b64
FB
409 }
410}
411
0c1a9eda 412static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
bb270c08 413 const uint8_t *s2, int stride){
9dbcbd92
MN
414 int i;
415
416 /* read the pixels */
9dbcbd92 417 for(i=0;i<8;i++) {
c13e1abd
FH
418 block[0] = s1[0] - s2[0];
419 block[1] = s1[1] - s2[1];
420 block[2] = s1[2] - s2[2];
421 block[3] = s1[3] - s2[3];
422 block[4] = s1[4] - s2[4];
423 block[5] = s1[5] - s2[5];
424 block[6] = s1[6] - s2[6];
425 block[7] = s1[7] - s2[7];
9dbcbd92
MN
426 s1 += stride;
427 s2 += stride;
c13e1abd 428 block += 8;
9dbcbd92
MN
429 }
430}
431
432
0c1a9eda 433static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 434 int line_size)
de6d9b64 435{
de6d9b64 436 int i;
0c1a9eda 437 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 438
de6d9b64 439 /* read the pixels */
de6d9b64 440 for(i=0;i<8;i++) {
c13e1abd
FH
441 pixels[0] = cm[block[0]];
442 pixels[1] = cm[block[1]];
443 pixels[2] = cm[block[2]];
444 pixels[3] = cm[block[3]];
445 pixels[4] = cm[block[4]];
446 pixels[5] = cm[block[5]];
447 pixels[6] = cm[block[6]];
448 pixels[7] = cm[block[7]];
449
450 pixels += line_size;
451 block += 8;
de6d9b64
FB
452 }
453}
454
178fcca8 455static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 456 int line_size)
178fcca8
MN
457{
458 int i;
459 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 460
178fcca8
MN
461 /* read the pixels */
462 for(i=0;i<4;i++) {
463 pixels[0] = cm[block[0]];
464 pixels[1] = cm[block[1]];
465 pixels[2] = cm[block[2]];
466 pixels[3] = cm[block[3]];
467
468 pixels += line_size;
469 block += 8;
470 }
471}
472
9ca358b9 473static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 474 int line_size)
9ca358b9
MN
475{
476 int i;
477 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 478
9ca358b9
MN
479 /* read the pixels */
480 for(i=0;i<2;i++) {
481 pixels[0] = cm[block[0]];
482 pixels[1] = cm[block[1]];
483
484 pixels += line_size;
485 block += 8;
486 }
487}
488
115329f1 489static void put_signed_pixels_clamped_c(const DCTELEM *block,
f9ed9d85
MM
490 uint8_t *restrict pixels,
491 int line_size)
492{
493 int i, j;
494
495 for (i = 0; i < 8; i++) {
496 for (j = 0; j < 8; j++) {
497 if (*block < -128)
498 *pixels = 0;
499 else if (*block > 127)
500 *pixels = 255;
501 else
502 *pixels = (uint8_t)(*block + 128);
503 block++;
504 pixels++;
505 }
506 pixels += (line_size - 8);
507 }
508}
509
0c1a9eda 510static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
c13e1abd 511 int line_size)
de6d9b64 512{
de6d9b64 513 int i;
0c1a9eda 514 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 515
de6d9b64 516 /* read the pixels */
de6d9b64 517 for(i=0;i<8;i++) {
c13e1abd
FH
518 pixels[0] = cm[pixels[0] + block[0]];
519 pixels[1] = cm[pixels[1] + block[1]];
520 pixels[2] = cm[pixels[2] + block[2]];
521 pixels[3] = cm[pixels[3] + block[3]];
522 pixels[4] = cm[pixels[4] + block[4]];
523 pixels[5] = cm[pixels[5] + block[5]];
524 pixels[6] = cm[pixels[6] + block[6]];
525 pixels[7] = cm[pixels[7] + block[7]];
526 pixels += line_size;
527 block += 8;
de6d9b64
FB
528 }
529}
178fcca8
MN
530
531static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
532 int line_size)
533{
534 int i;
535 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 536
178fcca8
MN
537 /* read the pixels */
538 for(i=0;i<4;i++) {
539 pixels[0] = cm[pixels[0] + block[0]];
540 pixels[1] = cm[pixels[1] + block[1]];
541 pixels[2] = cm[pixels[2] + block[2]];
542 pixels[3] = cm[pixels[3] + block[3]];
543 pixels += line_size;
544 block += 8;
545 }
546}
9ca358b9
MN
547
548static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
549 int line_size)
550{
551 int i;
552 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 553
9ca358b9
MN
554 /* read the pixels */
555 for(i=0;i<2;i++) {
556 pixels[0] = cm[pixels[0] + block[0]];
557 pixels[1] = cm[pixels[1] + block[1]];
558 pixels += line_size;
559 block += 8;
560 }
561}
36940eca
LM
562
563static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
564{
565 int i;
566 for(i=0;i<8;i++) {
567 pixels[0] += block[0];
568 pixels[1] += block[1];
569 pixels[2] += block[2];
570 pixels[3] += block[3];
571 pixels[4] += block[4];
572 pixels[5] += block[5];
573 pixels[6] += block[6];
574 pixels[7] += block[7];
575 pixels += line_size;
576 block += 8;
577 }
578}
579
580static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
581{
582 int i;
583 for(i=0;i<4;i++) {
584 pixels[0] += block[0];
585 pixels[1] += block[1];
586 pixels[2] += block[2];
587 pixels[3] += block[3];
588 pixels += line_size;
589 block += 4;
590 }
591}
592
59fe111e
MN
593#if 0
594
/*
 * 64-bit variant of the pixel copy / half-pel interpolation generators.
 * This branch is compiled out (it sits in the "#if 0" half of the
 * conditional around it).
 *
 * PIXOP2(OPNAME, OP) emits 8-pixel-wide functions that process one uint64_t
 * per row and combine the result into the destination with OP(dst, value):
 *   OPNAME_pixels_c              plain copy
 *   OPNAME_[no_rnd_]pixels_x2_c  average with the pixel to the right
 *   OPNAME_[no_rnd_]pixels_y2_c  average with the pixel below
 *   OPNAME_[no_rnd_]pixels_xy2_c average of the 2x2 neighbourhood
 * The per-byte averages are computed on packed words:
 *   (a|b) - (((a^b)&0xFE..FE)>>1)   rounds up,
 *   (a&b) + (((a^b)&0xFE..FE)>>1)   rounds down ("no_rnd").
 * The xy2 variants handle two rows per iteration, so h is expected to be even.
 *
 * The plain copy is named OPNAME_pixels_c so that it matches the name the
 * first CALL_2X_PIXELS line refers to.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* dst = per-byte average of dst and src, rounding up, on a packed 64-bit word */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
735#else // 64 bit variant
736
737#define PIXOP2(OPNAME, OP) \
669ac79c
MN
738static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
739 int i;\
740 for(i=0; i<h; i++){\
741 OP(*((uint16_t*)(block )), LD16(pixels ));\
742 pixels+=line_size;\
743 block +=line_size;\
744 }\
745}\
0da71265
MN
746static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
747 int i;\
748 for(i=0; i<h; i++){\
749 OP(*((uint32_t*)(block )), LD32(pixels ));\
750 pixels+=line_size;\
751 block +=line_size;\
752 }\
753}\
45553457 754static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
59fe111e
MN
755 int i;\
756 for(i=0; i<h; i++){\
757 OP(*((uint32_t*)(block )), LD32(pixels ));\
758 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
759 pixels+=line_size;\
760 block +=line_size;\
761 }\
762}\
45553457
ZK
763static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
764 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
b3184779 765}\
59fe111e 766\
b3184779
MN
767static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
768 int src_stride1, int src_stride2, int h){\
59fe111e
MN
769 int i;\
770 for(i=0; i<h; i++){\
b3184779
MN
771 uint32_t a,b;\
772 a= LD32(&src1[i*src_stride1 ]);\
773 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 774 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
b3184779
MN
775 a= LD32(&src1[i*src_stride1+4]);\
776 b= LD32(&src2[i*src_stride2+4]);\
d8085ea7 777 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
59fe111e
MN
778 }\
779}\
780\
b3184779
MN
781static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
782 int src_stride1, int src_stride2, int h){\
59fe111e
MN
783 int i;\
784 for(i=0; i<h; i++){\
b3184779
MN
785 uint32_t a,b;\
786 a= LD32(&src1[i*src_stride1 ]);\
787 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 788 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
b3184779
MN
789 a= LD32(&src1[i*src_stride1+4]);\
790 b= LD32(&src2[i*src_stride2+4]);\
d8085ea7 791 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
59fe111e
MN
792 }\
793}\
794\
0da71265
MN
795static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
796 int src_stride1, int src_stride2, int h){\
797 int i;\
798 for(i=0; i<h; i++){\
799 uint32_t a,b;\
800 a= LD32(&src1[i*src_stride1 ]);\
801 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 802 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
0da71265
MN
803 }\
804}\
805\
669ac79c
MN
806static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
807 int src_stride1, int src_stride2, int h){\
808 int i;\
809 for(i=0; i<h; i++){\
810 uint32_t a,b;\
811 a= LD16(&src1[i*src_stride1 ]);\
812 b= LD16(&src2[i*src_stride2 ]);\
813 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
814 }\
815}\
816\
b3184779
MN
817static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
818 int src_stride1, int src_stride2, int h){\
819 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
820 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
821}\
822\
823static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
824 int src_stride1, int src_stride2, int h){\
825 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
826 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
827}\
828\
45553457 829static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
830 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
831}\
832\
45553457 833static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
834 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
835}\
836\
45553457 837static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
838 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
839}\
840\
45553457 841static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
842 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
843}\
844\
845static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
846 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
847 int i;\
848 for(i=0; i<h; i++){\
b3184779
MN
849 uint32_t a, b, c, d, l0, l1, h0, h1;\
850 a= LD32(&src1[i*src_stride1]);\
851 b= LD32(&src2[i*src_stride2]);\
852 c= LD32(&src3[i*src_stride3]);\
853 d= LD32(&src4[i*src_stride4]);\
854 l0= (a&0x03030303UL)\
855 + (b&0x03030303UL)\
856 + 0x02020202UL;\
857 h0= ((a&0xFCFCFCFCUL)>>2)\
858 + ((b&0xFCFCFCFCUL)>>2);\
859 l1= (c&0x03030303UL)\
860 + (d&0x03030303UL);\
861 h1= ((c&0xFCFCFCFCUL)>>2)\
862 + ((d&0xFCFCFCFCUL)>>2);\
863 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
864 a= LD32(&src1[i*src_stride1+4]);\
865 b= LD32(&src2[i*src_stride2+4]);\
866 c= LD32(&src3[i*src_stride3+4]);\
867 d= LD32(&src4[i*src_stride4+4]);\
868 l0= (a&0x03030303UL)\
869 + (b&0x03030303UL)\
870 + 0x02020202UL;\
871 h0= ((a&0xFCFCFCFCUL)>>2)\
872 + ((b&0xFCFCFCFCUL)>>2);\
873 l1= (c&0x03030303UL)\
874 + (d&0x03030303UL);\
875 h1= ((c&0xFCFCFCFCUL)>>2)\
876 + ((d&0xFCFCFCFCUL)>>2);\
877 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
878 }\
879}\
669ac79c
MN
880\
881static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
882 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
883}\
884\
885static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
886 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
887}\
888\
889static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
890 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
891}\
892\
893static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
894 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
895}\
896\
b3184779
MN
897static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
898 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
899 int i;\
900 for(i=0; i<h; i++){\
b3184779
MN
901 uint32_t a, b, c, d, l0, l1, h0, h1;\
902 a= LD32(&src1[i*src_stride1]);\
903 b= LD32(&src2[i*src_stride2]);\
904 c= LD32(&src3[i*src_stride3]);\
905 d= LD32(&src4[i*src_stride4]);\
906 l0= (a&0x03030303UL)\
907 + (b&0x03030303UL)\
908 + 0x01010101UL;\
909 h0= ((a&0xFCFCFCFCUL)>>2)\
910 + ((b&0xFCFCFCFCUL)>>2);\
911 l1= (c&0x03030303UL)\
912 + (d&0x03030303UL);\
913 h1= ((c&0xFCFCFCFCUL)>>2)\
914 + ((d&0xFCFCFCFCUL)>>2);\
915 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
916 a= LD32(&src1[i*src_stride1+4]);\
917 b= LD32(&src2[i*src_stride2+4]);\
918 c= LD32(&src3[i*src_stride3+4]);\
919 d= LD32(&src4[i*src_stride4+4]);\
920 l0= (a&0x03030303UL)\
921 + (b&0x03030303UL)\
922 + 0x01010101UL;\
923 h0= ((a&0xFCFCFCFCUL)>>2)\
924 + ((b&0xFCFCFCFCUL)>>2);\
925 l1= (c&0x03030303UL)\
926 + (d&0x03030303UL);\
927 h1= ((c&0xFCFCFCFCUL)>>2)\
928 + ((d&0xFCFCFCFCUL)>>2);\
929 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
930 }\
931}\
b3184779
MN
932static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
933 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
934 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
935 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
936}\
937static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
938 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
939 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
940 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
941}\
59fe111e 942\
669ac79c
MN
943static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
944{\
945 int i, a0, b0, a1, b1;\
946 a0= pixels[0];\
947 b0= pixels[1] + 2;\
948 a0 += b0;\
949 b0 += pixels[2];\
950\
951 pixels+=line_size;\
952 for(i=0; i<h; i+=2){\
953 a1= pixels[0];\
954 b1= pixels[1];\
955 a1 += b1;\
956 b1 += pixels[2];\
957\
958 block[0]= (a1+a0)>>2; /* FIXME non put */\
959 block[1]= (b1+b0)>>2;\
960\
961 pixels+=line_size;\
962 block +=line_size;\
963\
964 a0= pixels[0];\
965 b0= pixels[1] + 2;\
966 a0 += b0;\
967 b0 += pixels[2];\
968\
969 block[0]= (a1+a0)>>2;\
970 block[1]= (b1+b0)>>2;\
971 pixels+=line_size;\
972 block +=line_size;\
973 }\
974}\
975\
976static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
977{\
978 int i;\
979 const uint32_t a= LD32(pixels );\
980 const uint32_t b= LD32(pixels+1);\
981 uint32_t l0= (a&0x03030303UL)\
982 + (b&0x03030303UL)\
983 + 0x02020202UL;\
984 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
985 + ((b&0xFCFCFCFCUL)>>2);\
986 uint32_t l1,h1;\
987\
988 pixels+=line_size;\
989 for(i=0; i<h; i+=2){\
990 uint32_t a= LD32(pixels );\
991 uint32_t b= LD32(pixels+1);\
992 l1= (a&0x03030303UL)\
993 + (b&0x03030303UL);\
994 h1= ((a&0xFCFCFCFCUL)>>2)\
995 + ((b&0xFCFCFCFCUL)>>2);\
996 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
997 pixels+=line_size;\
998 block +=line_size;\
999 a= LD32(pixels );\
1000 b= LD32(pixels+1);\
1001 l0= (a&0x03030303UL)\
1002 + (b&0x03030303UL)\
1003 + 0x02020202UL;\
1004 h0= ((a&0xFCFCFCFCUL)>>2)\
1005 + ((b&0xFCFCFCFCUL)>>2);\
1006 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1007 pixels+=line_size;\
1008 block +=line_size;\
1009 }\
1010}\
1011\
45553457 1012static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
1013{\
1014 int j;\
1015 for(j=0; j<2; j++){\
1016 int i;\
1017 const uint32_t a= LD32(pixels );\
1018 const uint32_t b= LD32(pixels+1);\
1019 uint32_t l0= (a&0x03030303UL)\
1020 + (b&0x03030303UL)\
1021 + 0x02020202UL;\
1022 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1023 + ((b&0xFCFCFCFCUL)>>2);\
1024 uint32_t l1,h1;\
1025\
1026 pixels+=line_size;\
1027 for(i=0; i<h; i+=2){\
1028 uint32_t a= LD32(pixels );\
1029 uint32_t b= LD32(pixels+1);\
1030 l1= (a&0x03030303UL)\
1031 + (b&0x03030303UL);\
1032 h1= ((a&0xFCFCFCFCUL)>>2)\
1033 + ((b&0xFCFCFCFCUL)>>2);\
1034 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1035 pixels+=line_size;\
1036 block +=line_size;\
1037 a= LD32(pixels );\
1038 b= LD32(pixels+1);\
1039 l0= (a&0x03030303UL)\
1040 + (b&0x03030303UL)\
1041 + 0x02020202UL;\
1042 h0= ((a&0xFCFCFCFCUL)>>2)\
1043 + ((b&0xFCFCFCFCUL)>>2);\
1044 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1045 pixels+=line_size;\
1046 block +=line_size;\
1047 }\
1048 pixels+=4-line_size*(h+1);\
1049 block +=4-line_size*h;\
1050 }\
1051}\
1052\
45553457 1053static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
1054{\
1055 int j;\
1056 for(j=0; j<2; j++){\
1057 int i;\
1058 const uint32_t a= LD32(pixels );\
1059 const uint32_t b= LD32(pixels+1);\
1060 uint32_t l0= (a&0x03030303UL)\
1061 + (b&0x03030303UL)\
1062 + 0x01010101UL;\
1063 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1064 + ((b&0xFCFCFCFCUL)>>2);\
1065 uint32_t l1,h1;\
1066\
1067 pixels+=line_size;\
1068 for(i=0; i<h; i+=2){\
1069 uint32_t a= LD32(pixels );\
1070 uint32_t b= LD32(pixels+1);\
1071 l1= (a&0x03030303UL)\
1072 + (b&0x03030303UL);\
1073 h1= ((a&0xFCFCFCFCUL)>>2)\
1074 + ((b&0xFCFCFCFCUL)>>2);\
1075 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1076 pixels+=line_size;\
1077 block +=line_size;\
1078 a= LD32(pixels );\
1079 b= LD32(pixels+1);\
1080 l0= (a&0x03030303UL)\
1081 + (b&0x03030303UL)\
1082 + 0x01010101UL;\
1083 h0= ((a&0xFCFCFCFCUL)>>2)\
1084 + ((b&0xFCFCFCFCUL)>>2);\
1085 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1086 pixels+=line_size;\
1087 block +=line_size;\
1088 }\
1089 pixels+=4-line_size*(h+1);\
1090 block +=4-line_size*h;\
1091 }\
1092}\
1093\
45553457
ZK
1094CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1095CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1096CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1097CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1098CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1099CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1100CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1101CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
b3184779 1102
d8085ea7 1103#define op_avg(a, b) a = rnd_avg32(a, b)
59fe111e 1104#endif
59fe111e
MN
1105#define op_put(a, b) a = b
1106
1107PIXOP2(avg, op_avg)
1108PIXOP2(put, op_put)
1109#undef op_avg
1110#undef op_put
1111
de6d9b64
FB
/*
 * Rounded integer averages: avg2 adds 1 before halving, avg4 adds 2 before
 * dividing by four. Every argument is parenthesized so that expression
 * arguments (shifts, conditionals, bit operations) are combined as a unit
 * instead of being re-associated by operator precedence. Each argument is
 * still evaluated exactly once.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1114
c0a0170c
MN
/**
 * Single-stride entry point for put_no_rnd_pixels16_l2(): the one stride
 * received here is forwarded for all three stride arguments of the worker.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride, stride, stride,
                           h);
}
1118
/**
 * Single-stride entry point for put_no_rnd_pixels8_l2(): the one stride
 * received here is forwarded for all three stride arguments of the worker.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride, stride, stride,
                          h);
}
073b013d 1122
/**
 * Bilinear interpolation of an 8-pixel-wide block, h rows high.
 *
 * Each output pixel blends the source pixel, its right neighbour, the pixel
 * one row below and that pixel's right neighbour. The four weights are
 * products of (16-x16) or x16 with (16-y16) or y16; the weighted sum plus
 * rounder is shifted right by 8.
 *
 * @param dst     destination, advanced by stride per row
 * @param src     source, advanced by stride per row; rows 0..h and columns
 *                0..8 are read
 * @param stride  distance between rows, shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight numerator
 * @param y16     vertical weight numerator
 * @param rounder value added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_tl = (16 - x16) * (16 - y16);
    const int w_tr = x16 * (16 - y16);
    const int w_bl = (16 - x16) * y16;
    const int w_br = x16 * y16;
    int row;

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col]   + w_tr * src[col + 1]
                      + w_bl * below[col] + w_br * below[col + 1] + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1145
/**
 * Motion compensation of an 8-pixel-wide, h-row block where the sampling
 * position changes per pixel.
 *
 * The position starts at (ox, oy). It moves by (dxx, dyx) for each pixel
 * within a row and by (dxy, dyy) from one row to the next. A position is
 * shifted right by 16; the low `shift` bits of the result are the fractional
 * part and the remaining bits are the integer source coordinate.
 *
 * Depending on whether the integer coordinate lies inside the source in x
 * and/or y, the pixel is interpolated in both directions, in one direction
 * only (the other coordinate being clipped), or copied from the clipped
 * position.
 *
 * @param r       rounding term added before the final >> (2*shift)
 * @param width   source width; decremented on entry, then used as the
 *                exclusive bound of the inside test and the upper clip limit
 * @param height  source height; treated like width
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s = 1 << shift;
    int y;

    width--;
    height--;

    for (y = 0; y < h; y++, ox += dxy, oy += dyy) {
        uint8_t *const out = dst + y * stride;
        int vx = ox;
        int vy = oy;
        int x;

        for (x = 0; x < 8; x++, vx += dxx, vy += dyx) { //XXX FIXME optimize
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            const int frac_x = src_x & (s - 1);
            const int frac_y = src_y & (s - 1);
            int x_inside, y_inside;

            src_x >>= shift;
            src_y >>= shift;

            /* the unsigned compare rejects negative coordinates as well */
            x_inside = (unsigned)src_x < width;
            y_inside = (unsigned)src_y < height;

            if (x_inside && y_inside) {
                /* fully inside: interpolate horizontally and vertically */
                const int index  = src_x + src_y * stride;
                const int top    = src[index         ] * (s - frac_x)
                                 + src[index       +1] *      frac_x;
                const int bottom = src[index+stride  ] * (s - frac_x)
                                 + src[index+stride+1] *      frac_x;

                out[x] = (top * (s - frac_y) + bottom * frac_y + r) >> (shift * 2);
            } else if (x_inside) {
                /* row outside: clip y, interpolate horizontally only */
                const int index = src_x + clip(src_y, 0, height) * stride;

                out[x] = ((src[index] * (s - frac_x) + src[index + 1] * frac_x) * s
                          + r) >> (shift * 2);
            } else if (y_inside) {
                /* column outside: clip x, interpolate vertically only */
                const int index = clip(src_x, 0, width) + src_y * stride;

                out[x] = ((src[index] * (s - frac_y) + src[index + stride] * frac_y) * s
                          + r) >> (shift * 2);
            } else {
                /* outside in both directions: copy the clipped sample */
                const int index = clip(src_x, 0, width) + clip(src_y, 0, height) * stride;

                out[x] = src[index];
            }
        }
    }
}
669ac79c
MN
1203
/**
 * Third-pel "put" at the integer position: forwards to the put_pixelsN_c
 * routine matching width (2, 4, 8 or 16). Any other width does nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    if (width == 2) {
        put_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
1212
/**
 * Third-pel "put", position mc10: every output pixel is
 * (683 * (2*cur + right + 1)) >> 11, where 683/2048 approximates 1/3.
 * Covers a width x height area; src and dst both advance by stride per row.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int cur   = src[col];
            const int right = src[col + 1];

            dst[col] = (683 * (2 * cur + right + 1)) >> 11;
        }
    }
}
1223
/**
 * Third-pel "put", position mc20: every output pixel is
 * (683 * (cur + 2*right + 1)) >> 11, where 683/2048 approximates 1/3.
 * Covers a width x height area; src and dst both advance by stride per row.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int cur   = src[col];
            const int right = src[col + 1];

            dst[col] = (683 * (cur + 2 * right + 1)) >> 11;
        }
    }
}
115329f1 1234
669ac79c
MN
/**
 * Third-pel "put", position mc01: every output pixel is
 * (683 * (2*cur + below + 1)) >> 11, where `below` is the pixel one stride
 * further and 683/2048 approximates 1/3.
 * Covers a width x height area; src and dst both advance by stride per row.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int cur   = src[col];
            const int below = src[col + stride];

            dst[col] = (683 * (2 * cur + below + 1)) >> 11;
        }
    }
}
115329f1 1245
669ac79c
MN
/**
 * Third-pel "put", position mc11: every output pixel is
 * (2731 * (4*tl + 3*tr + 3*bl + 2*br + 6)) >> 15 over the 2x2 neighbourhood
 * starting at the current pixel; 2731/32768 approximates 1/12.
 * Covers a width x height area; src and dst both advance by stride per row.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int tl = src[col];
            const int tr = src[col + 1];
            const int bl = src[col + stride];
            const int br = src[col + stride + 1];

            dst[col] = (2731 * (4 * tl + 3 * tr + 3 * bl + 2 * br + 6)) >> 15;
        }
    }
}
1256
/**
 * Third-pel "put", position mc12: every output pixel is
 * (2731 * (3*tl + 2*tr + 4*bl + 3*br + 6)) >> 15 over the 2x2 neighbourhood
 * starting at the current pixel; 2731/32768 approximates 1/12.
 * Covers a width x height area; src and dst both advance by stride per row.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int tl = src[col];
            const int tr = src[col + 1];
            const int bl = src[col + stride];
            const int br = src[col + stride + 1];

            dst[col] = (2731 * (3 * tl + 2 * tr + 4 * bl + 3 * br + 6)) >> 15;
        }
    }
}
1267
/**
 * Third-pel "put", position mc02: every output pixel is
 * (683 * (cur + 2*below + 1)) >> 11, where `below` is the pixel one stride
 * further and 683/2048 approximates 1/3.
 * Covers a width x height area; src and dst both advance by stride per row.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int cur   = src[col];
            const int below = src[col + stride];

            dst[col] = (683 * (cur + 2 * below + 1)) >> 11;
        }
    }
}
1278
/**
 * Third-pel "put", position mc21: every output pixel is
 * (2731 * (3*tl + 4*tr + 2*bl + 3*br + 6)) >> 15 over the 2x2 neighbourhood
 * starting at the current pixel; 2731/32768 approximates 1/12.
 * Covers a width x height area; src and dst both advance by stride per row.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int tl = src[col];
            const int tr = src[col + 1];
            const int bl = src[col + stride];
            const int br = src[col + stride + 1];

            dst[col] = (2731 * (3 * tl + 4 * tr + 2 * bl + 3 * br + 6)) >> 15;
        }
    }
}
1289
/**
 * Third-pel "put", position mc22: every output pixel is
 * (2731 * (2*tl + 3*tr + 3*bl + 4*br + 6)) >> 15 over the 2x2 neighbourhood
 * starting at the current pixel; 2731/32768 approximates 1/12.
 * Covers a width x height area; src and dst both advance by stride per row.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int tl = src[col];
            const int tr = src[col + 1];
            const int bl = src[col + stride];
            const int br = src[col + stride + 1];

            dst[col] = (2731 * (2 * tl + 3 * tr + 3 * bl + 4 * br + 6)) >> 15;
        }
    }
}
da3b9756
MM
1300
/**
 * Third-pel "avg" at the integer position: forwards to the avg_pixelsN_c
 * routine matching width (2, 4, 8 or 16). Any other width does nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    if (width == 2) {
        avg_pixels2_c(dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c(dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c(dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1309
/**
 * Third-pel "avg", position mc10: the interpolated value
 * (683 * (2*cur + right + 1)) >> 11 is averaged, rounding up, with the pixel
 * already present in dst.
 * Covers a width x height area; src and dst both advance by stride per row.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int cur    = src[col];
            const int right  = src[col + 1];
            const int interp = (683 * (2 * cur + right + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1320
/**
 * Third-pel "avg", position mc20: the interpolated value
 * (683 * (cur + 2*right + 1)) >> 11 is averaged, rounding up, with the pixel
 * already present in dst.
 * Covers a width x height area; src and dst both advance by stride per row.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int cur    = src[col];
            const int right  = src[col + 1];
            const int interp = (683 * (cur + 2 * right + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
115329f1 1331
da3b9756
MM
/**
 * Third-pel "avg", position mc01: the interpolated value
 * (683 * (2*cur + below + 1)) >> 11 (`below` = pixel one stride further) is
 * averaged, rounding up, with the pixel already present in dst.
 * Covers a width x height area; src and dst both advance by stride per row.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int cur    = src[col];
            const int below  = src[col + stride];
            const int interp = (683 * (2 * cur + below + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
115329f1 1342
da3b9756
MM
/**
 * Third-pel "avg", position mc11: the interpolated value
 * (2731 * (4*tl + 3*tr + 3*bl + 2*br + 6)) >> 15 is averaged, rounding up,
 * with the pixel already present in dst.
 * Covers a width x height area; src and dst both advance by stride per row.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int tl = src[col];
            const int tr = src[col + 1];
            const int bl = src[col + stride];
            const int br = src[col + stride + 1];
            const int interp = (2731 * (4 * tl + 3 * tr + 3 * bl + 2 * br + 6)) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1353
/**
 * Third-pel "avg", position mc12: the interpolated value
 * (2731 * (3*tl + 2*tr + 4*bl + 3*br + 6)) >> 15 is averaged, rounding up,
 * with the pixel already present in dst.
 * Covers a width x height area; src and dst both advance by stride per row.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int tl = src[col];
            const int tr = src[col + 1];
            const int bl = src[col + stride];
            const int br = src[col + stride + 1];
            const int interp = (2731 * (3 * tl + 2 * tr + 4 * bl + 3 * br + 6)) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1364
/**
 * Third-pel "avg", position mc02: the interpolated value
 * (683 * (cur + 2*below + 1)) >> 11 (`below` = pixel one stride further) is
 * averaged, rounding up, with the pixel already present in dst.
 * Covers a width x height area; src and dst both advance by stride per row.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int cur    = src[col];
            const int below  = src[col + stride];
            const int interp = (683 * (cur + 2 * below + 1)) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1375
/**
 * Third-pel "avg", position mc21: the interpolated value
 * (2731 * (3*tl + 4*tr + 2*bl + 3*br + 6)) >> 15 is averaged, rounding up,
 * with the pixel already present in dst.
 * Covers a width x height area; src and dst both advance by stride per row.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int tl = src[col];
            const int tr = src[col + 1];
            const int bl = src[col + stride];
            const int br = src[col + stride + 1];
            const int interp = (2731 * (3 * tl + 4 * tr + 2 * bl + 3 * br + 6)) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1386
/**
 * Third-pel "avg", position mc22: the interpolated value
 * (2731 * (2*tl + 3*tr + 3*bl + 4*br + 6)) >> 15 is averaged, rounding up,
 * with the pixel already present in dst.
 * Covers a width x height area; src and dst both advance by stride per row.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int tl = src[col];
            const int tr = src[col + 1];
            const int bl = src[col + stride];
            const int br = src[col + stride + 1];
            const int interp = (2731 * (2 * tl + 3 * tr + 3 * bl + 4 * br + 6)) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
669ac79c
MN
#if 0
/*
 * Disabled helper: TPEL_WIDTH(width) would generate one fixed-width wrapper
 * per third-pel position, each forwarding to the generic
 * put_tpel_pixels_mcXY_c routine with `width` filled in.
 * The wrapper bodies are plain calls; a leading `void` would turn them into
 * invalid declarations should this block ever be enabled.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1418
0da71265
MN
/*
 * H264_CHROMA_MC(OPNAME, OP) generates three functions,
 * OPNAME h264_chroma_mc2_c / mc4_c / mc8_c, for blocks 2, 4 and 8 pixels wide.
 *
 * Each output pixel is the weighted sum of the source pixel, its right
 * neighbour and the two pixels one stride below them. The weights are the
 * products of (8-x) or x with (8-y) or y. The raw sum is handed to OP together
 * with the destination pixel; OP decides how it is scaled and stored.
 * x and y must lie in 0..7 (checked with assert).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int wTL=(8-x)*(8-y);\
    const int wTR=(  x)*(8-y);\
    const int wBL=(8-x)*(  y);\
    const int wBR=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<2; col++){\
            OP(dst[col], (wTL*src[col] + wTR*src[col+1] + wBL*src[stride+col] + wBR*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int wTL=(8-x)*(8-y);\
    const int wTR=(  x)*(8-y);\
    const int wBL=(8-x)*(  y);\
    const int wBR=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<4; col++){\
            OP(dst[col], (wTL*src[col] + wTR*src[col+1] + wBL*src[stride+col] + wBR*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int wTL=(8-x)*(8-y);\
    const int wTR=(  x)*(8-y);\
    const int wBL=(8-x)*(  y);\
    const int wBR=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<8; col++){\
            OP(dst[col], (wTL*src[col] + wTR*src[col+1] + wBL*src[stride+col] + wBR*src[stride+col+1]));\
        }\
    }\
}

/* put: round the weighted sum (+32) and scale it down by 64 (>>6).
 * avg: do the same, then average with the existing pixel, rounding up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1489
/**
 * Copies h rows from src to dst, one LD32/ST32 transfer per row
 * (presumably a 32-bit load/store pair; both helpers are defined outside
 * this chunk). dst advances by dstStride and src by srcStride per row.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        ST32(dst, LD32(src));
    }
}
1500
/**
 * Copies h rows from src to dst, two LD32/ST32 transfers per row at byte
 * offsets 0 and 4 (presumably 32-bit each; both helpers are defined outside
 * this chunk). dst advances by dstStride and src by srcStride per row.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        for (off = 0; off < 8; off += 4) {
            ST32(dst+off, LD32(src+off));
        }
    }
}
1512
/**
 * Copies h rows from src to dst, four LD32/ST32 transfers per row at byte
 * offsets 0, 4, 8 and 12 (presumably 32-bit each; both helpers are defined
 * outside this chunk). dst advances by dstStride and src by srcStride per row.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        for (off = 0; off < 16; off += 4) {
            ST32(dst+off, LD32(src+off));
        }
    }
}
073b013d 1526
/**
 * Copies h rows of 17 bytes from src to dst: four LD32/ST32 transfers at
 * byte offsets 0, 4, 8 and 12 (presumably 32-bit each; both helpers are
 * defined outside this chunk) followed by the single byte at index 16.
 * dst advances by dstStride and src by srcStride per row.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        for (off = 0; off < 16; off += 4) {
            ST32(dst+off, LD32(src+off));
        }
        dst[16] = src[16];
    }
}
1541
/**
 * Copies h rows of 9 bytes from src to dst: two LD32/ST32 transfers at byte
 * offsets 0 and 4 (presumably 32-bit each; both helpers are defined outside
 * this chunk) followed by the single byte at index 8.
 * dst advances by dstStride and src by srcStride per row.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        for (off = 0; off < 8; off += 4) {
            ST32(dst+off, LD32(src+off));
        }
        dst[8] = src[8];
    }
}
1554
826f429a 1555
b3184779 1556#define QPEL_MC(r, OPNAME, RND, OP) \
0c1a9eda
ZK
1557static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1558 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779
MN
1559 int i;\
1560 for(i=0; i<h; i++)\
1561 {\
1562 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1563 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1564 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1565 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1566 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1567 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1568 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1569 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1570 dst+=dstStride;\
1571 src+=srcStride;\
1572 }\
44eb4951
MN
1573}\
1574\
0c1a9eda 1575static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
db794953 1576 const int w=8;\
0c1a9eda 1577 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779
MN
1578 int i;\
1579 for(i=0; i<w; i++)\
1580 {\
1581 const int src0= src[0*srcStride];\
1582 const int src1= src[1*srcStride];\
1583 const int src2= src[2*srcStride];\
1584 const int src3= src[3*srcStride];\
1585 const int src4= src[4*srcStride];\
1586 const int src5= src[5*srcStride];\
1587 const int src6= src[6*srcStride];\
1588 const int src7= src[7*srcStride];\
1589 const int src8= src[8*srcStride];\
1590 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1591 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1592 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1593 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1594 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1595 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1596 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1597 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1598 dst++;\
1599 src++;\
1600 }\
1601}\
1602\
0c1a9eda
ZK
1603static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1604 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779 1605 int i;\
826f429a 1606 \
b3184779
MN
1607 for(i=0; i<h; i++)\
1608 {\
1609 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1610 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1611 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1612 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1613 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1614 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1615 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1616 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1617 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1618 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1619 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1620 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1621 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1622 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1623 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1624 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1625 dst+=dstStride;\
1626 src+=srcStride;\
1627 }\
1628}\
1629\
0c1a9eda
ZK
1630static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1631 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779 1632 int i;\
826f429a 1633 const int w=16;\
b3184779
MN
1634 for(i=0; i<w; i++)\
1635 {\
1636 const int src0= src[0*srcStride];\
1637 const int src1= src[1*srcStride];\
1638 const int src2= src[2*srcStride];\
1639 const int src3= src[3*srcStride];\
1640 const int src4= src[4*srcStride];\
1641 const int src5= src[5*srcStride];\
1642 const int src6= src[6*srcStride];\
1643 const int src7= src[7*srcStride];\
1644 const int src8= src[8*srcStride];\
1645 const int src9= src[9*srcStride];\
1646 const int src10= src[10*srcStride];\
1647 const int src11= src[11*srcStride];\
1648 const int src12= src[12*srcStride];\
1649 const int src13= src[13*srcStride];\
1650 const int src14= src[14*srcStride];\
1651 const int src15= src[15*srcStride];\
1652 const int src16= src[16*srcStride];\
1653 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1654 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1655 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1656 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1657 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1658 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1659 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1660 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1661 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1662 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1663 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1664 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1665 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1666 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1667 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1668 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1669 dst++;\
1670 src++;\
1671 }\
1672}\
1673\
0c1a9eda 1674static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1675 OPNAME ## pixels8_c(dst, src, stride, 8);\
b3184779
MN
1676}\
1677\
0c1a9eda
ZK
1678static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1679 uint8_t half[64];\
b3184779
MN
1680 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1681 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
44eb4951
MN
1682}\
1683\
0c1a9eda 1684static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1685 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
44eb4951
MN
1686}\
1687\
0c1a9eda
ZK
1688static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1689 uint8_t half[64];\
b3184779
MN
1690 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1691 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
44eb4951
MN
1692}\
1693\
0c1a9eda
ZK
1694static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1695 uint8_t full[16*9];\
1696 uint8_t half[64];\
b3184779 1697 copy_block9(full, src, 16, stride, 9);\
db794953 1698 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1699 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
44eb4951
MN
1700}\
1701\
0c1a9eda
ZK
1702static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1703 uint8_t full[16*9];\
b3184779 1704 copy_block9(full, src, 16, stride, 9);\
db794953 1705 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
44eb4951
MN
1706}\
1707\
0c1a9eda
ZK
1708static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1709 uint8_t full[16*9];\
1710 uint8_t half[64];\
b3184779 1711 copy_block9(full, src, 16, stride, 9);\
db794953 1712 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1713 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
44eb4951 1714}\
0c1a9eda
ZK
1715void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1716 uint8_t full[16*9];\
1717 uint8_t halfH[72];\
1718 uint8_t halfV[64];\
1719 uint8_t halfHV[64];\
b3184779
MN
1720 copy_block9(full, src, 16, stride, 9);\
1721 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1722 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1723 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1724 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1725}\
0c1a9eda
ZK
1726static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1727 uint8_t full[16*9];\
1728 uint8_t halfH[72];\
1729 uint8_t halfHV[64];\
db794953
MN
1730 copy_block9(full, src, 16, stride, 9);\
1731 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1732 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1733 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1734 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1735}\
0c1a9eda
ZK
1736void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1737 uint8_t full[16*9];\
1738 uint8_t halfH[72];\
1739 uint8_t halfV[64];\
1740 uint8_t halfHV[64];\
b3184779
MN
1741 copy_block9(full, src, 16, stride, 9);\
1742 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1743 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1744 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1745 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1746}\
0c1a9eda
ZK
1747static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1748 uint8_t full[16*9];\
1749 uint8_t halfH[72];\
1750 uint8_t halfHV[64];\
db794953
MN
1751 copy_block9(full, src, 16, stride, 9);\
1752 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1753 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1754 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1755 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1756}\
0c1a9eda
ZK
1757void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1758 uint8_t full[16*9];\
1759 uint8_t halfH[72];\
1760 uint8_t halfV[64];\
1761 uint8_t halfHV[64];\
b3184779
MN
1762 copy_block9(full, src, 16, stride, 9);\
1763 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1764 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1765 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1766 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1767}\
0c1a9eda
ZK
1768static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1769 uint8_t full[16*9];\
1770 uint8_t halfH[72];\
1771 uint8_t halfHV[64];\
db794953
MN
1772 copy_block9(full, src, 16, stride, 9);\
1773 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1774 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1775 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1776 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1777}\
0c1a9eda
ZK
1778void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1779 uint8_t full[16*9];\
1780 uint8_t halfH[72];\
1781 uint8_t halfV[64];\
1782 uint8_t halfHV[64];\
b3184779
MN
1783 copy_block9(full, src, 16, stride, 9);\
1784 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
db794953
MN
1785 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1786 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1787 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1788}\
0c1a9eda
ZK
1789static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1790 uint8_t full[16*9];\
1791 uint8_t halfH[72];\
1792 uint8_t halfHV[64];\
db794953
MN
1793 copy_block9(full, src, 16, stride, 9);\
1794 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1795 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1796 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1797 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1798}\
0c1a9eda
ZK
1799static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1800 uint8_t halfH[72];\
1801 uint8_t halfHV[64];\
b3184779 1802 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1803 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1804 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
44eb4951 1805}\
0c1a9eda
ZK
1806static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1807 uint8_t halfH[72];\
1808 uint8_t halfHV[64];\
b3184779 1809 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1810 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1811 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
44eb4951 1812}\
0c1a9eda
ZK
1813void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1814 uint8_t full[16*9];\
1815 uint8_t halfH[72];\
1816 uint8_t halfV[64];\
1817 uint8_t halfHV[64];\
b3184779
MN
1818 copy_block9(full, src, 16, stride, 9);\
1819 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1820 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1821 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1822 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1823}\
0c1a9eda
ZK
1824static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1825 uint8_t full[16*9];\
1826 uint8_t halfH[72];\
db794953
MN
1827 copy_block9(full, src, 16, stride, 9);\
1828 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1829 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1830 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1831}\
0c1a9eda
ZK
1832void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1833 uint8_t full[16*9];\
1834 uint8_t halfH[72];\
1835 uint8_t halfV[64];\
1836 uint8_t halfHV[64];\
b3184779
MN
1837 copy_block9(full, src, 16, stride, 9);\
1838 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1839 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1840 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1841 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1842}\
0c1a9eda
ZK
1843static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1844 uint8_t full[16*9];\
1845 uint8_t halfH[72];\
db794953
MN
1846 copy_block9(full, src, 16, stride, 9);\
1847 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1848 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1849 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1850}\
0c1a9eda
ZK
1851static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1852 uint8_t halfH[72];\
b3184779 1853 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1854 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
b3184779 1855}\
0c1a9eda 1856static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1857 OPNAME ## pixels16_c(dst, src, stride, 16);\
b3184779
MN
1858}\
1859\
0c1a9eda
ZK
1860static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1861 uint8_t half[256];\
b3184779
MN
1862 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1863 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1864}\
1865\
0c1a9eda 1866static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1867 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
44eb4951 1868}\
b3184779 1869\
0c1a9eda
ZK
1870static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1871 uint8_t half[256];\
b3184779
MN
1872 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1873 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1874}\
1875\
0c1a9eda
ZK
1876static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1877 uint8_t full[24*17];\
1878 uint8_t half[256];\
b3184779 1879 copy_block17(full, src, 24, stride, 17);\
826f429a 1880 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1881 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1882}\
1883\
0c1a9eda
ZK
1884static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1885 uint8_t full[24*17];\
b3184779 1886 copy_block17(full, src, 24, stride, 17);\
826f429a 1887 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
b3184779
MN
1888}\
1889\
0c1a9eda
ZK
1890static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1891 uint8_t full[24*17];\
1892 uint8_t half[256];\
b3184779 1893 copy_block17(full, src, 24, stride, 17);\
826f429a 1894 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1895 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1896}\
0c1a9eda
ZK
1897void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1898 uint8_t full[24*17];\
1899 uint8_t halfH[272];\
1900 uint8_t halfV[256];\
1901 uint8_t halfHV[256];\
b3184779
MN
1902 copy_block17(full, src, 24, stride, 17);\
1903 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1904 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1905 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1906 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1907}\
0c1a9eda
ZK
1908static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1909 uint8_t full[24*17];\
1910 uint8_t halfH[272];\
1911 uint8_t halfHV[256];\
db794953
MN
1912 copy_block17(full, src, 24, stride, 17);\
1913 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1914 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1915 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1916 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1917}\
0c1a9eda
ZK
1918void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1919 uint8_t full[24*17];\
1920 uint8_t halfH[272];\
1921 uint8_t halfV[256];\
1922 uint8_t halfHV[256];\
b3184779
MN
1923 copy_block17(full, src, 24, stride, 17);\
1924 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1925 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1926 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1927 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1928}\
0c1a9eda
ZK
1929static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1930 uint8_t full[24*17];\
1931 uint8_t halfH[272];\
1932 uint8_t halfHV[256];\
db794953
MN
1933 copy_block17(full, src, 24, stride, 17);\
1934 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1935 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1936 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1937 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1938}\
0c1a9eda
ZK
1939void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1940 uint8_t full[24*17];\
1941 uint8_t halfH[272];\
1942 uint8_t halfV[256];\
1943 uint8_t halfHV[256];\
b3184779
MN
1944 copy_block17(full, src, 24, stride, 17);\
1945 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1946 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1947 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1948 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1949}\
0c1a9eda
ZK
1950static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1951 uint8_t full[24*17];\
1952 uint8_t halfH[272];\
1953 uint8_t halfHV[256];\
db794953
MN
1954 copy_block17(full, src, 24, stride, 17);\
1955 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1956 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1957 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1958 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1959}\
0c1a9eda
ZK
1960void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1961 uint8_t full[24*17];\
1962 uint8_t halfH[272];\
1963 uint8_t halfV[256];\
1964 uint8_t halfHV[256];\
b3184779
MN
1965 copy_block17(full, src, 24, stride, 17);\
1966 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
826f429a
MN
1967 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1968 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1969 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1970}\
0c1a9eda
ZK
1971static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1972 uint8_t full[24*17];\
1973 uint8_t halfH[272];\
1974 uint8_t halfHV[256];\
db794953
MN
1975 copy_block17(full, src, 24, stride, 17);\
1976 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1977 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1978 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1979 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1980}\
0c1a9eda
ZK
1981static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1982 uint8_t halfH[272];\
1983 uint8_t halfHV[256];\
b3184779 1984 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1985 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1986 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1987}\
0c1a9eda
ZK
1988static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1989 uint8_t halfH[272];\
1990 uint8_t halfHV[256];\
b3184779 1991 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1992 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1993 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1994}\
0c1a9eda
ZK
1995void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1996 uint8_t full[24*17];\
1997 uint8_t halfH[272];\
1998 uint8_t halfV[256];\
1999 uint8_t halfHV[256];\
b3184779
MN
2000 copy_block17(full, src, 24, stride, 17);\
2001 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
2002 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2003 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
2004 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2005}\
0c1a9eda
ZK
2006static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2007 uint8_t full[24*17];\
2008 uint8_t halfH[272];\
db794953
MN
2009 copy_block17(full, src, 24, stride, 17);\
2010 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2011 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2012 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2013}\
0c1a9eda
ZK
2014void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2015 uint8_t full[24*17];\
2016 uint8_t halfH[272];\
2017 uint8_t halfV[256];\
2018 uint8_t halfHV[256];\
b3184779
MN
2019 copy_block17(full, src, 24, stride, 17);\
2020 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
2021 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2022 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
2023 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2024}\
0c1a9eda
ZK
2025static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2026 uint8_t full[24*17];\
2027 uint8_t halfH[272];\
db794953
MN
2028 copy_block17(full, src, 24, stride, 17);\
2029 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2030 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2031 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2032}\
0c1a9eda
ZK
2033static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2034 uint8_t halfH[272];\
b3184779 2035 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 2036 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
45553457 2037}
44eb4951 2038
/*
 * Store operators handed to QPEL_MC as its OP argument.
 *
 * Each one takes a destination lvalue `a` and an unnormalised filter sum `b`.
 * The sum is scaled down by 32 (>>5) and mapped through `cm`, a lookup table
 * pointer that must be in scope at the point of expansion (presumably the
 * clamping table cropTbl + MAX_NEG_CROP -- confirm against the lowpass
 * helpers that expand these macros).
 *
 *   op_put         round to nearest (+16) and store.
 *   op_put_no_rnd  bias of +15 instead of +16, then store.
 *   op_avg         like op_put, then average with the existing value of `a`,
 *                  rounding up (+1 before >>1).
 *   op_avg_no_rnd  like op_put_no_rnd, then average with `a` without the +1.
 *
 * Note that `a` is evaluated twice by the averaging variants.
 */
#define op_avg(a, b)        a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b)        a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2043
2044QPEL_MC(0, put_ , _ , op_put)
2045QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2046QPEL_MC(0, avg_ , _ , op_avg)
2047//QPEL_MC(1, avg_no_rnd , _ , op_avg)
2048#undef op_avg
2049#undef op_avg_no_rnd
2050#undef op_put
2051#undef op_put_no_rnd
44eb4951 2052
0da71265
MN
2053#if 1
/**
 * Generates the six-tap (1, -5, 20, 20, -5, 1) lowpass filter helpers used by
 * the h264_qpel motion compensation functions, for 4, 8 and 16 pixel blocks.
 *
 * @param OPNAME prefix pasted onto every generated function name
 * @param OP     store operator for single-pass results: OP(lvalue, sum) where
 *               sum is the unnormalised filter output (tap weights add to 32)
 * @param OP2    store operator for two-pass (hv) results, whose sums carry
 *               both passes' gain (32 * 32)
 *
 * Generated functions (N = 4, 8, 16):
 *   OPNAME h264_qpelN_h_lowpass(dst, src, dstStride, srcStride)
 *       filters each of N rows horizontally; reads src[-2 .. N+2] per row.
 *   OPNAME h264_qpelN_v_lowpass(dst, src, dstStride, srcStride)
 *       filters each of N columns vertically; reads rows -2 .. N+2.
 *   OPNAME h264_qpelN_hv_lowpass(dst, tmp, src, dstStride, tmpStride, srcStride)
 *       horizontal pass into the int16_t scratch plane tmp (N+5 rows of at
 *       least N entries, tmpStride apart), then a vertical pass over tmp.
 *
 * Every function sets up `cm = cropTbl + MAX_NEG_CROP` for use by OP/OP2;
 * both names are defined elsewhere in this file.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    /* One column per iteration: load the 9 vertical taps once, emit 4 rows. */\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    /* Start two rows above the block: the vertical pass needs rows -2..h+2. */\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* Rewind tmp to the scratch row that corresponds to block row 0. */\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    /* One column per iteration: load the 13 vertical taps once, emit 8 rows. */\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    /* Start two rows above the block: the vertical pass needs rows -2..h+2. */\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* Rewind tmp to the scratch row that corresponds to block row 0. */\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
/* The 16x16 variants tile four 8x8 calls: top-left, top-right, then the */\
/* bottom pair after advancing src and dst by eight rows. */\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    /* tmp is pure scratch, so the lower half reuses the same rows. */\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}

/**
 * Generates the sixteen h264_qpel<SIZE>_mcXY_c motion compensation functions
 * for one block size.
 *
 * @param OPNAME prefix of the generated functions and of the pixels<SIZE>_c /
 *               pixels<SIZE>_l2 helpers used for the final store
 * @param SIZE   block width and height (instantiated in this file with 4, 8
 *               and 16)
 *
 * In mcXY, X selects the horizontal and Y the vertical position in quarter
 * steps, as can be read off the bodies: X=2 / Y=2 use the lowpass output
 * directly, while odd values combine (via pixels<SIZE>_l2) a lowpass plane
 * with either the source samples or a second lowpass plane.
 *
 * Intermediate planes are always produced with the put_ lowpass helpers;
 * OPNAME only affects the last step that writes dst.
 *
 * Vertical filtering works on `full`, a packed copy (row stride SIZE) of
 * SIZE+5 source rows starting two rows above the block, made with
 * copy_block<SIZE>; `full_mid` points at the copy of block row 0.
 * copy_block<SIZE> and pixels<SIZE>_l2 are defined elsewhere in this file.
 *
 * All functions share the signature (uint8_t *dst, uint8_t *src, int stride),
 * with a single stride for both dst and src.
 */
#define H264_MC(OPNAME, SIZE) \
/* mc00: integer position, plain copy/average of the source block. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* mc10: source combined with the horizontal lowpass plane. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
/* mc20: horizontal lowpass written straight to dst. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* mc30: like mc10 but combined with the source shifted one pixel right. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
/* mc01: source combined with the vertical lowpass plane. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc02: vertical lowpass written straight to dst. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
/* mc03: like mc01 but combined with the source shifted one row down. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc11/mc31/mc13/mc33: horizontal plane combined with vertical plane. */\
/* The horizontal plane is taken at row 0 (Y=1) or row 1 (Y=3), the */\
/* vertical plane at column 0 (X=1) or column 1 (X=3). */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc22: two-pass (horizontal then vertical) lowpass straight to dst. */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
/* mc21/mc23: two-pass plane combined with the horizontal plane taken at */\
/* row 0 (mc21) or row 1 (mc23). */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
/* mc12/mc32: two-pass plane combined with the vertical plane taken at */\
/* column 0 (mc12) or column 1 (mc32). */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}

/*
 * Store operators handed to H264_LOWPASS.
 *
 * `a` is the destination lvalue, `b` the unnormalised filter sum; `cm` must
 * be a lookup table pointer in scope at the point of expansion (the lowpass
 * helpers set it to cropTbl + MAX_NEG_CROP).
 *
 *   op_put / op_avg    single-pass sums: round (+16) and divide by 32 (>>5).
 *   op2_put / op2_avg  two-pass sums: round (+512) and divide by 1024 (>>10).
 *
 * The *_avg forms additionally average the result with the current value of
 * `a`, rounding up, and therefore evaluate `a` twice.
 */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]
2398
2399H264_LOWPASS(put_ , op_put, op2_put)
2400H264_LOWPASS(avg_ , op_avg, op2_avg)
2401H264_MC(put_, 4)
2402H264_MC(put_, 8)
2403H264_MC(put_, 16)
2404H264_MC(avg_, 4)
2405H264_MC(avg_, 8)
2406H264_MC(avg_, 16)
2407
2408#undef op_avg
2409#undef op_put
2410#undef op2_avg
2411#undef op2_put
2412#endif
2413
91c56db6
MN
/*
 * H.264 weighted prediction, generated for every supported block size.
 *
 * op_scale1(x): in-place single-reference weighting of block[x].
 * op_scale2(x): bidirectional weighting, combining src[x] and dst[x] into dst[x].
 * Both store their result through clip_uint8().
 *
 * H264_WEIGHT(W,H) emits
 *   weight_h264_pixelsWxH_c(block, stride, log2_denom, weight, offset)
 *   biweight_h264_pixelsWxH_c(dst, src, stride, log2_denom,
 *                             weightd, weights, offsetd, offsets)
 * W and H are compile-time constants, so the inner loops have constant bounds.
 *
 * The offsets may be negative; they are scaled by multiplication rather than
 * by "<<" because left-shifting a negative int is undefined behaviour in C.
 */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int x, y; \
    offset *= 1 << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        for(x=0; x<W; x++){ \
            op_scale1(x); \
        } \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
    int x, y; \
    int offset = (offsets + offsetd + 1) >> 1; \
    offset = (offset*2 + 1) * (1 << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        for(x=0; x<W; x++){ \
            op_scale2(x); \
        } \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2484
1457ab52
MN
2485static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2486 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2487 int i;
2488
2489 for(i=0; i<h; i++){
2490 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2491 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2492 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2493 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2494 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2495 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2496 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2497 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2498 dst+=dstStride;
115329f1 2499 src+=srcStride;
1457ab52
MN
2500 }
2501}
2502
2503static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2504 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2505 int i;
2506
2507 for(i=0; i<w; i++){
2508 const int src_1= src[ -srcStride];
2509 const int src0 = src[0 ];
2510 const int src1 = src[ srcStride];
2511 const int src2 = src[2*srcStride];
2512 const int src3 = src[3*srcStride];
2513 const int src4 = src[4*srcStride];
2514 const int src5 = src[5*srcStride];
2515 const int src6 = src[6*srcStride];
2516 const int src7 = src[7*srcStride];
2517 const int src8 = src[8*srcStride];
2518 const int src9 = src[9*srcStride];
2519 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2520 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2521 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2522 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2523 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2524 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2525 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2526 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2527 src++;
2528 dst++;
2529 }
2530}
2531
/** mspel position (0,0): plain 8x8 copy via put_pixels8_c(). */
static void put_mspel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
2535
/**
 * mspel position (1,0): the horizontally filtered block is combined with the
 * unshifted source by put_pixels8_l2() (presumably a rounding average).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2541
/** mspel position (2,0): horizontal filter written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2545
/**
 * mspel position (3,0): like mc10, but the horizontally filtered block is
 * combined with the source shifted one pixel to the right (src+1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2551
/** mspel position (0,2): vertical filter written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2555
/**
 * mspel position (1,2): combines the vertically filtered source with the
 * block filtered in both directions.
 * rows_h holds 11 horizontally filtered rows starting one row above src, so
 * rows_h+8 lines up with the first row of src and the vertical pass may read
 * one row above and two rows below the 8 output rows.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[11*8];
    uint8_t vert[8*8];
    uint8_t both[8*8];

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, rows_h+8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mspel position (3,2): same as mc12, except that the purely vertical pass
 * runs on the source shifted one pixel to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[11*8];
    uint8_t vert[8*8];
    uint8_t both[8*8];

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, rows_h+8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mspel position (2,2): horizontal filter over 11 rows (starting one row
 * above src) followed by the vertical filter, written straight to dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[11*8];

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rows_h+8, stride, 8, 8);
}
2579
332f9ac4
MN
2580static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2581 int x;
2582 const int strength= ff_h263_loop_filter_strength[qscale];
115329f1 2583
332f9ac4
MN
2584 for(x=0; x<8; x++){
2585 int d1, d2, ad1;
2586 int p0= src[x-2*stride];
2587 int p1= src[x-1*stride];
2588 int p2= src[x+0*stride];
2589 int p3= src[x+1*stride];
2590 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2591
2592 if (d<-2*strength) d1= 0;
2593 else if(d<- strength) d1=-2*strength - d;
2594 else if(d< strength) d1= d;
2595 else if(d< 2*strength) d1= 2*strength - d;
2596 else d1= 0;
115329f1 2597
332f9ac4
MN
2598 p1 += d1;
2599 p2 -= d1;
2600 if(p1&256) p1= ~(p1>>31);
2601 if(p2&256) p2= ~(p2>>31);
115329f1 2602
332f9ac4
MN
2603 src[x-1*stride] = p1;
2604 src[x+0*stride] = p2;
2605
5b5404e3 2606 ad1= ABS(d1)>>1;
115329f1 2607
332f9ac4 2608 d2= clip((p0-p3)/4, -ad1, ad1);
115329f1 2609
332f9ac4
MN
2610 src[x-2*stride] = p0 - d2;
2611 src[x+ stride] = p3 + d2;
2612 }
2613}
2614
2615static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2616 int y;
2617 const int strength= ff_h263_loop_filter_strength[qscale];
115329f1 2618
332f9ac4
MN
2619 for(y=0; y<8; y++){
2620 int d1, d2, ad1;
2621 int p0= src[y*stride-2];
2622 int p1= src[y*stride-1];
2623 int p2= src[y*stride+0];
2624 int p3= src[y*stride+1];
2625 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2626
2627 if (d<-2*strength) d1= 0;
2628 else if(d<- strength) d1=-2*strength - d;
2629 else if(d< strength) d1= d;
2630 else if(d< 2*strength) d1= 2*strength - d;
2631 else d1= 0;
115329f1 2632
332f9ac4
MN
2633 p1 += d1;
2634 p2 -= d1;
2635 if(p1&256) p1= ~(p1>>31);
2636 if(p2&256) p2= ~(p2>>31);
115329f1 2637
332f9ac4
MN
2638 src[y*stride-1] = p1;
2639 src[y*stride+0] = p2;
2640
2641 ad1= ABS(d1)>>1;
115329f1 2642
332f9ac4 2643 d2= clip((p0-p3)/4, -ad1, ad1);
115329f1 2644
332f9ac4
MN
2645 src[y*stride-2] = p0 - d2;
2646 src[y*stride+1] = p3 + d2;
2647 }
2648}
1457ab52 2649
fdbbf2e0
MN
/**
 * In-place separable (1,2,1) smoothing of an 8x8 block.
 * The vertical pass is stored at 4x scale in a temporary; the first and last
 * row skip the vertical pass (source times 4). The horizontal pass then
 * filters the interior columns (total scale 16), while the first and last
 * column only undo the vertical scale.
 * @param src    top-left pixel of the 8x8 block
 * @param stride distance between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8][8];
    int row, col;

    for(col=0; col<8; col++){
        vert[0][col]= 4*src[col];
        vert[7][col]= 4*src[col + 7*stride];
    }
    for(row=1; row<7; row++){
        const uint8_t *line= src + row*stride;
        for(col=0; col<8; col++)
            vert[row][col]= line[col - stride] + 2*line[col] + line[col + stride];
    }

    for(row=0; row<8; row++){
        uint8_t *line= src + row*stride;
        line[0]= (vert[row][0] + 2)>>2;
        line[7]= (vert[row][7] + 2)>>2;
        for(col=1; col<7; col++)
            line[col]= (vert[row][col-1] + 2*vert[row][col] + vert[row][col+1] + 8)>>4;
    }
}
2676
/**
 * H.264 luma deblocking of one 16-sample edge, handled as 4 segments of
 * 4 lines each.
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge (p samples at negative multiples)
 * @param ystride step along the edge, from one line to the next
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0|, |q1 - q0|, |p2 - p0|, |q2 - q0|
 * @param tc0     one clipping value per segment; a negative value skips the
 *                whole segment
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* each side that is smooth enough gets its p1/q1 adjusted
                 * and widens the clipping range of the p0/q0 update by one */
                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( ABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                /* "* 4" rather than "<< 2": q0 - p0 may be negative and
                 * left-shifting a negative int is undefined behaviour */
                i_delta = clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/** Luma deblocking with xstride=stride, ystride=1: samples are compared across rows. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = stride;
    const int along_edge  = 1;

    h264_loop_filter_luma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
/** Luma deblocking with xstride=1, ystride=stride: samples are compared across columns. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = 1;
    const int along_edge  = stride;

    h264_loop_filter_luma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
2725
/**
 * H.264 chroma deblocking of one 8-sample edge, handled as 4 segments of
 * 2 lines each. Only p0 and q0 are modified.
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge
 * @param ystride step along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 * @param tc0     one clipping value per segment; values <= 0 skip the segment
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                /* "* 4" rather than "<< 2": q0 - p0 may be negative and
                 * left-shifting a negative int is undefined behaviour */
                int delta = clip( (((q0 - p0 ) * 4) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/** Chroma deblocking with xstride=stride, ystride=1: samples are compared across rows. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = stride;
    const int along_edge  = 1;

    h264_loop_filter_chroma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
/** Chroma deblocking with xstride=1, ystride=stride: samples are compared across columns. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    const int across_edge = 1;
    const int along_edge  = stride;

    h264_loop_filter_chroma_c(pix, across_edge, along_edge, alpha, beta, tc0);
}
2762
5cf08f23
LM
/**
 * H.264 chroma deblocking for intra edges: 8 lines, no tc clipping.
 * On every line where |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta,
 * p0 and q0 are replaced by (2*p1 + p0 + q1 + 2) >> 2 and
 * (2*q1 + q0 + p1 + 2) >> 2 respectively.
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge
 * @param ystride step along the edge
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int line;

    for( line = 0; line < 8; line++, pix += ystride ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];
        const int step_edge = p0 - q0;
        const int step_p    = p1 - p0;
        const int step_q    = q1 - q0;

        /* |v| < t written as v < t && -v < t */
        if( step_edge >= alpha || -step_edge >= alpha )
            continue;
        if( step_p >= beta || -step_p >= beta )
            continue;
        if( step_q >= beta || -step_q >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/** Intra chroma deblocking with xstride=stride, ystride=1: samples are compared across rows. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across_edge = stride;
    const int along_edge  = 1;

    h264_loop_filter_chroma_intra_c(pix, across_edge, along_edge, alpha, beta);
}
/** Intra chroma deblocking with xstride=1, ystride=stride: samples are compared across columns. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    const int across_edge = 1;
    const int along_edge  = stride;

    h264_loop_filter_chroma_intra_c(pix, across_edge, along_edge, alpha, beta);
}
2790
/**
 * Sum of absolute differences between two blocks 16 pixels wide.
 * @param v         unused
 * @param line_size distance between rows, the same for both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 16*h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2818
/**
 * SAD of a 16-wide block against a reference where each pixel is
 * avg2() of pix2[x] and pix2[x+1]; reads 17 reference pixels per row.
 * @param v unused
 * @param h number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2846
/**
 * SAD of a 16-wide block against a reference where each pixel is
 * avg2() of pix2[x] and the pixel one row below it; reads h+1 reference rows.
 * @param v unused
 * @param h number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2876
/**
 * SAD of a 16-wide block against a reference where each pixel is avg4() of
 * the 2x2 neighbourhood starting at pix2[x]; reads 17 pixels from each of
 * h+1 reference rows.
 * @param v unused
 * @param h number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2906
/**
 * Sum of absolute differences between two blocks 8 pixels wide.
 * @param v         unused
 * @param line_size distance between rows, the same for both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 8*h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2926
/**
 * SAD of an 8-wide block against a reference where each pixel is
 * avg2() of pix2[x] and pix2[x+1]; reads 9 reference pixels per row.
 * @param v unused
 * @param h number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2946
/**
 * SAD of an 8-wide block against a reference where each pixel is
 * avg2() of pix2[x] and the pixel one row below it; reads h+1 reference rows.
 * @param v unused
 * @param h number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2968
/**
 * SAD of an 8-wide block against a reference where each pixel is avg4() of
 * the 2x2 neighbourhood starting at pix2[x]; reads 9 pixels from each of
 * h+1 reference rows.
 * @param v unused
 * @param h number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2990
bf4e3bd2
MR
2991static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2992 MpegEncContext *c = v;
e6a2ac34
MN
2993 int score1=0;
2994 int score2=0;
2995 int x,y;
d4c5d2ad 2996
e6a2ac34
MN
2997 for(y=0; y<h; y++){
2998 for(x=0; x<16; x++){
2999 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3000 }
3001 if(y+1<h){
3002 for(x=0; x<15; x++){
3003 score2+= ABS( s1[x ] - s1[x +stride]
3004 - s1[x+1] + s1[x+1+stride])
3005 -ABS( s2[x ] - s2[x +stride]
3006 - s2[x+1] + s2[x+1+stride]);
3007 }
3008 }
3009 s1+= stride;
3010 s2+= stride;
3011 }
d4c5d2ad
MN
3012
3013 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3014 else return score1 + ABS(score2)*8;
e6a2ac34
MN
3015}
3016
bf4e3bd2
MR
3017static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3018 MpegEncContext *c = v;
e6a2ac34
MN
3019 int score1=0;
3020 int score2=0;
3021 int x,y;
115329f1 3022
e6a2ac34
MN
3023 for(y=0; y<h; y++){
3024 for(x=0; x<8; x++){
3025 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3026 }
3027 if(y+1<h){
3028 for(x=0; x<7; x++){
3029 score2+= ABS( s1[x ] - s1[x +stride]
3030 - s1[x+1] + s1[x+1+stride])
3031 -ABS( s2[x ] - s2[x +stride]
3032 - s2[x+1] + s2[x+1+stride]);
3033 }
3034 }
3035 s1+= stride;
3036 s2+= stride;
3037 }
115329f1 3038
d4c5d2ad
MN
3039 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3040 else return score1 + ABS(score2)*8;
e6a2ac34
MN
3041}
3042
364a1797
MN
3043static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3044 int i;
3045 unsigned int sum=0;
3046
3047 for(i=0; i<8*8; i++){
3048 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3049 int w= weight[i];
3050 b>>= RECON_SHIFT;
3051 assert(-512<b && b<512);
3052
3053 sum += (w*b)*(w*b)>>4;
3054 }
3055 return sum>>2;
3056}
3057
3058static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3059 int i;
3060
3061 for(i=0; i<8*8; i++){
3062 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
115329f1 3063 }
364a1797
MN
3064}
3065
a9badb51
MN
3066/**
3067 * permutes an 8x8 block.
2a5700de 3068 * @param block the block which will be permuted according to the given permutation vector
a9badb51
MN
3069 * @param permutation the permutation vector
3070 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
115329f1 3071 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2a5700de 3072 * (inverse) permutated to scantable order!
a9badb51 3073 */
0c1a9eda 3074void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
d962f6fd 3075{
7801d21d 3076 int i;
477ab036 3077 DCTELEM temp[64];
115329f1 3078
7801d21d 3079 if(last<=0) return;
9a7b310d 3080 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
d962f6fd 3081
7801d21d
MN
3082 for(i=0; i<=last; i++){
3083 const int j= scantable[i];
3084 temp[j]= block[j];
3085 block[j]=0;
3086 }
115329f1 3087
7801d21d
MN
3088 for(i=0; i<=last; i++){
3089 const int j= scantable[i];
3090 const int perm_j= permutation[j];
3091 block[perm_j]= temp[j];
3092 }
d962f6fd 3093}
e0eac44e 3094
622348f9
MN
/** Comparison function that ignores all of its arguments and always scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
3098
3099void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3100 int i;
115329f1 3101
622348f9 3102 memset(cmp, 0, sizeof(void*)*5);
115329f1 3103
622348f9
MN
3104 for(i=0; i<5; i++){
3105 switch(type&0xFF){
3106 case FF_CMP_SAD:
3107 cmp[i]= c->sad[i];
3108 break;
3109 case FF_CMP_SATD:
3110 cmp[i]= c->hadamard8_diff[i];
3111 break;
3112 case FF_CMP_SSE:
3113 cmp[i]= c->sse[i];
3114 break;
3115 case FF_CMP_DCT:
3116 cmp[i]= c->dct_sad[i];
3117 break;
27c61ac5
MN
3118 case FF_CMP_DCT264:
3119 cmp[i]= c->dct264_sad[i];
3120 break;
0fd6aea1
MN
3121 case FF_CMP_DCTMAX:
3122 cmp[i]= c->dct_max[i];
3123 break;
622348f9
MN
3124 case FF_CMP_PSNR:
3125 cmp[i]= c->quant_psnr[i];
3126 break;
3127 case FF_CMP_BIT:
3128 cmp[i]= c->bit[i];
3129 break;
3130 case FF_CMP_RD:
3131 cmp[i]= c->rd[i];
3132 break;
3133 case FF_CMP_VSAD:
3134 cmp[i]= c->vsad[i];
3135 break;
3136 case FF_CMP_VSSE:
3137 cmp[i]= c->vsse[i];
3138 break;
3139 case FF_CMP_ZERO:
3140 cmp[i]= zero_cmp;
3141 break;
e6a2ac34
MN
3142 case FF_CMP_NSSE:
3143 cmp[i]= c->nsse[i];
3144 break;
26efc54e
MN
3145 case FF_CMP_W53:
3146 cmp[i]= c->w53[i];
3147 break;
3148 case FF_CMP_W97:
3149 cmp[i]= c->w97[i];
3150 break;
622348f9
MN
3151 default:
3152 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3153 }
3154 }
3155}
3156
2a5700de
MN
3157/**
3158 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3159 */
eb4b3dd3 3160static void clear_blocks_c(DCTELEM *blocks)
649c00c9
MN
3161{
3162 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3163}
3164
11f18faf
MN
/**
 * Adds src to dst byte by byte (dst[i] += src[i]) for w bytes; sums wrap
 * modulo 256 through the uint8_t store.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int pos= 0;

    while(pos < w){
        dst[pos] += src[pos];
        pos++;
    }
}
3180
/**
 * Stores the byte-wise difference src1[i] - src2[i] into dst for w bytes;
 * differences wrap modulo 256 through the uint8_t store.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int pos= 0;

    while(pos < w){
        dst[pos] = src1[pos]-src2[pos];
        pos++;
    }
}
3196
84705403
MN
/**
 * Writes the residual of a median predictor to dst.
 * For every position the prediction is mid_pred() of the left value, the
 * value from src1 and (left + src1 - left_top) & 0xFF; dst receives
 * src2[i] minus that prediction.
 * @param src1     supplies the "top" values (it becomes left_top for the next position)
 * @param src2     supplies the values being predicted (it becomes left for the next position)
 * @param left     in: initial left value, out: last value of src2 processed
 * @param left_top in: initial top-left value, out: last value of src1 processed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left= *left;
    uint8_t cur_left_top= *left_top;
    int pos;

    for(pos=0; pos<w; pos++){
        const int pred= mid_pred(cur_left, src1[pos], (cur_left + src1[pos] - cur_left_top)&0xFF);

        cur_left_top= src1[pos];
        cur_left= src2[pos];
        dst[pos]= cur_left - pred;
    }

    *left= cur_left;
    *left_top= cur_left_top;
}
3214
1457ab52
MN
/* o1 = i1+i2, o2 = i1-i2; expands to two statements, inputs are evaluated twice */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* in-place butterfly: x becomes x+y and y becomes x-y (both from the old values) */
#define BUTTERFLY1(x,y) \
{\
    int bf_a= x;\
    int bf_b= y;\
    x= bf_a+bf_b;\
    y= bf_a-bf_b;\
}

/* |x+y| + |x-y|: magnitude sum of a final butterfly stage that is not stored */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3229
/**
 * Sum of the absolute values of the 8x8 butterfly (Hadamard-type) transform
 * of the difference src - dst.
 * The first loop transforms each row of the difference, the second loop each
 * column; the last column stage is folded into the summation by BUTTERFLYA.
 * @param s unused
 * @param h must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    for(i=0; i<8; i++){
        int * const row= temp + 8*i;
        const uint8_t * const sp= src + stride*i;
        const uint8_t * const dp= dst + stride*i;

        BUTTERFLY2(row[0], row[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    for(i=0; i<8; i++){
        int * const col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }
    return sum;
}
3281
/**
 * Sum of the absolute values of the 8x8 butterfly (Hadamard-type) transform
 * of src itself, with the magnitude of the DC term (temp[0] + temp[32] after
 * the stored column stages) subtracted at the end.
 * @param s     unused
 * @param dummy unused
 * @param h     must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    for(i=0; i<8; i++){
        int * const row= temp + 8*i;
        const uint8_t * const sp= src + stride*i;

        BUTTERFLY2(row[0], row[1], sp[0], sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2], sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4], sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6], sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    for(i=0; i<8; i++){
        int * const col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3329
bb198e19 3330static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 3331 MpegEncContext * const s= (MpegEncContext *)c;
76fbb024
MN
3332 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3333 DCTELEM * const temp= (DCTELEM*)aligned_temp;
1457ab52 3334 int sum=0, i;
115329f1 3335
bb198e19 3336 assert(h==8);
1457ab52
MN
3337
3338 s->dsp.diff_pixels(temp, src1, src2, stride);
b0368839 3339 s->dsp.fdct(temp);
1457ab52
MN
3340
3341 for(i=0; i<64; i++)
3342 sum+= ABS(temp[i]);
115329f1 3343
1457ab52
MN
3344 return sum;
3345}
3346
27c61ac5
MN
#ifdef CONFIG_GPL
/*
 * One 8-point integer transform pass (presumably the H.264 8x8 transform,
 * going by the function name below). The caller defines SRC(x) to read
 * input x and DST(x,v) to consume output x with value v.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute coefficients of the DCT8_1D transform applied to the 8x8
 * difference src1 - src2: first along each row (stored back into dct), then
 * along each column (accumulated directly, not stored).
 * @param c MpegEncContext pointer
 * @param h unused
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    /* pass a pointer to the first element: diff_pixels takes a flat
     * coefficient pointer, not a pointer to an array of 8
     * (assumes DCTELEM is a 16-bit type compatible with int16_t -- confirm) */
    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += ABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3399
0fd6aea1
MN
3400static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3401 MpegEncContext * const s= (MpegEncContext *)c;
3402 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3403 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3404 int sum=0, i;
115329f1 3405
0fd6aea1
MN
3406 assert(h==8);
3407
3408 s->dsp.diff_pixels(temp, src1, src2, stride);
3409 s->dsp.fdct(temp);
3410
3411 for(i=0; i<64; i++)
3412 sum= FFMAX(sum, ABS(temp[i]));
115329f1 3413
0fd6aea1
MN
3414 return sum;
3415}
3416
0e15384d 3417void simple_idct(DCTELEM *block); //FIXME
1457ab52 3418
bb198e19 3419static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 3420 MpegEncContext * const s= (MpegEncContext *)c;
76fbb024
MN
3421 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
3422 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3423 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
1457ab52
MN
3424 int sum=0, i;
3425
bb198e19 3426 assert(h==8);
1457ab52 3427 s->mb_intra=0;
115329f1 3428
1457ab52 3429 s->dsp.diff_pixels(temp, src1, src2, stride);
115329f1 3430
1457ab52 3431 memcpy(bak, temp, 64*sizeof(DCTELEM));
115329f1 3432
67725183 3433 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
d50635cd 3434 s->dct_unquantize_inter(s, temp, 0, s->qscale);
115329f1
DB
3435 simple_idct(temp); //FIXME
3436
1457ab52
MN
3437 for(i=0; i<64; i++)
3438 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
115329f1 3439
1457ab52
MN
3440 return sum;
3441}
3442
bb198e19 3443static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 3444 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 3445 const uint8_t *scantable= s->intra_scantable.permutated;
76fbb024
MN
3446 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3447 uint64_t __align8 aligned_bak[stride];
3448 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3449 uint8_t * const bak= (uint8_t*)aligned_bak;
3a87ac94
MN
3450 int i, last, run, bits, level, distoration, start_i;
3451 const int esc_length= s->ac_esc_length;
3452 uint8_t * length;
3453 uint8_t * last_length;
115329f1 3454
bb198e19
MN
3455 assert(h==8);
3456
67725183
MN
3457 for(i=0; i<8; i++){
3458 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3459 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3460 }
3a87ac94 3461
67725183
MN
3462 s->dsp.diff_pixels(temp, src1, src2, stride);
3463
3464 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3465
3466 bits=0;
115329f1 3467
3a87ac94 3468 if (s->mb_intra) {
115329f1 3469 start_i = 1;
3a87ac94
MN
3470 length = s->intra_ac_vlc_length;
3471 last_length= s->intra_ac_vlc_last_length;
67725183 3472 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
MN
3473 } else {
3474 start_i = 0;
3475 length = s->inter_ac_vlc_length;
3476 last_length= s->inter_ac_vlc_last_length;
3477 }
115329f1 3478
67725183 3479 if(last>=start_i){
3a87ac94
MN
3480 run=0;
3481 for(i=start_i; i<last; i++){
3482 int j= scantable[i];
3483 level= temp[j];
115329f1 3484
3a87ac94
MN
3485 if(level){
3486 level+=64;
3487 if((level&(~127)) == 0){
3488 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3489 }else
3490 bits+= esc_length;
3491 run=0;
3492 }else
3493 run++;
3494 }
3495 i= scantable[last];
115329f1 3496
3a87ac94 3497 level= temp[i] + 64;
1d0eab1d
MN
3498
3499 assert(level - 64);
115329f1 3500
3a87ac94
MN
3501 if((level&(~127)) == 0){
3502 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3503 }else
3504 bits+= esc_length;
115329f1 3505
67725183
MN
3506 }
3507
3508 if(last>=0){
d50635cd
MN
3509 if(s->mb_intra)
3510 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3511 else
3512 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3a87ac94 3513 }
115329f1 3514
b0368839 3515 s->dsp.idct_add(bak, stride, temp);
115329f1 3516
bb198e19 3517 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3a87ac94 3518
67725183 3519 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3a87ac94
MN
3520}
3521
bb198e19 3522static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 3523 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 3524 const uint8_t *scantable= s->intra_scantable.permutated;
76fbb024
MN
3525 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3526 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3a87ac94
MN
3527 int i, last, run, bits, level, start_i;
3528 const int esc_length= s->ac_esc_length;
3529 uint8_t * length;
3530 uint8_t * last_length;
bb198e19
MN
3531
3532 assert(h==8);
115329f1 3533
67725183 3534 s->dsp.diff_pixels(temp, src1, src2, stride);
3a87ac94 3535
67725183
MN
3536 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3537
3538 bits=0;
115329f1 3539
3a87ac94 3540 if (s->mb_intra) {
115329f1 3541 start_i = 1;
3a87ac94
MN
3542 length = s->intra_ac_vlc_length;
3543 last_length= s->intra_ac_vlc_last_length;
67725183 3544 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
MN
3545 } else {
3546 start_i = 0;
3547 length = s->inter_ac_vlc_length;
3548 last_length= s->inter_ac_vlc_last_length;
3549 }
115329f1 3550
67725183 3551 if(last>=start_i){
3a87ac94
MN
3552 run=0;
3553 for(i=start_i; i<last; i++){
3554 int j= scantable[i];
3555 level= temp[j];
115329f1 3556
3a87ac94
MN
3557 if(level){
3558 level+=64;
3559 if((level&(~127)) == 0){
3560 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3561 }else
3562 bits+= esc_length;
3563 run=0;
3564 }else
3565 run++;
3566 }
3567 i= scantable[last];
115329f1 3568
67725183 3569 level= temp[i] + 64;
115329f1 3570
67725183 3571 assert(level - 64);
115329f1 3572
3a87ac94
MN
3573 if((level&(~127)) == 0){
3574 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3575 }else
3576 bits+= esc_length;
3577 }
3578
3579 return bits;
3580}
3581
622348f9
MN
/**
 * Sum of absolute differences between vertically adjacent pixels of a
 * 16 pixel wide block.
 *
 * @param c      unused
 * @param s      top-left pixel of the block
 * @param dummy  unused
 * @param stride distance in bytes between two consecutive lines
 * @param h      number of lines; h-1 line pairs are compared
 * @return accumulated absolute difference (0 when h <= 1)
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total= 0;
    int row, col;

    for(row=1; row<h; row++, s+= stride){
        const uint8_t * const below= s + stride;

        for(col=0; col<16; col++)
            total+= ABS(s[col] - below[col]);
    }

    return total;
}
3596
/**
 * Sum of absolute vertical gradients of the difference s1 - s2 over a
 * 16 pixel wide block: for each pair of adjacent lines, the difference
 * on the lower line is subtracted from the difference on the upper line.
 *
 * @param c      unused
 * @param s1     top-left pixel of the first block
 * @param s2     top-left pixel of the second block
 * @param stride distance in bytes between two consecutive lines (both blocks)
 * @param h      number of lines; h-1 line pairs are compared
 * @return accumulated absolute gradient (0 when h <= 1)
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total= 0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++){
            const int upper= s1[col         ] - s2[col         ];
            const int lower= s1[col + stride] - s2[col + stride];

            total+= ABS(upper - lower);
        }
        s1+= stride;
        s2+= stride;
    }

    return total;
}
3611
3612#define SQ(a) ((a)*(a))
3613static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3614 int score=0;
3615 int x,y;
115329f1 3616
622348f9
MN
3617 for(y=1; y<h; y++){
3618 for(x=0; x<16; x+=4){
115329f1 3619 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
622348f9
MN
3620 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
3621 }
3622 s+= stride;
3623 }
115329f1