H.264 weighted prediction.
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 */
983e3246
MN
22
/**
 * @file dsputil.c
 * DSP utils
 */
27
de6d9b64
FB
28#include "avcodec.h"
29#include "dsputil.h"
1457ab52 30#include "mpegvideo.h"
b0368839 31#include "simple_idct.h"
65e4c8c9 32#include "faandct.h"
5596c60c 33
8b69867f
MN
34uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
35uint32_t squareTbl[512] = {0, };
de6d9b64 36
/* Zigzag scan order for an 8x8 block: entry i is the raster index
 * (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
47
10acc479
RS
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
60
2f349de2 61/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
8b69867f 62uint16_t __align8 inv_zigzag_direct16[64] = {0, };
2f349de2 63
/* Alternate (horizontally biased) scan order for an 8x8 block; entry i is
 * the raster index of the i-th coefficient in scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
74
/* Alternate (vertically biased) scan order for an 8x8 block; entry i is
 * the raster index of the i-th coefficient in scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
85
/* Fixed-point reciprocals for division by multiplication:
 * a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 * (the product must be formed in 64 bits). Entries 0 and 1 are
 * placeholders outside that guarantee. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
121
b0368839
MN
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
133
/**
 * Sum all pixel values of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size distance in bytes between the starts of consecutive rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
155
0c1a9eda 156static int pix_norm1_c(uint8_t * pix, int line_size)
3aa102be
MN
157{
158 int s, i, j;
0c1a9eda 159 uint32_t *sq = squareTbl + 256;
3aa102be
MN
160
161 s = 0;
162 for (i = 0; i < 16; i++) {
163 for (j = 0; j < 16; j += 8) {
2a006cd3 164#if 0
3aa102be
MN
165 s += sq[pix[0]];
166 s += sq[pix[1]];
167 s += sq[pix[2]];
168 s += sq[pix[3]];
169 s += sq[pix[4]];
170 s += sq[pix[5]];
171 s += sq[pix[6]];
172 s += sq[pix[7]];
2a006cd3
FL
173#else
174#if LONG_MAX > 2147483647
175 register uint64_t x=*(uint64_t*)pix;
176 s += sq[x&0xff];
177 s += sq[(x>>8)&0xff];
178 s += sq[(x>>16)&0xff];
179 s += sq[(x>>24)&0xff];
180 s += sq[(x>>32)&0xff];
181 s += sq[(x>>40)&0xff];
182 s += sq[(x>>48)&0xff];
183 s += sq[(x>>56)&0xff];
184#else
185 register uint32_t x=*(uint32_t*)pix;
186 s += sq[x&0xff];
187 s += sq[(x>>8)&0xff];
188 s += sq[(x>>16)&0xff];
189 s += sq[(x>>24)&0xff];
190 x=*(uint32_t*)(pix+4);
191 s += sq[x&0xff];
192 s += sq[(x>>8)&0xff];
193 s += sq[(x>>16)&0xff];
194 s += sq[(x>>24)&0xff];
195#endif
196#endif
3aa102be
MN
197 pix += 8;
198 }
199 pix += line_size - 16;
200 }
201 return s;
202}
203
3d2e8cce
MN
/**
 * Byte-swap w 32-bit words from src into dst using bswap_32().
 * @param dst destination array of at least w words
 * @param src source array of at least w words
 * @param w   number of words; nothing is written when w <= 0
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for (i = 0; i < w; i++)
        dst[i] = bswap_32(src[i]);
}
3aa102be 221
26efc54e
MN
222static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
223{
224 int s, i;
225 uint32_t *sq = squareTbl + 256;
226
227 s = 0;
228 for (i = 0; i < h; i++) {
229 s += sq[pix1[0] - pix2[0]];
230 s += sq[pix1[1] - pix2[1]];
231 s += sq[pix1[2] - pix2[2]];
232 s += sq[pix1[3] - pix2[3]];
233 pix1 += line_size;
234 pix2 += line_size;
235 }
236 return s;
237}
238
bb198e19 239static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
1457ab52
MN
240{
241 int s, i;
0c1a9eda 242 uint32_t *sq = squareTbl + 256;
1457ab52
MN
243
244 s = 0;
bb198e19 245 for (i = 0; i < h; i++) {
1457ab52
MN
246 s += sq[pix1[0] - pix2[0]];
247 s += sq[pix1[1] - pix2[1]];
248 s += sq[pix1[2] - pix2[2]];
249 s += sq[pix1[3] - pix2[3]];
250 s += sq[pix1[4] - pix2[4]];
251 s += sq[pix1[5] - pix2[5]];
252 s += sq[pix1[6] - pix2[6]];
253 s += sq[pix1[7] - pix2[7]];
254 pix1 += line_size;
255 pix2 += line_size;
256 }
257 return s;
258}
259
bb198e19 260static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
9c76bd48 261{
6b026927
FH
262 int s, i;
263 uint32_t *sq = squareTbl + 256;
9c76bd48
BF
264
265 s = 0;
bb198e19 266 for (i = 0; i < h; i++) {
6b026927
FH
267 s += sq[pix1[ 0] - pix2[ 0]];
268 s += sq[pix1[ 1] - pix2[ 1]];
269 s += sq[pix1[ 2] - pix2[ 2]];
270 s += sq[pix1[ 3] - pix2[ 3]];
271 s += sq[pix1[ 4] - pix2[ 4]];
272 s += sq[pix1[ 5] - pix2[ 5]];
273 s += sq[pix1[ 6] - pix2[ 6]];
274 s += sq[pix1[ 7] - pix2[ 7]];
275 s += sq[pix1[ 8] - pix2[ 8]];
276 s += sq[pix1[ 9] - pix2[ 9]];
277 s += sq[pix1[10] - pix2[10]];
278 s += sq[pix1[11] - pix2[11]];
279 s += sq[pix1[12] - pix2[12]];
280 s += sq[pix1[13] - pix2[13]];
281 s += sq[pix1[14] - pix2[14]];
282 s += sq[pix1[15] - pix2[15]];
2a006cd3 283
6b026927
FH
284 pix1 += line_size;
285 pix2 += line_size;
9c76bd48
BF
286 }
287 return s;
288}
289
26efc54e
MN
290
/**
 * Wavelet-domain comparison of two pixel blocks.
 *
 * The per-pixel difference, scaled by 16, is stored in a 16-wide scratch
 * buffer, transformed in place by ff_spatial_dwt(), and the absolute values
 * of the resulting coefficients are summed.
 *
 * @param v         unused context pointer
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size row stride in bytes, shared by both blocks
 * @param w         block width; the wrappers in this file pass 8 or 16
 *                  (8 selects 3 decomposition levels, anything else 4)
 * @param h         block height; must not exceed 16 (size of tmp)
 * @param type      forwarded to ff_spatial_dwt(); the w53 wrappers pass 1
 *                  and the w97 wrappers pass 0
 * @return sum of absolute coefficients divided by 4
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[16*16];
#if 0
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            //8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            //16x16 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {//FIXME 5/3
            //8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            //16x16 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };
#endif

    /* Multiply instead of shifting: the difference can be negative and
     * left-shifting a negative value is undefined behaviour in C. */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0]) * 16;
            tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1]) * 16;
            tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2]) * 16;
            tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3]) * 16;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);

    s=0;
#if 0
    /* disabled: per-subband weighting of the coefficients */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int sx= (ori&1) ? 1<<level: 0;
            int stride= 16<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;
            int size= 1<<level;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += ABS(v);
                }
            }
        }
    }
#endif
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            s+= ABS(tmp[16*i+j+0]);
            s+= ABS(tmp[16*i+j+1]);
            s+= ABS(tmp[16*i+j+2]);
            s+= ABS(tmp[16*i+j+3]);
        }
    }
    assert(s>=0);

    return s>>2;
}
370
/** Wavelet comparison of 8-pixel-wide blocks: calls w_c() with w = 8, type = 1. */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}
374
/** Wavelet comparison of 8-pixel-wide blocks: calls w_c() with w = 8, type = 0. */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}
378
/** Wavelet comparison of 16-pixel-wide blocks: calls w_c() with w = 16, type = 1. */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
382
/** Wavelet comparison of 16-pixel-wide blocks: calls w_c() with w = 16, type = 0. */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
386
0c1a9eda 387static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
de6d9b64 388{
de6d9b64
FB
389 int i;
390
391 /* read the pixels */
de6d9b64 392 for(i=0;i<8;i++) {
c13e1abd
FH
393 block[0] = pixels[0];
394 block[1] = pixels[1];
395 block[2] = pixels[2];
396 block[3] = pixels[3];
397 block[4] = pixels[4];
398 block[5] = pixels[5];
399 block[6] = pixels[6];
400 block[7] = pixels[7];
401 pixels += line_size;
402 block += 8;
de6d9b64
FB
403 }
404}
405
0c1a9eda
ZK
406static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
407 const uint8_t *s2, int stride){
9dbcbd92
MN
408 int i;
409
410 /* read the pixels */
9dbcbd92 411 for(i=0;i<8;i++) {
c13e1abd
FH
412 block[0] = s1[0] - s2[0];
413 block[1] = s1[1] - s2[1];
414 block[2] = s1[2] - s2[2];
415 block[3] = s1[3] - s2[3];
416 block[4] = s1[4] - s2[4];
417 block[5] = s1[5] - s2[5];
418 block[6] = s1[6] - s2[6];
419 block[7] = s1[7] - s2[7];
9dbcbd92
MN
420 s1 += stride;
421 s2 += stride;
c13e1abd 422 block += 8;
9dbcbd92
MN
423 }
424}
425
426
0c1a9eda 427static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
eb4b3dd3 428 int line_size)
de6d9b64 429{
de6d9b64 430 int i;
0c1a9eda 431 uint8_t *cm = cropTbl + MAX_NEG_CROP;
de6d9b64
FB
432
433 /* read the pixels */
de6d9b64 434 for(i=0;i<8;i++) {
c13e1abd
FH
435 pixels[0] = cm[block[0]];
436 pixels[1] = cm[block[1]];
437 pixels[2] = cm[block[2]];
438 pixels[3] = cm[block[3]];
439 pixels[4] = cm[block[4]];
440 pixels[5] = cm[block[5]];
441 pixels[6] = cm[block[6]];
442 pixels[7] = cm[block[7]];
443
444 pixels += line_size;
445 block += 8;
de6d9b64
FB
446 }
447}
448
178fcca8
MN
449static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
450 int line_size)
451{
452 int i;
453 uint8_t *cm = cropTbl + MAX_NEG_CROP;
454
455 /* read the pixels */
456 for(i=0;i<4;i++) {
457 pixels[0] = cm[block[0]];
458 pixels[1] = cm[block[1]];
459 pixels[2] = cm[block[2]];
460 pixels[3] = cm[block[3]];
461
462 pixels += line_size;
463 block += 8;
464 }
465}
466
9ca358b9
MN
467static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
468 int line_size)
469{
470 int i;
471 uint8_t *cm = cropTbl + MAX_NEG_CROP;
472
473 /* read the pixels */
474 for(i=0;i<2;i++) {
475 pixels[0] = cm[block[0]];
476 pixels[1] = cm[block[1]];
477
478 pixels += line_size;
479 block += 8;
480 }
481}
482
f9ed9d85
MM
483static void put_signed_pixels_clamped_c(const DCTELEM *block,
484 uint8_t *restrict pixels,
485 int line_size)
486{
487 int i, j;
488
489 for (i = 0; i < 8; i++) {
490 for (j = 0; j < 8; j++) {
491 if (*block < -128)
492 *pixels = 0;
493 else if (*block > 127)
494 *pixels = 255;
495 else
496 *pixels = (uint8_t)(*block + 128);
497 block++;
498 pixels++;
499 }
500 pixels += (line_size - 8);
501 }
502}
503
0c1a9eda 504static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
c13e1abd 505 int line_size)
de6d9b64 506{
de6d9b64 507 int i;
0c1a9eda 508 uint8_t *cm = cropTbl + MAX_NEG_CROP;
de6d9b64
FB
509
510 /* read the pixels */
de6d9b64 511 for(i=0;i<8;i++) {
c13e1abd
FH
512 pixels[0] = cm[pixels[0] + block[0]];
513 pixels[1] = cm[pixels[1] + block[1]];
514 pixels[2] = cm[pixels[2] + block[2]];
515 pixels[3] = cm[pixels[3] + block[3]];
516 pixels[4] = cm[pixels[4] + block[4]];
517 pixels[5] = cm[pixels[5] + block[5]];
518 pixels[6] = cm[pixels[6] + block[6]];
519 pixels[7] = cm[pixels[7] + block[7]];
520 pixels += line_size;
521 block += 8;
de6d9b64
FB
522 }
523}
178fcca8
MN
524
525static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
526 int line_size)
527{
528 int i;
529 uint8_t *cm = cropTbl + MAX_NEG_CROP;
530
531 /* read the pixels */
532 for(i=0;i<4;i++) {
533 pixels[0] = cm[pixels[0] + block[0]];
534 pixels[1] = cm[pixels[1] + block[1]];
535 pixels[2] = cm[pixels[2] + block[2]];
536 pixels[3] = cm[pixels[3] + block[3]];
537 pixels += line_size;
538 block += 8;
539 }
540}
9ca358b9
MN
541
542static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
543 int line_size)
544{
545 int i;
546 uint8_t *cm = cropTbl + MAX_NEG_CROP;
547
548 /* read the pixels */
549 for(i=0;i<2;i++) {
550 pixels[0] = cm[pixels[0] + block[0]];
551 pixels[1] = cm[pixels[1] + block[1]];
552 pixels += line_size;
553 block += 8;
554 }
555}
59fe111e
MN
556#if 0
557
/*
 * 64-bit variant of PIXOP2 (currently compiled out by the surrounding #if 0).
 * Generates, for a given OPNAME/OP pair, 8-pixel-wide copy and half-pel
 * interpolation routines that process one row per 64-bit word:
 *   _pixels_c              plain OP of the source row
 *   _[no_rnd_]pixels_x2_c  average with the pixel to the right
 *   _[no_rnd_]pixels_y2_c  average with the pixel below
 *   _[no_rnd_]pixels_xy2_c average of the 2x2 neighbourhood
 * (a|b) - (((a^b)&0xFE..)>>1) is the per-byte average rounded up,
 * (a&b) + (((a^b)&0xFE..)>>1) the per-byte average rounded down.
 * The xy2 routines split each byte into its low 2 and high 6 bits so four
 * bytes can be summed without carries crossing byte lanes; the rounding term
 * is 2 per byte (rounding) or 1 per byte (no_rnd). They handle two rows per
 * iteration, so h is expected to be even.
 * The plain copy routine is named _pixels_c (it used to be _pixels) so that
 * the CALL_2X_PIXELS line for _pixels16_c refers to an existing function.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
59fe111e
MN
696
/* Per-byte average, rounded up, of eight packed 8-bit values; the result is
 * stored back into a. Both arguments are evaluated more than once, so they
 * must be free of side effects. */
#define op_avg(a, b) (a) = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
698#else // 64 bit variant
699
700#define PIXOP2(OPNAME, OP) \
669ac79c
MN
701static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
702 int i;\
703 for(i=0; i<h; i++){\
704 OP(*((uint16_t*)(block )), LD16(pixels ));\
705 pixels+=line_size;\
706 block +=line_size;\
707 }\
708}\
0da71265
MN
709static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
710 int i;\
711 for(i=0; i<h; i++){\
712 OP(*((uint32_t*)(block )), LD32(pixels ));\
713 pixels+=line_size;\
714 block +=line_size;\
715 }\
716}\
45553457 717static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
59fe111e
MN
718 int i;\
719 for(i=0; i<h; i++){\
720 OP(*((uint32_t*)(block )), LD32(pixels ));\
721 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
722 pixels+=line_size;\
723 block +=line_size;\
724 }\
725}\
45553457
ZK
726static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
727 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
b3184779 728}\
59fe111e 729\
b3184779
MN
730static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
731 int src_stride1, int src_stride2, int h){\
59fe111e
MN
732 int i;\
733 for(i=0; i<h; i++){\
b3184779
MN
734 uint32_t a,b;\
735 a= LD32(&src1[i*src_stride1 ]);\
736 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 737 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
b3184779
MN
738 a= LD32(&src1[i*src_stride1+4]);\
739 b= LD32(&src2[i*src_stride2+4]);\
d8085ea7 740 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
59fe111e
MN
741 }\
742}\
743\
b3184779
MN
744static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
745 int src_stride1, int src_stride2, int h){\
59fe111e
MN
746 int i;\
747 for(i=0; i<h; i++){\
b3184779
MN
748 uint32_t a,b;\
749 a= LD32(&src1[i*src_stride1 ]);\
750 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 751 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
b3184779
MN
752 a= LD32(&src1[i*src_stride1+4]);\
753 b= LD32(&src2[i*src_stride2+4]);\
d8085ea7 754 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
59fe111e
MN
755 }\
756}\
757\
0da71265
MN
758static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
759 int src_stride1, int src_stride2, int h){\
760 int i;\
761 for(i=0; i<h; i++){\
762 uint32_t a,b;\
763 a= LD32(&src1[i*src_stride1 ]);\
764 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 765 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
0da71265
MN
766 }\
767}\
768\
669ac79c
MN
769static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
770 int src_stride1, int src_stride2, int h){\
771 int i;\
772 for(i=0; i<h; i++){\
773 uint32_t a,b;\
774 a= LD16(&src1[i*src_stride1 ]);\
775 b= LD16(&src2[i*src_stride2 ]);\
776 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
777 }\
778}\
779\
b3184779
MN
780static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
781 int src_stride1, int src_stride2, int h){\
782 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
783 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
784}\
785\
786static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
787 int src_stride1, int src_stride2, int h){\
788 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
789 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
790}\
791\
45553457 792static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
793 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
794}\
795\
45553457 796static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
797 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
798}\
799\
45553457 800static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
801 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
802}\
803\
45553457 804static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
805 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
806}\
807\
808static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
809 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
810 int i;\
811 for(i=0; i<h; i++){\
b3184779
MN
812 uint32_t a, b, c, d, l0, l1, h0, h1;\
813 a= LD32(&src1[i*src_stride1]);\
814 b= LD32(&src2[i*src_stride2]);\
815 c= LD32(&src3[i*src_stride3]);\
816 d= LD32(&src4[i*src_stride4]);\
817 l0= (a&0x03030303UL)\
818 + (b&0x03030303UL)\
819 + 0x02020202UL;\
820 h0= ((a&0xFCFCFCFCUL)>>2)\
821 + ((b&0xFCFCFCFCUL)>>2);\
822 l1= (c&0x03030303UL)\
823 + (d&0x03030303UL);\
824 h1= ((c&0xFCFCFCFCUL)>>2)\
825 + ((d&0xFCFCFCFCUL)>>2);\
826 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
827 a= LD32(&src1[i*src_stride1+4]);\
828 b= LD32(&src2[i*src_stride2+4]);\
829 c= LD32(&src3[i*src_stride3+4]);\
830 d= LD32(&src4[i*src_stride4+4]);\
831 l0= (a&0x03030303UL)\
832 + (b&0x03030303UL)\
833 + 0x02020202UL;\
834 h0= ((a&0xFCFCFCFCUL)>>2)\
835 + ((b&0xFCFCFCFCUL)>>2);\
836 l1= (c&0x03030303UL)\
837 + (d&0x03030303UL);\
838 h1= ((c&0xFCFCFCFCUL)>>2)\
839 + ((d&0xFCFCFCFCUL)>>2);\
840 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
841 }\
842}\
669ac79c
MN
843\
844static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
845 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
846}\
847\
848static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
849 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
850}\
851\
852static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
853 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
854}\
855\
856static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
857 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
858}\
859\
b3184779
MN
860static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
861 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
862 int i;\
863 for(i=0; i<h; i++){\
b3184779
MN
864 uint32_t a, b, c, d, l0, l1, h0, h1;\
865 a= LD32(&src1[i*src_stride1]);\
866 b= LD32(&src2[i*src_stride2]);\
867 c= LD32(&src3[i*src_stride3]);\
868 d= LD32(&src4[i*src_stride4]);\
869 l0= (a&0x03030303UL)\
870 + (b&0x03030303UL)\
871 + 0x01010101UL;\
872 h0= ((a&0xFCFCFCFCUL)>>2)\
873 + ((b&0xFCFCFCFCUL)>>2);\
874 l1= (c&0x03030303UL)\
875 + (d&0x03030303UL);\
876 h1= ((c&0xFCFCFCFCUL)>>2)\
877 + ((d&0xFCFCFCFCUL)>>2);\
878 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
879 a= LD32(&src1[i*src_stride1+4]);\
880 b= LD32(&src2[i*src_stride2+4]);\
881 c= LD32(&src3[i*src_stride3+4]);\
882 d= LD32(&src4[i*src_stride4+4]);\
883 l0= (a&0x03030303UL)\
884 + (b&0x03030303UL)\
885 + 0x01010101UL;\
886 h0= ((a&0xFCFCFCFCUL)>>2)\
887 + ((b&0xFCFCFCFCUL)>>2);\
888 l1= (c&0x03030303UL)\
889 + (d&0x03030303UL);\
890 h1= ((c&0xFCFCFCFCUL)>>2)\
891 + ((d&0xFCFCFCFCUL)>>2);\
892 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
893 }\
894}\
b3184779
MN
895static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
896 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
897 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
898 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
899}\
900static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
901 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
902 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
903 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
904}\
59fe111e 905\
669ac79c
MN
906static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
907{\
908 int i, a0, b0, a1, b1;\
909 a0= pixels[0];\
910 b0= pixels[1] + 2;\
911 a0 += b0;\
912 b0 += pixels[2];\
913\
914 pixels+=line_size;\
915 for(i=0; i<h; i+=2){\
916 a1= pixels[0];\
917 b1= pixels[1];\
918 a1 += b1;\
919 b1 += pixels[2];\
920\
921 block[0]= (a1+a0)>>2; /* FIXME non put */\
922 block[1]= (b1+b0)>>2;\
923\
924 pixels+=line_size;\
925 block +=line_size;\
926\
927 a0= pixels[0];\
928 b0= pixels[1] + 2;\
929 a0 += b0;\
930 b0 += pixels[2];\
931\
932 block[0]= (a1+a0)>>2;\
933 block[1]= (b1+b0)>>2;\
934 pixels+=line_size;\
935 block +=line_size;\
936 }\
937}\
938\
939static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
940{\
941 int i;\
942 const uint32_t a= LD32(pixels );\
943 const uint32_t b= LD32(pixels+1);\
944 uint32_t l0= (a&0x03030303UL)\
945 + (b&0x03030303UL)\
946 + 0x02020202UL;\
947 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
948 + ((b&0xFCFCFCFCUL)>>2);\
949 uint32_t l1,h1;\
950\
951 pixels+=line_size;\
952 for(i=0; i<h; i+=2){\
953 uint32_t a= LD32(pixels );\
954 uint32_t b= LD32(pixels+1);\
955 l1= (a&0x03030303UL)\
956 + (b&0x03030303UL);\
957 h1= ((a&0xFCFCFCFCUL)>>2)\
958 + ((b&0xFCFCFCFCUL)>>2);\
959 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
960 pixels+=line_size;\
961 block +=line_size;\
962 a= LD32(pixels );\
963 b= LD32(pixels+1);\
964 l0= (a&0x03030303UL)\
965 + (b&0x03030303UL)\
966 + 0x02020202UL;\
967 h0= ((a&0xFCFCFCFCUL)>>2)\
968 + ((b&0xFCFCFCFCUL)>>2);\
969 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
970 pixels+=line_size;\
971 block +=line_size;\
972 }\
973}\
974\
45553457 975static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
976{\
977 int j;\
978 for(j=0; j<2; j++){\
979 int i;\
980 const uint32_t a= LD32(pixels );\
981 const uint32_t b= LD32(pixels+1);\
982 uint32_t l0= (a&0x03030303UL)\
983 + (b&0x03030303UL)\
984 + 0x02020202UL;\
985 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
986 + ((b&0xFCFCFCFCUL)>>2);\
987 uint32_t l1,h1;\
988\
989 pixels+=line_size;\
990 for(i=0; i<h; i+=2){\
991 uint32_t a= LD32(pixels );\
992 uint32_t b= LD32(pixels+1);\
993 l1= (a&0x03030303UL)\
994 + (b&0x03030303UL);\
995 h1= ((a&0xFCFCFCFCUL)>>2)\
996 + ((b&0xFCFCFCFCUL)>>2);\
997 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
998 pixels+=line_size;\
999 block +=line_size;\
1000 a= LD32(pixels );\
1001 b= LD32(pixels+1);\
1002 l0= (a&0x03030303UL)\
1003 + (b&0x03030303UL)\
1004 + 0x02020202UL;\
1005 h0= ((a&0xFCFCFCFCUL)>>2)\
1006 + ((b&0xFCFCFCFCUL)>>2);\
1007 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1008 pixels+=line_size;\
1009 block +=line_size;\
1010 }\
1011 pixels+=4-line_size*(h+1);\
1012 block +=4-line_size*h;\
1013 }\
1014}\
1015\
45553457 1016static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
1017{\
1018 int j;\
1019 for(j=0; j<2; j++){\
1020 int i;\
1021 const uint32_t a= LD32(pixels );\
1022 const uint32_t b= LD32(pixels+1);\
1023 uint32_t l0= (a&0x03030303UL)\
1024 + (b&0x03030303UL)\
1025 + 0x01010101UL;\
1026 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1027 + ((b&0xFCFCFCFCUL)>>2);\
1028 uint32_t l1,h1;\
1029\
1030 pixels+=line_size;\
1031 for(i=0; i<h; i+=2){\
1032 uint32_t a= LD32(pixels );\
1033 uint32_t b= LD32(pixels+1);\
1034 l1= (a&0x03030303UL)\
1035 + (b&0x03030303UL);\
1036 h1= ((a&0xFCFCFCFCUL)>>2)\
1037 + ((b&0xFCFCFCFCUL)>>2);\
1038 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1039 pixels+=line_size;\
1040 block +=line_size;\
1041 a= LD32(pixels );\
1042 b= LD32(pixels+1);\
1043 l0= (a&0x03030303UL)\
1044 + (b&0x03030303UL)\
1045 + 0x01010101UL;\
1046 h0= ((a&0xFCFCFCFCUL)>>2)\
1047 + ((b&0xFCFCFCFCUL)>>2);\
1048 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1049 pixels+=line_size;\
1050 block +=line_size;\
1051 }\
1052 pixels+=4-line_size*(h+1);\
1053 block +=4-line_size*h;\
1054 }\
1055}\
1056\
45553457
ZK
1057CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1058CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1059CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1060CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1061CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1062CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1063CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1064CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
b3184779 1065
d8085ea7 1066#define op_avg(a, b) a = rnd_avg32(a, b)
59fe111e 1067#endif
59fe111e
MN
1068#define op_put(a, b) a = b
1069
1070PIXOP2(avg, op_avg)
1071PIXOP2(put, op_put)
1072#undef op_avg
1073#undef op_put
1074
de6d9b64
FB
/* Rounded-up average of two values. Every argument is parenthesized so that
 * argument expressions built from low-precedence operators (|, &, ?:, ...)
 * are evaluated as a unit. Arguments are evaluated exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
/* Rounded average of four values (adds 2 before dividing by 4). */
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1077
c0a0170c
MN
/* Five-argument adapter around put_no_rnd_pixels16_l2(): the single stride
 * supplied by the caller is forwarded for all three stride arguments. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b,
                           stride, stride, stride,
                           h);
}
1081
/* Five-argument adapter around put_no_rnd_pixels8_l2(): the single stride
 * supplied by the caller is forwarded for all three stride arguments. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b,
                          stride, stride, stride,
                          h);
}
073b013d 1085
/**
 * Bilinear interpolation of an 8-pixel-wide block with a fractional offset
 * given in 1/16 units.
 *
 * @param dst     destination, rows are stride bytes apart
 * @param src     source; columns 0..8 of rows 0..h are read
 * @param stride  distance in bytes between rows of both dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbour (out of 16)
 * @param y16     vertical weight of the lower neighbour (out of 16)
 * @param rounder value added to the weighted sum before the >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* Weights of the four neighbours; together they always add up to 256. */
    const int w_cur        = (16 - x16) * (16 - y16);
    const int w_right      = x16 * (16 - y16);
    const int w_below      = (16 - x16) * y16;
    const int w_belowright = x16 * y16;
    int row;

    for (row = 0; row < h; row++) {
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_cur        * src[col]
                      + w_right      * src[col + 1]
                      + w_below      * src[stride + col]
                      + w_belowright * src[stride + col + 1]
                      + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1108
/**
 * Warped (affine) motion compensation of an 8-pixel-wide block.
 *
 * For every output pixel a source position (vx, vy) is tracked; it advances
 * by (dxx, dyx) per column and its row start advances by (dxy, dyy) per row.
 * The low 16 bits of the position are dropped, the next `shift` bits are the
 * interpolation fraction and the remainder is the integer sample coordinate.
 * Coordinates outside [0, width-1) x [0, height-1) are clamped with clip()
 * and interpolated only along the axis that is still inside (or not at all).
 *
 * @param dst     destination block, rows are stride bytes apart
 * @param src     source picture, rows are stride bytes apart
 * @param h       number of rows to produce
 * @param ox, oy  source position of the first output pixel
 * @param shift   number of fractional bits used for interpolation
 * @param r       rounding term added before the final >>(2*shift)
 * @param width, height  source dimensions used for the bounds checks
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1 << shift;
    const int frac_mask = s - 1;
    const int last_x    = width - 1;
    const int last_y    = height - 1;
    int row;

    for (row = 0; row < h; row++, ox += dxy, oy += dyy) {
        uint8_t *out = dst + row * stride;
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++, vx += dxx, vy += dyx) { //XXX FIXME optimize
            const int pos_x  = vx >> 16;
            const int pos_y  = vy >> 16;
            const int frac_x = pos_x & frac_mask;
            const int frac_y = pos_y & frac_mask;
            const int src_x  = pos_x >> shift;
            const int src_y  = pos_y >> shift;
            /* The unsigned compare rejects negative coordinates as well. */
            const int x_inside = (unsigned)src_x < last_x;
            const int y_inside = (unsigned)src_y < last_y;
            int index;

            if (x_inside && y_inside) {
                /* Full bilinear interpolation. */
                int upper, lower;
                index = src_x + src_y * stride;
                upper = src[index             ] * (s - frac_x)
                      + src[index          + 1] *      frac_x;
                lower = src[index + stride    ] * (s - frac_x)
                      + src[index + stride + 1] *      frac_x;
                out[col] = (upper * (s - frac_y) + lower * frac_y + r) >> (shift * 2);
            } else if (x_inside) {
                /* Row clamped: interpolate horizontally only. */
                index = src_x + clip(src_y, 0, last_y) * stride;
                out[col] = ((src[index    ] * (s - frac_x)
                           + src[index + 1] *      frac_x) * s
                           + r) >> (shift * 2);
            } else if (y_inside) {
                /* Column clamped: interpolate vertically only. */
                index = clip(src_x, 0, last_x) + src_y * stride;
                out[col] = ((src[index         ] * (s - frac_y)
                           + src[index + stride] *      frac_y) * s
                           + r) >> (shift * 2);
            } else {
                /* Both clamped: plain copy of the nearest sample. */
                index = clip(src_x, 0, last_x) + clip(src_y, 0, last_y) * stride;
                out[col] = src[index];
            }
        }
    }
}
669ac79c
MN
1166
/* Plain block copy: forwards to the put_pixels<width>_c routine matching
 * width (2, 4, 8 or 16); any other width does nothing. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
1175
/* Horizontal blend weighting the current pixel 2 and its right neighbour 1;
 * the division by 3 is done as a multiply by 683 and >>11. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1186
/* Horizontal blend weighting the current pixel 1 and its right neighbour 2;
 * the division by 3 is done as a multiply by 683 and >>11. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1197
/* Vertical blend weighting the current pixel 2 and the pixel below it 1;
 * the division by 3 is done as a multiply by 683 and >>11. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1208
/* 2x2 blend with weights 4 (current), 3 (right), 3 (below), 2 (below-right);
 * the division by 12 is done as a multiply by 2731 and >>15. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1]
                          + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1219
/* 2x2 blend with weights 3 (current), 2 (right), 4 (below), 3 (below-right);
 * the division by 12 is done as a multiply by 2731 and >>15. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1]
                          + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1230
/* Vertical blend weighting the current pixel 1 and the pixel below it 2;
 * the division by 3 is done as a multiply by 683 and >>11. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1241
/* 2x2 blend with weights 3 (current), 4 (right), 2 (below), 3 (below-right);
 * the division by 12 is done as a multiply by 2731 and >>15. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1]
                          + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1252
/* 2x2 blend with weights 2 (current), 3 (right), 3 (below), 4 (below-right);
 * the division by 12 is done as a multiply by 2731 and >>15. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1]
                          + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
da3b9756
MM
1263
/* Forwards to the avg_pixels<width>_c routine matching width (2, 4, 8 or
 * 16); any other width does nothing. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1272
/* Computes the 2:1 (current:right) horizontal blend, then stores the
 * rounded-up average of that value and the existing destination pixel. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1283
/* Computes the 1:2 (current:right) horizontal blend, then stores the
 * rounded-up average of that value and the existing destination pixel. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1294
/* Computes the 2:1 (current:below) vertical blend, then stores the
 * rounded-up average of that value and the existing destination pixel. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1305
/* Computes the 4/3/3/2 (current/right/below/below-right) 2x2 blend, then
 * stores the rounded-up average of it and the existing destination pixel. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1]
                          + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            const int interp = (2731*sum) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1316
/* Computes the 3/2/4/3 (current/right/below/below-right) 2x2 blend, then
 * stores the rounded-up average of it and the existing destination pixel. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1]
                          + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            const int interp = (2731*sum) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1327
/* Computes the 1:2 (current:below) vertical blend, then stores the
 * rounded-up average of that value and the existing destination pixel. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1338
/* Computes the 3/4/2/3 (current/right/below/below-right) 2x2 blend, then
 * stores the rounded-up average of it and the existing destination pixel. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1]
                          + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            const int interp = (2731*sum) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1349
/* Computes the 2/3/3/4 (current/right/below/below-right) 2x2 blend, then
 * stores the rounded-up average of it and the existing destination pixel. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1]
                          + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            const int interp = (2731*sum) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
669ac79c
MN
#if 0
/* Disabled template: TPEL_WIDTH(width) would define fixed-width wrappers
 * put_tpel_pixels<width>_mcXY_c(dst, src, stride, height) that forward to the
 * generic put_tpel_pixels_mcXY_c() with the width filled in.
 * The wrapper bodies are plain calls; a stray "void" in front of each call
 * used to turn them into invalid declarations, so the macro could not have
 * been compiled if this block were ever enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1381
0da71265
MN
/**
 * Helper for H264_CHROMA_MC: defines OPNAME##h264_chroma_mc<W>_c, a bilinear
 * chroma interpolator for a block W pixels wide.
 *
 * x and y are the fractional offsets in 1/8 units (asserted to be 0..7).
 * The four neighbour weights A, B, C, D sum to 64; OP receives the
 * destination pixel and the unnormalised weighted sum.
 *
 * Only neighbours with a non-zero weight are read:
 *   - D != 0 : both x and y are non-zero, full 2x2 interpolation;
 *   - B or C != 0 : exactly one of x / y is non-zero, so a single
 *     neighbour (right if y == 0, below if x == 0) is blended in;
 *   - otherwise x == y == 0 and only the co-located pixel is used.
 * This yields the same results as always evaluating all four taps, but
 * does not touch the extra column / row of src when its weight is zero.
 */
#define H264_CHROMA_MC_W(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            for(j=0; j<W; j++){\
                OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }else if(B + C){\
        const int E= B+C; /* whichever of B, C is non-zero */\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            for(j=0; j<W; j++){\
                OP(dst[j], (A*src[j] + E*src[step+j]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        for(i=0; i<h; i++){\
            for(j=0; j<W; j++){\
                OP(dst[j], (A*src[j]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

/**
 * Instantiates the 2-, 4- and 8-pixel-wide H.264 chroma motion compensation
 * functions OPNAME##h264_chroma_mc{2,4,8}_c using the store operation OP.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_W(OPNAME, OP, 2)\
\
H264_CHROMA_MC_W(OPNAME, OP, 4)\
\
H264_CHROMA_MC_W(OPNAME, OP, 8)

/* b is the weighted sum scaled by 64: normalise with rounding (+32, >>6).
 * op_avg additionally averages (rounding up) with the existing pixel. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1452
/* Copies h rows of 4 bytes from src to dst, one LD32/ST32 pair per row;
 * the two buffers may have different row strides. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        ST32(dst, LD32(src));
    }
}
1463
/* Copies h rows of 8 bytes from src to dst as two 32-bit LD32/ST32 moves
 * per row; the two buffers may have different row strides. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        int off;
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
    }
}
1475
/* Copies h rows of 16 bytes from src to dst as four 32-bit LD32/ST32 moves
 * per row; the two buffers may have different row strides. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        int off;
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
    }
}
073b013d 1489
/* Copies h rows of 17 bytes from src to dst: four 32-bit LD32/ST32 moves
 * followed by one single-byte copy per row. The two buffers may have
 * different row strides. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        int off;
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[16] = src[16];
    }
}
1504
/* Copies h rows of 9 bytes from src to dst: two 32-bit LD32/ST32 moves
 * followed by one single-byte copy per row. The two buffers may have
 * different row strides. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for (row = 0; row < h; row++, dst += dstStride, src += srcStride) {
        int off;
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[8] = src[8];
    }
}
1517
826f429a 1518
b3184779 1519#define QPEL_MC(r, OPNAME, RND, OP) \
0c1a9eda
ZK
1520static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1521 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779
MN
1522 int i;\
1523 for(i=0; i<h; i++)\
1524 {\
1525 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1526 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1527 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1528 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1529 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1530 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1531 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1532 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1533 dst+=dstStride;\
1534 src+=srcStride;\
1535 }\
44eb4951
MN
1536}\
1537\
0c1a9eda 1538static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
db794953 1539 const int w=8;\
0c1a9eda 1540 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779
MN
1541 int i;\
1542 for(i=0; i<w; i++)\
1543 {\
1544 const int src0= src[0*srcStride];\
1545 const int src1= src[1*srcStride];\
1546 const int src2= src[2*srcStride];\
1547 const int src3= src[3*srcStride];\
1548 const int src4= src[4*srcStride];\
1549 const int src5= src[5*srcStride];\
1550 const int src6= src[6*srcStride];\
1551 const int src7= src[7*srcStride];\
1552 const int src8= src[8*srcStride];\
1553 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1554 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1555 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1556 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1557 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1558 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1559 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1560 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1561 dst++;\
1562 src++;\
1563 }\
1564}\
1565\
0c1a9eda
ZK
1566static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1567 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779 1568 int i;\
826f429a 1569 \
b3184779
MN
1570 for(i=0; i<h; i++)\
1571 {\
1572 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1573 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1574 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1575 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1576 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1577 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1578 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1579 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1580 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1581 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1582 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1583 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1584 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1585 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1586 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1587 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1588 dst+=dstStride;\
1589 src+=srcStride;\
1590 }\
1591}\
1592\
0c1a9eda
ZK
1593static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1594 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779 1595 int i;\
826f429a 1596 const int w=16;\
b3184779
MN
1597 for(i=0; i<w; i++)\
1598 {\
1599 const int src0= src[0*srcStride];\
1600 const int src1= src[1*srcStride];\
1601 const int src2= src[2*srcStride];\
1602 const int src3= src[3*srcStride];\
1603 const int src4= src[4*srcStride];\
1604 const int src5= src[5*srcStride];\
1605 const int src6= src[6*srcStride];\
1606 const int src7= src[7*srcStride];\
1607 const int src8= src[8*srcStride];\
1608 const int src9= src[9*srcStride];\
1609 const int src10= src[10*srcStride];\
1610 const int src11= src[11*srcStride];\
1611 const int src12= src[12*srcStride];\
1612 const int src13= src[13*srcStride];\
1613 const int src14= src[14*srcStride];\
1614 const int src15= src[15*srcStride];\
1615 const int src16= src[16*srcStride];\
1616 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1617 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1618 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1619 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1620 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1621 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1622 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1623 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1624 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1625 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1626 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1627 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1628 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1629 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1630 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1631 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1632 dst++;\
1633 src++;\
1634 }\
1635}\
1636\
0c1a9eda 1637static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1638 OPNAME ## pixels8_c(dst, src, stride, 8);\
b3184779
MN
1639}\
1640\
0c1a9eda
ZK
1641static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1642 uint8_t half[64];\
b3184779
MN
1643 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1644 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
44eb4951
MN
1645}\
1646\
0c1a9eda 1647static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1648 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
44eb4951
MN
1649}\
1650\
0c1a9eda
ZK
1651static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1652 uint8_t half[64];\
b3184779
MN
1653 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1654 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
44eb4951
MN
1655}\
1656\
0c1a9eda
ZK
1657static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1658 uint8_t full[16*9];\
1659 uint8_t half[64];\
b3184779 1660 copy_block9(full, src, 16, stride, 9);\
db794953 1661 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1662 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
44eb4951
MN
1663}\
1664\
0c1a9eda
ZK
1665static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1666 uint8_t full[16*9];\
b3184779 1667 copy_block9(full, src, 16, stride, 9);\
db794953 1668 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
44eb4951
MN
1669}\
1670\
0c1a9eda
ZK
1671static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1672 uint8_t full[16*9];\
1673 uint8_t half[64];\
b3184779 1674 copy_block9(full, src, 16, stride, 9);\
db794953 1675 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1676 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
44eb4951 1677}\
0c1a9eda
ZK
1678void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1679 uint8_t full[16*9];\
1680 uint8_t halfH[72];\
1681 uint8_t halfV[64];\
1682 uint8_t halfHV[64];\
b3184779
MN
1683 copy_block9(full, src, 16, stride, 9);\
1684 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1685 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1686 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1687 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1688}\
0c1a9eda
ZK
1689static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1690 uint8_t full[16*9];\
1691 uint8_t halfH[72];\
1692 uint8_t halfHV[64];\
db794953
MN
1693 copy_block9(full, src, 16, stride, 9);\
1694 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1695 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1696 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1697 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1698}\
0c1a9eda
ZK
1699void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1700 uint8_t full[16*9];\
1701 uint8_t halfH[72];\
1702 uint8_t halfV[64];\
1703 uint8_t halfHV[64];\
b3184779
MN
1704 copy_block9(full, src, 16, stride, 9);\
1705 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1706 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1707 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1708 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1709}\
0c1a9eda
ZK
1710static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1711 uint8_t full[16*9];\
1712 uint8_t halfH[72];\
1713 uint8_t halfHV[64];\
db794953
MN
1714 copy_block9(full, src, 16, stride, 9);\
1715 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1716 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1717 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1718 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1719}\
0c1a9eda
ZK
1720void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1721 uint8_t full[16*9];\
1722 uint8_t halfH[72];\
1723 uint8_t halfV[64];\
1724 uint8_t halfHV[64];\
b3184779
MN
1725 copy_block9(full, src, 16, stride, 9);\
1726 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1727 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1728 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1729 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1730}\
0c1a9eda
ZK
1731static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1732 uint8_t full[16*9];\
1733 uint8_t halfH[72];\
1734 uint8_t halfHV[64];\
db794953
MN
1735 copy_block9(full, src, 16, stride, 9);\
1736 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1737 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1738 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1739 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1740}\
0c1a9eda
ZK
1741void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1742 uint8_t full[16*9];\
1743 uint8_t halfH[72];\
1744 uint8_t halfV[64];\
1745 uint8_t halfHV[64];\
b3184779
MN
1746 copy_block9(full, src, 16, stride, 9);\
1747 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
db794953
MN
1748 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1749 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1750 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1751}\
0c1a9eda
ZK
1752static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1753 uint8_t full[16*9];\
1754 uint8_t halfH[72];\
1755 uint8_t halfHV[64];\
db794953
MN
1756 copy_block9(full, src, 16, stride, 9);\
1757 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1758 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1759 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1760 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1761}\
0c1a9eda
ZK
1762static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1763 uint8_t halfH[72];\
1764 uint8_t halfHV[64];\
b3184779 1765 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1766 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1767 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
44eb4951 1768}\
0c1a9eda
ZK
1769static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1770 uint8_t halfH[72];\
1771 uint8_t halfHV[64];\
b3184779 1772 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1773 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1774 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
44eb4951 1775}\
0c1a9eda
ZK
1776void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1777 uint8_t full[16*9];\
1778 uint8_t halfH[72];\
1779 uint8_t halfV[64];\
1780 uint8_t halfHV[64];\
b3184779
MN
1781 copy_block9(full, src, 16, stride, 9);\
1782 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1783 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1784 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1785 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1786}\
0c1a9eda
ZK
1787static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1788 uint8_t full[16*9];\
1789 uint8_t halfH[72];\
db794953
MN
1790 copy_block9(full, src, 16, stride, 9);\
1791 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1792 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1793 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1794}\
0c1a9eda
ZK
1795void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1796 uint8_t full[16*9];\
1797 uint8_t halfH[72];\
1798 uint8_t halfV[64];\
1799 uint8_t halfHV[64];\
b3184779
MN
1800 copy_block9(full, src, 16, stride, 9);\
1801 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1802 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1803 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1804 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1805}\
0c1a9eda
ZK
1806static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1807 uint8_t full[16*9];\
1808 uint8_t halfH[72];\
db794953
MN
1809 copy_block9(full, src, 16, stride, 9);\
1810 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1811 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1812 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1813}\
0c1a9eda
ZK
1814static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1815 uint8_t halfH[72];\
b3184779 1816 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1817 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
b3184779 1818}\
0c1a9eda 1819static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1820 OPNAME ## pixels16_c(dst, src, stride, 16);\
b3184779
MN
1821}\
1822\
0c1a9eda
ZK
1823static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1824 uint8_t half[256];\
b3184779
MN
1825 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1826 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1827}\
1828\
0c1a9eda 1829static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1830 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
44eb4951 1831}\
b3184779 1832\
0c1a9eda
ZK
1833static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1834 uint8_t half[256];\
b3184779
MN
1835 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1836 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1837}\
1838\
0c1a9eda
ZK
1839static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1840 uint8_t full[24*17];\
1841 uint8_t half[256];\
b3184779 1842 copy_block17(full, src, 24, stride, 17);\
826f429a 1843 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1844 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1845}\
1846\
0c1a9eda
ZK
1847static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1848 uint8_t full[24*17];\
b3184779 1849 copy_block17(full, src, 24, stride, 17);\
826f429a 1850 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
b3184779
MN
1851}\
1852\
0c1a9eda
ZK
1853static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1854 uint8_t full[24*17];\
1855 uint8_t half[256];\
b3184779 1856 copy_block17(full, src, 24, stride, 17);\
826f429a 1857 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1858 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1859}\
0c1a9eda
ZK
1860void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1861 uint8_t full[24*17];\
1862 uint8_t halfH[272];\
1863 uint8_t halfV[256];\
1864 uint8_t halfHV[256];\
b3184779
MN
1865 copy_block17(full, src, 24, stride, 17);\
1866 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1867 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1868 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1869 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1870}\
0c1a9eda
ZK
1871static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1872 uint8_t full[24*17];\
1873 uint8_t halfH[272];\
1874 uint8_t halfHV[256];\
db794953
MN
1875 copy_block17(full, src, 24, stride, 17);\
1876 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1877 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1878 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1879 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1880}\
0c1a9eda
ZK
1881void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1882 uint8_t full[24*17];\
1883 uint8_t halfH[272];\
1884 uint8_t halfV[256];\
1885 uint8_t halfHV[256];\
b3184779
MN
1886 copy_block17(full, src, 24, stride, 17);\
1887 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1888 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1889 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1890 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1891}\
0c1a9eda
ZK
1892static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1893 uint8_t full[24*17];\
1894 uint8_t halfH[272];\
1895 uint8_t halfHV[256];\
db794953
MN
1896 copy_block17(full, src, 24, stride, 17);\
1897 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1898 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1899 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1900 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1901}\
0c1a9eda
ZK
1902void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1903 uint8_t full[24*17];\
1904 uint8_t halfH[272];\
1905 uint8_t halfV[256];\
1906 uint8_t halfHV[256];\
b3184779
MN
1907 copy_block17(full, src, 24, stride, 17);\
1908 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1909 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1910 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1911 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1912}\
0c1a9eda
ZK
1913static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1914 uint8_t full[24*17];\
1915 uint8_t halfH[272];\
1916 uint8_t halfHV[256];\
db794953
MN
1917 copy_block17(full, src, 24, stride, 17);\
1918 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1919 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1920 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1921 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1922}\
0c1a9eda
ZK
1923void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1924 uint8_t full[24*17];\
1925 uint8_t halfH[272];\
1926 uint8_t halfV[256];\
1927 uint8_t halfHV[256];\
b3184779
MN
1928 copy_block17(full, src, 24, stride, 17);\
1929 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
826f429a
MN
1930 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1931 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1932 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1933}\
0c1a9eda
ZK
1934static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1935 uint8_t full[24*17];\
1936 uint8_t halfH[272];\
1937 uint8_t halfHV[256];\
db794953
MN
1938 copy_block17(full, src, 24, stride, 17);\
1939 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1940 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1941 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1942 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1943}\
0c1a9eda
ZK
1944static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1945 uint8_t halfH[272];\
1946 uint8_t halfHV[256];\
b3184779 1947 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1948 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1949 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1950}\
0c1a9eda
ZK
1951static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1952 uint8_t halfH[272];\
1953 uint8_t halfHV[256];\
b3184779 1954 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1955 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1956 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1957}\
0c1a9eda
ZK
1958void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1959 uint8_t full[24*17];\
1960 uint8_t halfH[272];\
1961 uint8_t halfV[256];\
1962 uint8_t halfHV[256];\
b3184779
MN
1963 copy_block17(full, src, 24, stride, 17);\
1964 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1965 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1966 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1967 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1968}\
0c1a9eda
ZK
1969static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1970 uint8_t full[24*17];\
1971 uint8_t halfH[272];\
db794953
MN
1972 copy_block17(full, src, 24, stride, 17);\
1973 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1974 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1975 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1976}\
0c1a9eda
ZK
1977void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1978 uint8_t full[24*17];\
1979 uint8_t halfH[272];\
1980 uint8_t halfV[256];\
1981 uint8_t halfHV[256];\
b3184779
MN
1982 copy_block17(full, src, 24, stride, 17);\
1983 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1984 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1985 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1986 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1987}\
0c1a9eda
ZK
1988static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1989 uint8_t full[24*17];\
1990 uint8_t halfH[272];\
db794953
MN
1991 copy_block17(full, src, 24, stride, 17);\
1992 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1993 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1994 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1995}\
0c1a9eda
ZK
1996static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1997 uint8_t halfH[272];\
b3184779 1998 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1999 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
45553457 2000}
44eb4951 2001
b3184779
MN
2002#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2003#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2004#define op_put(a, b) a = cm[((b) + 16)>>5]
2005#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2006
2007QPEL_MC(0, put_ , _ , op_put)
2008QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2009QPEL_MC(0, avg_ , _ , op_avg)
2010//QPEL_MC(1, avg_no_rnd , _ , op_avg)
2011#undef op_avg
2012#undef op_avg_no_rnd
2013#undef op_put
2014#undef op_put_no_rnd
44eb4951 2015
0da71265
MN
2016#if 1
/**
 * Six-tap (1, -5, 20, 20, -5, 1) interpolation kernels for SIZE x SIZE blocks.
 *
 * For each SIZE this generates:
 *  - OPNAME h264_qpel<SIZE>_h_lowpass:  filters along each row,
 *  - OPNAME h264_qpel<SIZE>_v_lowpass:  filters along each column,
 *  - OPNAME h264_qpel<SIZE>_hv_lowpass: filters SIZE+5 rows horizontally into
 *    the 16 bit scratch buffer tmp, then filters tmp vertically.
 *
 * OP(dst, sum) stores a single-pass sum (the taps add up to 32) and
 * OP2(dst, sum) stores a two-pass sum (taps add up to 32*32); both may use the
 * local crop table pointer `cm`.
 * The filters read two samples before and three samples after the block along
 * the filtered direction.
 */
#define H264_LOWPASS_SIZE(OPNAME, OP, OP2, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int x, y;\
    for(y=0; y<SIZE; y++){\
        for(x=0; x<SIZE; x++){\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
        }\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int col[SIZE+5];\
    int x, y, k;\
    for(x=0; x<SIZE; x++){\
        /* fetch the whole column first; col[k] is the sample of row k-2 */ \
        for(k=0; k<SIZE+5; k++){\
            col[k]= src[(k-2)*srcStride];\
        }\
        for(y=0; y<SIZE; y++){\
            OP(dst[y*dstStride], (col[y+2]+col[y+3])*20 - (col[y+1]+col[y+4])*5 + (col[y]+col[y+5]));\
        }\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int col[SIZE+5];\
    int x, y, k;\
    /* first pass: horizontal filter over rows -2 .. SIZE+2, kept unscaled in tmp */ \
    src -= 2*srcStride;\
    for(y=0; y<SIZE+5; y++){\
        for(x=0; x<SIZE; x++){\
            tmp[x]= (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]);\
        }\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind tmp so that it points at the row belonging to output row 0 */ \
    tmp -= tmpStride*(SIZE+5-2);\
    /* second pass: vertical filter over the intermediate rows */ \
    for(x=0; x<SIZE; x++){\
        for(k=0; k<SIZE+5; k++){\
            col[k]= tmp[(k-2)*tmpStride];\
        }\
        for(y=0; y<SIZE; y++){\
            OP2(dst[y*dstStride], (col[y+2]+col[y+3])*20 - (col[y+1]+col[y+4])*5 + (col[y]+col[y+5]));\
        }\
        dst++;\
        tmp++;\
    }\
}\

/**
 * Instantiates the 4x4 and 8x8 kernels and builds the 16x16 variants out of
 * four 8x8 calls (top-left, top-right, bottom-left, bottom-right).
 * The 16x16 hv variant reuses the same two tmp regions for the lower half.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
H264_LOWPASS_SIZE(OPNAME, OP, OP2, 4)\
H264_LOWPASS_SIZE(OPNAME, OP, OP2, 8)\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int dstLow= 8*dstStride;\
    const int srcLow= 8*srcStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst         , src         , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8       , src+8       , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+dstLow  , src+srcLow  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+dstLow+8, src+srcLow+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int dstLow= 8*dstStride;\
    const int srcLow= 8*srcStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst         , src         , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8       , src+8       , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+dstLow  , src+srcLow  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+dstLow+8, src+srcLow+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int dstLow= 8*dstStride;\
    const int srcLow= 8*srcStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst         , tmp  , src         , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8       , tmp+8, src+8       , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+dstLow  , tmp  , src+srcLow  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+dstLow+8, tmp+8, src+srcLow+8, dstStride, tmpStride, srcStride);\
}\

/**
 * Generates the sixteen OPNAME h264_qpel<SIZE>_mcXY_c entry points.
 *
 * As the bodies show, X selects the horizontal and Y the vertical quarter
 * sample offset: 0 uses the source samples directly, 2 uses the six-tap
 * filtered plane, and 1/3 average two neighbouring planes.
 * Intermediate planes are always produced with the put_ kernels; only the
 * final store into dst goes through OPNAME.
 * Vertical filtering works on a private copy of SIZE+5 source rows that
 * starts two rows above the block.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hpel, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    /* same half-pel plane as mc10, averaged with the right-hand neighbour */ \
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hpel, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t * const centre= rows + 2*SIZE;\
    copy_block ## SIZE (rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hpel, centre, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, centre, hpel, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t * const centre= rows + 2*SIZE;\
    copy_block ## SIZE (rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, centre, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t * const centre= rows + 2*SIZE;\
    copy_block ## SIZE (rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(hpel, centre, SIZE, SIZE);\
    /* same half-pel plane as mc01, averaged with the row below */ \
    OPNAME ## pixels ## SIZE ## _l2(dst, centre+SIZE, hpel, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t horiz[SIZE*SIZE];\
    uint8_t vert[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vert, rows + 2*SIZE, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(horiz, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, horiz, vert, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t horiz[SIZE*SIZE];\
    uint8_t vert[SIZE*SIZE];\
    /* vertical plane is taken one column to the right */ \
    copy_block ## SIZE (rows, src - 2*stride + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vert, rows + 2*SIZE, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(horiz, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, horiz, vert, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t horiz[SIZE*SIZE];\
    uint8_t vert[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vert, rows + 2*SIZE, SIZE, SIZE);\
    /* horizontal plane is taken one row further down */ \
    put_h264_qpel ## SIZE ## _h_lowpass(horiz, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, horiz, vert, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    uint8_t horiz[SIZE*SIZE];\
    uint8_t vert[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - 2*stride + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vert, rows + 2*SIZE, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(horiz, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, horiz, vert, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, mid, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t horiz[SIZE*SIZE];\
    uint8_t centre[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(centre, mid, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(horiz, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, horiz, centre, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t horiz[SIZE*SIZE];\
    uint8_t centre[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(centre, mid, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(horiz, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, horiz, centre, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t vert[SIZE*SIZE];\
    uint8_t centre[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - 2*stride, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vert, rows + 2*SIZE, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(centre, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vert, centre, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t rows[SIZE*(SIZE+5)];\
    int16_t mid[SIZE*(SIZE+5)];\
    uint8_t vert[SIZE*SIZE];\
    uint8_t centre[SIZE*SIZE];\
    copy_block ## SIZE (rows, src - 2*stride + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vert, rows + 2*SIZE, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(centre, mid, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vert, centre, stride, SIZE, SIZE, SIZE);\
}\

2356#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2357//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2358#define op_put(a, b) a = cm[((b) + 16)>>5]
2359#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2360#define op2_put(a, b) a = cm[((b) + 512)>>10]
2361
2362H264_LOWPASS(put_ , op_put, op2_put)
2363H264_LOWPASS(avg_ , op_avg, op2_avg)
2364H264_MC(put_, 4)
2365H264_MC(put_, 8)
2366H264_MC(put_, 16)
2367H264_MC(avg_, 4)
2368H264_MC(avg_, 8)
2369H264_MC(avg_, 16)
2370
2371#undef op_avg
2372#undef op_put
2373#undef op2_avg
2374#undef op2_put
2375#endif
2376
9f2d1b4f
LM
/**
 * Clamps x to the unsigned 8 bit range.
 * @param x value to clamp
 * @return 0 if x is negative, 255 if x exceeds 255, x otherwise
 */
static inline uint8_t clip1(int x){
    if(x > 255) return 255;
    if(x < 0) return 0;
    return x;
}

/* Per-sample operators; they rely on the locals and parameters of the functions generated below. */
#define op_scale1(x)  block[x] = clip1( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip1( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1) )

/**
 * Generates the in-place and the bidirectional weighting functions for a
 * W x H block.
 *
 * weight_h264_pixelsWxH_c(block, stride, log2_denom, weight, offset):
 *   block[x] = clip1((block[x]*weight + offset*2^log2_denom + round) >> log2_denom)
 *   where round is 2^(log2_denom-1), or 0 when log2_denom is 0.
 *
 * biweight_h264_pixelsWxH_c(dst, src, stride, log2_denom, weightd, weights, offsetd, offsets):
 *   o      = (offsets + offsetd + 1) >> 1
 *   dst[x] = clip1((src[x]*weights + dst[x]*weightd + (2*o + 1)*2^log2_denom) >> (log2_denom+1))
 *
 * stride is the distance in bytes between two rows and is used for both dst and src.
 * The offsets are scaled by multiplication instead of a left shift because
 * they may be negative and left-shifting a negative int is undefined in C.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int x, y; \
    offset *= 1 << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        for(x=0; x<W; x++){ \
            op_scale1(x); \
        } \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
    int x, y; \
    int offset = (offsets + offsetd + 1) >> 1; \
    offset = (offset*2 + 1) * (1 << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        for(x=0; x<W; x++){ \
            op_scale2(x); \
        } \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2452
1457ab52
MN
2453static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2454 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2455 int i;
2456
2457 for(i=0; i<h; i++){
2458 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2459 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2460 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2461 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2462 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2463 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2464 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2465 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2466 dst+=dstStride;
2467 src+=srcStride;
2468 }
2469}
2470
2471static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2472 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2473 int i;
2474
2475 for(i=0; i<w; i++){
2476 const int src_1= src[ -srcStride];
2477 const int src0 = src[0 ];
2478 const int src1 = src[ srcStride];
2479 const int src2 = src[2*srcStride];
2480 const int src3 = src[3*srcStride];
2481 const int src4 = src[4*srcStride];
2482 const int src5 = src[5*srcStride];
2483 const int src6 = src[6*srcStride];
2484 const int src7 = src[7*srcStride];
2485 const int src8 = src[8*srcStride];
2486 const int src9 = src[9*srcStride];
2487 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2488 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2489 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2490 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2491 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2492 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2493 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2494 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2495 src++;
2496 dst++;
2497 }
2498}
2499
/* mc00: plain 8x8 copy */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

/* mc10: horizontally filtered block combined with the source block by put_pixels8_l2() */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}

/* mc20: horizontally filtered block written straight to dst */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

/* mc30: like mc10, but combined with the source block one pixel to the right */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}

/* mc02: vertically filtered block written straight to dst */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

/* mc12: vertical filter of the source combined with the vertical filter of
   the horizontally filtered rows (11 rows are needed, starting one above src) */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[8*11];
    uint8_t v_only[8*8];
    uint8_t hv[8*8];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv, h_rows+8, 8, 8, 8);
    put_pixels8_l2(dst, v_only, hv, stride, 8, 8, 8);
}

/* mc32: like mc12, with the vertical-only part taken one pixel to the right */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[8*11];
    uint8_t v_only[8*8];
    uint8_t hv[8*8];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv, h_rows+8, 8, 8, 8);
    put_pixels8_l2(dst, v_only, hv, stride, 8, 8, 8);
}

/* mc22: horizontal filter followed by vertical filter, written straight to dst */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[8*11];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, h_rows+8, stride, 8, 8);
}
2547
332f9ac4
MN
2548static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2549 int x;
2550 const int strength= ff_h263_loop_filter_strength[qscale];
2551
2552 for(x=0; x<8; x++){
2553 int d1, d2, ad1;
2554 int p0= src[x-2*stride];
2555 int p1= src[x-1*stride];
2556 int p2= src[x+0*stride];
2557 int p3= src[x+1*stride];
2558 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2559
2560 if (d<-2*strength) d1= 0;
2561 else if(d<- strength) d1=-2*strength - d;
2562 else if(d< strength) d1= d;
2563 else if(d< 2*strength) d1= 2*strength - d;
2564 else d1= 0;
2565
2566 p1 += d1;
2567 p2 -= d1;
2568 if(p1&256) p1= ~(p1>>31);
2569 if(p2&256) p2= ~(p2>>31);
2570
2571 src[x-1*stride] = p1;
2572 src[x+0*stride] = p2;
2573
5b5404e3 2574 ad1= ABS(d1)>>1;
332f9ac4
MN
2575
2576 d2= clip((p0-p3)/4, -ad1, ad1);
2577
2578 src[x-2*stride] = p0 - d2;
2579 src[x+ stride] = p3 + d2;
2580 }
2581}
2582
2583static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2584 int y;
2585 const int strength= ff_h263_loop_filter_strength[qscale];
2586
2587 for(y=0; y<8; y++){
2588 int d1, d2, ad1;
2589 int p0= src[y*stride-2];
2590 int p1= src[y*stride-1];
2591 int p2= src[y*stride+0];
2592 int p3= src[y*stride+1];
2593 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2594
2595 if (d<-2*strength) d1= 0;
2596 else if(d<- strength) d1=-2*strength - d;
2597 else if(d< strength) d1= d;
2598 else if(d< 2*strength) d1= 2*strength - d;
2599 else d1= 0;
2600
2601 p1 += d1;
2602 p2 -= d1;
2603 if(p1&256) p1= ~(p1>>31);
2604 if(p2&256) p2= ~(p2>>31);
2605
2606 src[y*stride-1] = p1;
2607 src[y*stride+0] = p2;
2608
2609 ad1= ABS(d1)>>1;
2610
2611 d2= clip((p0-p3)/4, -ad1, ad1);
2612
2613 src[y*stride-2] = p0 - d2;
2614 src[y*stride+1] = p3 + d2;
2615 }
2616}
1457ab52 2617
fdbbf2e0
MN
/**
 * Separable (1,2,1) smoothing of an 8x8 block, done in place.
 * The outermost rows are not filtered vertically and the outermost
 * columns are not filtered horizontally.
 * @param src    top left sample of the block
 * @param stride distance between two rows of src
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[8*8]; /* result of the vertical pass, scaled by 4 */
    int row, col;

    /* vertical pass */
    for(row=0; row<8; row++){
        const uint8_t *const line= src + row*stride;
        int *const out= vert + row*8;

        for(col=0; col<8; col++){
            if(row==0 || row==7)
                out[col]= 4*line[col];
            else
                out[col]= line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    /* horizontal pass, writes the final samples back */
    for(row=0; row<8; row++){
        uint8_t *const line= src + row*stride;
        const int *const in= vert + row*8;

        line[0]= (in[0] + 2)>>2;
        line[7]= (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            line[col]= (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2644
/**
 * Sum of absolute differences between two blocks, 16 pixels wide and h rows high.
 * @param v         unused
 * @param line_size distance between two rows, the same for both blocks
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2672
/**
 * Sum of absolute differences between pix1 and the avg2() of every pix2
 * sample with its right neighbour, 16 pixels wide and h rows high.
 * Reads pix2[0] .. pix2[16] of every row.
 * @param v unused
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2700
/**
 * Sum of absolute differences between pix1 and the avg2() of every pix2
 * sample with the sample one row below it, 16 pixels wide and h rows high.
 * Reads h+1 rows of pix2.
 * @param v unused
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2730
/**
 * Sum of absolute differences between pix1 and the avg4() of every 2x2
 * neighbourhood of pix2, 16 pixels wide and h rows high.
 * Reads h+1 rows of 17 samples from pix2.
 * @param v unused
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2760
/**
 * Sum of absolute differences between two blocks, 8 pixels wide and h rows high.
 * @param v         unused
 * @param line_size distance between two rows, the same for both blocks
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2780
/**
 * Sum of absolute differences between pix1 and the avg2() of every pix2
 * sample with its right neighbour, 8 pixels wide and h rows high.
 * Reads pix2[0] .. pix2[8] of every row.
 * @param v unused
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2800
/**
 * Sum of absolute differences between pix1 and the avg2() of every pix2
 * sample with the sample one row below it, 8 pixels wide and h rows high.
 * Reads h+1 rows of pix2.
 * @param v unused
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2822
/**
 * Sum of absolute differences between pix1 and the avg4() of every 2x2
 * neighbourhood of pix2, 8 pixels wide and h rows high.
 * Reads h+1 rows of 9 samples from pix2.
 * @param v unused
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2844
d4c5d2ad 2845static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
e6a2ac34
MN
2846 int score1=0;
2847 int score2=0;
2848 int x,y;
d4c5d2ad 2849
e6a2ac34
MN
2850 for(y=0; y<h; y++){
2851 for(x=0; x<16; x++){
2852 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2853 }
2854 if(y+1<h){
2855 for(x=0; x<15; x++){
2856 score2+= ABS( s1[x ] - s1[x +stride]
2857 - s1[x+1] + s1[x+1+stride])
2858 -ABS( s2[x ] - s2[x +stride]
2859 - s2[x+1] + s2[x+1+stride]);
2860 }
2861 }
2862 s1+= stride;
2863 s2+= stride;
2864 }
d4c5d2ad
MN
2865
2866 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2867 else return score1 + ABS(score2)*8;
e6a2ac34
MN
2868}
2869
d4c5d2ad 2870static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
e6a2ac34
MN
2871 int score1=0;
2872 int score2=0;
2873 int x,y;
2874
2875 for(y=0; y<h; y++){
2876 for(x=0; x<8; x++){
2877 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2878 }
2879 if(y+1<h){
2880 for(x=0; x<7; x++){
2881 score2+= ABS( s1[x ] - s1[x +stride]
2882 - s1[x+1] + s1[x+1+stride])
2883 -ABS( s2[x ] - s2[x +stride]
2884 - s2[x+1] + s2[x+1+stride]);
2885 }
2886 }
2887 s1+= stride;
2888 s2+= stride;
2889 }
2890
d4c5d2ad
MN
2891 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2892 else return score1 + ABS(score2)*8;
e6a2ac34
MN
2893}
2894
364a1797
MN
2895static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2896 int i;
2897 unsigned int sum=0;
2898
2899 for(i=0; i<8*8; i++){
2900 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2901 int w= weight[i];
2902 b>>= RECON_SHIFT;
2903 assert(-512<b && b<512);
2904
2905 sum += (w*b)*(w*b)>>4;
2906 }
2907 return sum>>2;
2908}
2909
2910static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2911 int i;
2912
2913 for(i=0; i<8*8; i++){
2914 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2915 }
2916}
2917
a9badb51
MN
2918/**
2919 * permutes an 8x8 block.
2a5700de 2920 * @param block the block which will be permuted according to the given permutation vector
a9badb51
MN
2921 * @param permutation the permutation vector
2922 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2a5700de
MN
2923 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2924 * (inverse) permutated to scantable order!
a9badb51 2925 */
0c1a9eda 2926void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
d962f6fd 2927{
7801d21d 2928 int i;
477ab036 2929 DCTELEM temp[64];
7801d21d
MN
2930
2931 if(last<=0) return;
9a7b310d 2932 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
d962f6fd 2933
7801d21d
MN
2934 for(i=0; i<=last; i++){
2935 const int j= scantable[i];
2936 temp[j]= block[j];
2937 block[j]=0;
2938 }
2939
2940 for(i=0; i<=last; i++){
2941 const int j= scantable[i];
2942 const int perm_j= permutation[j];
2943 block[perm_j]= temp[j];
2944 }
d962f6fd 2945}
e0eac44e 2946
622348f9
MN
/**
 * Comparison function that ignores all of its arguments.
 * @return always 0
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
2950
2951void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2952 int i;
2953
2954 memset(cmp, 0, sizeof(void*)*5);
2955
2956 for(i=0; i<5; i++){
2957 switch(type&0xFF){
2958 case FF_CMP_SAD:
2959 cmp[i]= c->sad[i];
2960 break;
2961 case FF_CMP_SATD:
2962 cmp[i]= c->hadamard8_diff[i];
2963 break;
2964 case FF_CMP_SSE:
2965 cmp[i]= c->sse[i];
2966 break;
2967 case FF_CMP_DCT:
2968 cmp[i]= c->dct_sad[i];
2969 break;
0fd6aea1
MN
2970 case FF_CMP_DCTMAX:
2971 cmp[i]= c->dct_max[i];
2972 break;
622348f9
MN
2973 case FF_CMP_PSNR:
2974 cmp[i]= c->quant_psnr[i];
2975 break;
2976 case FF_CMP_BIT:
2977 cmp[i]= c->bit[i];
2978 break;
2979 case FF_CMP_RD:
2980 cmp[i]= c->rd[i];
2981 break;
2982 case FF_CMP_VSAD:
2983 cmp[i]= c->vsad[i];
2984 break;
2985 case FF_CMP_VSSE:
2986 cmp[i]= c->vsse[i];
2987 break;
2988 case FF_CMP_ZERO:
2989 cmp[i]= zero_cmp;
2990 break;
e6a2ac34
MN
2991 case FF_CMP_NSSE:
2992 cmp[i]= c->nsse[i];
2993 break;
26efc54e
MN
2994 case FF_CMP_W53:
2995 cmp[i]= c->w53[i];
2996 break;
2997 case FF_CMP_W97:
2998 cmp[i]= c->w97[i];
2999 break;
622348f9
MN
3000 default:
3001 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3002 }
3003 }
3004}
3005
2a5700de
MN
3006/**
3007 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3008 */
eb4b3dd3 3009static void clear_blocks_c(DCTELEM *blocks)
649c00c9
MN
3010{
3011 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3012}
3013
11f18faf
MN
/**
 * Adds src to dst byte by byte; every sum wraps around modulo 256.
 * @param w number of bytes to process
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int pos;

    for(pos=0; pos<w; pos++)
        dst[pos] += src[pos];
}
3029
/**
 * Stores src1 - src2 into dst byte by byte; every difference wraps
 * around modulo 256.
 * @param w number of bytes to process
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int pos;

    for(pos=0; pos<w; pos++)
        dst[pos] = src1[pos]-src2[pos];
}
3045
84705403
MN
/**
 * Writes src2 minus its predictor to dst. The predictor of a sample is
 * mid_pred() of the previous src2 sample, the src1 sample at the same
 * position, and (previous src2 + src1 - previous src1) & 0xFF.
 * @param left     in: src2 sample preceding the line, out: last src2 sample
 * @param left_top in: src1 sample preceding the line, out: last src1 sample
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev= *left;
    uint8_t prev_top= *left_top;
    int i;

    for(i=0; i<w; i++){
        const int gradient= (prev + src1[i] - prev_top)&0xFF;
        const int pred= mid_pred(prev, src1[i], gradient);

        prev_top= src1[i];
        prev= src2[i];
        dst[i]= prev - pred;
    }

    *left= prev;
    *left_top= prev_top;
}
3063
1457ab52
MN
/*
 * Butterfly helpers for the Hadamard transforms below. The statement
 * macros are wrapped in do{}while(0) so that each invocation, followed by
 * its semicolon, is exactly one statement and stays safe in an unbraced
 * if/else.
 */

/* o1 = i1 + i2, o2 = i1 - i2; the inputs are evaluated twice */
#define BUTTERFLY2(o1,o2,i1,i2) \
do{\
    o1= (i1)+(i2);\
    o2= (i1)-(i2);\
}while(0)

/* in place butterfly: x = x + y, y = x - y (using the old x and y) */
#define BUTTERFLY1(x,y) \
do{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}while(0)

/* |x + y| + |x - y| */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3078
/**
 * Sum of the absolute values of the 8x8 Hadamard transform of src - dst.
 * @param s unused
 * @param h must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* transform every row of the difference */
    for(i=0; i<8; i++){
        int *const row= temp + 8*i;
        const uint8_t *const sp= src + stride*i;
        const uint8_t *const dp= dst + stride*i;

        BUTTERFLY2(row[0], row[1], sp[0]-dp[0], sp[1]-dp[1]);
        BUTTERFLY2(row[2], row[3], sp[2]-dp[2], sp[3]-dp[3]);
        BUTTERFLY2(row[4], row[5], sp[4]-dp[4], sp[5]-dp[5]);
        BUTTERFLY2(row[6], row[7], sp[6]-dp[6], sp[7]-dp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* transform every column; the last stage is folded into the summation */
    for(i=0; i<8; i++){
        int *const col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    return sum;
}
3130
/**
 * Sum of the absolute values of the 8x8 Hadamard transform of src, with
 * the absolute value of the first coefficient (the mean) subtracted.
 * @param s     unused
 * @param dummy unused
 * @param h     must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* transform every row */
    for(i=0; i<8; i++){
        int *const row= temp + 8*i;
        const uint8_t *const sp= src + stride*i;

        BUTTERFLY2(row[0], row[1], sp[0], sp[1]);
        BUTTERFLY2(row[2], row[3], sp[2], sp[3]);
        BUTTERFLY2(row[4], row[5], sp[4], sp[5]);
        BUTTERFLY2(row[6], row[7], sp[6], sp[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* transform every column; the last stage is folded into the summation */
    for(i=0; i<8; i++){
        int *const col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3178
bb198e19 3179static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 3180 MpegEncContext * const s= (MpegEncContext *)c;
76fbb024
MN
3181 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3182 DCTELEM * const temp= (DCTELEM*)aligned_temp;
1457ab52 3183 int sum=0, i;
bb198e19
MN
3184
3185 assert(h==8);
1457ab52
MN
3186
3187 s->dsp.diff_pixels(temp, src1, src2, stride);
b0368839 3188 s->dsp.fdct(temp);
1457ab52
MN
3189
3190 for(i=0; i<64; i++)
3191 sum+= ABS(temp[i]);
3192
3193 return sum;
3194}
3195
0fd6aea1
MN
3196static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3197 MpegEncContext * const s= (MpegEncContext *)c;
3198 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3199 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3200 int sum=0, i;
3201
3202 assert(h==8);
3203
3204 s->dsp.diff_pixels(temp, src1, src2, stride);
3205 s->dsp.fdct(temp);
3206
3207 for(i=0; i<64; i++)
3208 sum= FFMAX(sum, ABS(temp[i]));
3209
3210 return sum;
3211}
3212
0e15384d 3213void simple_idct(DCTELEM *block); //FIXME
1457ab52 3214
bb198e19 3215static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 3216 MpegEncContext * const s= (MpegEncContext *)c;
76fbb024
MN
3217 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
3218 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3219 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
1457ab52
MN
3220 int sum=0, i;
3221
bb198e19 3222 assert(h==8);
1457ab52
MN
3223 s->mb_intra=0;
3224
3225 s->dsp.diff_pixels(temp, src1, src2, stride);
3226
3227 memcpy(bak, temp, 64*sizeof(DCTELEM));
3228
67725183 3229 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
d50635cd 3230 s->dct_unquantize_inter(s, temp, 0, s->qscale);
1457ab52
MN
3231 simple_idct(temp); //FIXME
3232
3233 for(i=0; i<64; i++)
3234 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3235
3236 return sum;
3237}
3238
bb198e19 3239static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 3240 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 3241 const uint8_t *scantable= s->intra_scantable.permutated;
76fbb024
MN
3242 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3243 uint64_t __align8 aligned_bak[stride];
3244 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3245 uint8_t * const bak= (uint8_t*)aligned_bak;
3a87ac94
MN
3246 int i, last, run, bits, level, distoration, start_i;
3247 const int esc_length= s->ac_esc_length;
3248 uint8_t * length;
3249 uint8_t * last_length;
67725183 3250
bb198e19
MN
3251 assert(h==8);
3252
67725183
MN
3253 for(i=0; i<8; i++){
3254 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3255 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3256 }
3a87ac94 3257
67725183
MN
3258 s->dsp.diff_pixels(temp, src1, src2, stride);
3259
3260 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3261
3262 bits=0;
3a87ac94
MN
3263
3264 if (s->mb_intra) {
67725183 3265 start_i = 1;
3a87ac94
MN
3266 length = s->intra_ac_vlc_length;
3267 last_length= s->intra_ac_vlc_last_length;
67725183 3268 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
MN
3269 } else {
3270 start_i = 0;
3271 length = s->inter_ac_vlc_length;
3272 last_length= s->inter_ac_vlc_last_length;
3273 }
3a87ac94 3274
67725183 3275 if(last>=start_i){
3a87ac94
MN
3276 run=0;
3277 for(i=start_i; i<last; i++){
3278 int j= scantable[i];
3279 level= temp[j];
3280
3281 if(level){
3282 level+=64;
3283 if((level&(~127)) == 0){
3284 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3285 }else
3286 bits+= esc_length;
3287 run=0;
3288 }else
3289 run++;
3290 }
3291 i= scantable[last];
1d0eab1d 3292
3a87ac94 3293 level= temp[i] + 64;
1d0eab1d
MN
3294
3295 assert(level - 64);
3296
3a87ac94
MN
3297 if((level&(~127)) == 0){
3298 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3299 }else
3300 bits+= esc_length;
3301
67725183
MN
3302 }
3303
3304 if(last>=0){
d50635cd
MN
3305 if(s->mb_intra)
3306 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3307 else
3308 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3a87ac94
MN
3309 }
3310
b0368839 3311 s->dsp.idct_add(bak, stride, temp);
3a87ac94 3312
bb198e19 3313 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3a87ac94 3314
67725183 3315 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3a87ac94
MN
3316}
3317
bb198e19 3318static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 3319 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 3320 const uint8_t *scantable= s->intra_scantable.permutated;
76fbb024
MN
3321 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3322 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3a87ac94
MN
3323 int i, last, run, bits, level, start_i;
3324 const int esc_length= s->ac_esc_length;
3325 uint8_t * length;
3326 uint8_t * last_length;
bb198e19
MN
3327
3328 assert(h==8);
67725183
MN
3329
3330 s->dsp.diff_pixels(temp, src1, src2, stride);
3a87ac94 3331
67725183
MN
3332 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3333
3334 bits=0;
3a87ac94
MN
3335
3336 if (s->mb_intra) {
67725183 3337 start_i = 1;
3a87ac94
MN
3338 length = s->intra_ac_vlc_length;
3339 last_length= s->intra_ac_vlc_last_length;
67725183 3340 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
MN
3341 } else {
3342 start_i = 0;
3343 length = s->inter_ac_vlc_length;
3344 last_length= s->inter_ac_vlc_last_length;
3345 }
3a87ac94 3346
67725183 3347 if(last>=start_i){
3a87ac94
MN
3348 run=0;
3349 for(i=start_i; i<last; i++){
3350 int j= scantable[i];
3351 level= temp[j];
3352
3353 if(level){
3354 level+=64;
3355 if((level&(~127)) == 0){
3356 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3357 }else
3358 bits+= esc_length;
3359 run=0;
3360 }else
3361 run++;
3362 }
3363 i= scantable[last];
67725183
MN
3364
3365 level= temp[i] + 64;
3a87ac94 3366
67725183 3367 assert(level - 64);
3a87ac94 3368
3a87ac94
MN
3369 if((level&(~127)) == 0){
3370 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3371 }else
3372 bits+= esc_length;
3373 }
3374
3375 return bits;
3376}
3377
622348f9
MN
/**
 * Sum of absolute differences between vertically adjacent samples of a
 * 16 pixel wide block with h rows.
 * @param c     unused
 * @param dummy unused
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int row, col;

    for(row=1; row<h; row++){
        const uint8_t *const below= s + stride;

        for(col=0; col<16; col++)
            score+= ABS(s[col] - below[col]);
        s+= stride;
    }

    return score;
}
3392
/**
 * Sum of absolute vertical differences of s1 - s2 over a 16 pixel wide
 * block with h rows.
 * @param c unused
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++)
            score+= ABS(s1[col] - s2[col] - s1[col+stride] + s2[col+stride]);
        s1+= stride;
        s2+= stride;
    }

    return score;
}
3407
#define SQ(a) ((a)*(a))

/**
 * Sum of squared differences between vertically adjacent samples of a
 * 16 pixel wide block with h rows.
 * @param c     unused
 * @param dummy unused
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int row, col;

    for(row=1; row<h; row++){
        const uint8_t *const below= s + stride;

        for(col=0; col<16; col++)
            score+= SQ(s[col] - below[col]);
        s+= stride;
    }

    return score;
}
3423
/**
 * Sum of squared vertical differences of s1 - s2 over a 16 pixel wide
 * block with h rows.
 * @param c unused
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++)
            score+= SQ(s1[col] - s2[col] - s1[col+stride] + s2[col+stride]);
        s1+= stride;
        s2+= stride;
    }

    return score;
}
3438
/*
 * Each invocation pairs an 8x8 comparison function (first argument) with the
 * name of its 16-pixel counterpart (second argument). WARPER8_16_SQ is
 * defined outside this part of the file; presumably it emits the second
 * function as a wrapper around the first -- confirm against the macro
 * definition. The 16-pixel names listed here are the ones dsputil_init()
 * stores in the DSPContext comparison tables.
 */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
1457ab52 3446
b0368839
MN
/* XXX: these wrapper functions should be removed as soon as all IDCTs
   have been converted */
3449static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3450{
3451 j_rev_dct (block);
3452 put_pixels_clamped_c(block, dest, line_size);
3453}
3454static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3455{
3456 j_rev_dct (block);
3457 add_pixels_clamped_c(block, dest, line_size);
3458}
3459
178fcca8
MN
3460static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3461{
3462 j_rev_dct4 (block);
3463 put_pixels_clamped4_c(block, dest, line_size);
3464}
3465static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3466{
3467 j_rev_dct4 (block);
3468 add_pixels_clamped4_c(block, dest, line_size);
3469}
3470
9ca358b9
MN
3471static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3472{
3473 j_rev_dct2 (block);
3474 put_pixels_clamped2_c(block, dest, line_size);
3475}
3476static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3477{
3478 j_rev_dct2 (block);
3479 add_pixels_clamped2_c(block, dest, line_size);
3480}
3481
1aa8c57b
MN
3482static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3483{
3484 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3485
3486 dest[0] = cm[(block[0] + 4)>>3];
3487}
3488static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3489{
3490 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3491
3492 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3493}
3494
59cf08ce
FB
3495/* init static data */
3496void dsputil_static_init(void)
e0eac44e 3497{
d2975f8d 3498 int i;
e0eac44e 3499
59cf08ce
FB
3500 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3501 for(i=0;i<MAX_NEG_CROP;i++) {
3502 cropTbl[i] = 0;
3503 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3504 }
3505
3506 for(i=0;i<512;i++) {
3507 squareTbl[i] = (i - 256) * (i - 256);
3508 }
3509
3510 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3511}
92ddb692 3512
92ddb692 3513
59cf08ce
FB
3514void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3515{
3516 int i;
de6d9b64 3517
b0368839 3518#ifdef CONFIG_ENCODERS
10acc479 3519 if(avctx->dct_algo==FF_DCT_FASTINT) {
b0368839 3520 c->fdct = fdct_ifast;
48b1f800 3521 c->fdct248 = fdct_ifast248;
10acc479
RS
3522 }
3523 else if(avctx->dct_algo==FF_DCT_FAAN) {
65e4c8c9 3524 c->fdct = ff_faandct;
48b1f800 3525 c->fdct248 = ff_faandct248;
10acc479
RS
3526 }
3527 else {
b0368839 3528 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
10acc479
RS
3529 c->fdct248 = ff_fdct248_islow;
3530 }
b0368839
MN
3531#endif //CONFIG_ENCODERS
3532
178fcca8 3533 if(avctx->lowres==1){
0fa8158d
MN
3534 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3535 c->idct_put= ff_jref_idct4_put;
3536 c->idct_add= ff_jref_idct4_add;
3537 }else{
3538 c->idct_put= ff_h264_lowres_idct_put_c;
3539 c->idct_add= ff_h264_lowres_idct_add_c;
3540 }
178fcca8 3541 c->idct = j_rev_dct4;
b0368839 3542 c->idct_permutation_type= FF_NO_IDCT_PERM;
9ca358b9
MN
3543 }else if(avctx->lowres==2){
3544 c->idct_put= ff_jref_idct2_put;
3545 c->idct_add= ff_jref_idct2_add;
3546 c->idct = j_rev_dct2;
3547 c->idct_permutation_type= FF_NO_IDCT_PERM;
1aa8c57b
MN
3548 }else if(avctx->lowres==3){
3549 c->idct_put= ff_jref_idct1_put;
3550 c->idct_add= ff_jref_idct1_add;
3551 c->idct = j_rev_dct1;
3552 c->idct_permutation_type= FF_NO_IDCT_PERM;
178fcca8
MN
3553 }else{
3554 if(avctx->idct_algo==FF_IDCT_INT){
3555 c->idct_put= ff_jref_idct_put;
3556 c->idct_add= ff_jref_idct_add;
3557 c->idct = j_rev_dct;
3558 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3559 }else{ //accurate/default
3560 c->idct_put= simple_idct_put;
3561 c->idct_add= simple_idct_add;
3562 c->idct = simple_idct;
3563 c->idct_permutation_type= FF_NO_IDCT_PERM;
3564 }
b0368839
MN
3565 }
3566
0fa8158d
MN
3567 c->h264_idct_add= ff_h264_idct_add_c;
3568
44cb64ee
MM
3569 /* VP3 DSP support */
3570 c->vp3_dsp_init = vp3_dsp_init_c;
116824d0 3571 c->vp3_idct = vp3_idct_c;
44cb64ee 3572
eb4b3dd3
ZK
3573 c->get_pixels = get_pixels_c;
3574 c->diff_pixels = diff_pixels_c;
3575 c->put_pixels_clamped = put_pixels_clamped_c;
f9ed9d85 3576 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
eb4b3dd3
ZK
3577 c->add_pixels_clamped = add_pixels_clamped_c;
3578 c->gmc1 = gmc1_c;
3579 c->gmc = gmc_c;
3580 c->clear_blocks = clear_blocks_c;
3581 c->pix_sum = pix_sum_c;
3582 c->pix_norm1 = pix_norm1_c;
3583
45553457 3584 /* TODO [0] 16 [1] 8 */
bb198e19
MN
3585 c->pix_abs[0][0] = pix_abs16_c;
3586 c->pix_abs[0][1] = pix_abs16_x2_c;
3587 c->pix_abs[0][2] = pix_abs16_y2_c;
3588 c->pix_abs[0][3] = pix_abs16_xy2_c;
3589 c->pix_abs[1][0] = pix_abs8_c;
3590 c->pix_abs[1][1] = pix_abs8_x2_c;
3591 c->pix_abs[1][2] = pix_abs8_y2_c;
3592 c->pix_abs[1][3] = pix_abs8_xy2_c;
eb4b3dd3 3593
45553457
ZK
3594#define dspfunc(PFX, IDX, NUM) \
3595 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3596 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3597 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3598 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3599
3600 dspfunc(put, 0, 16);
3601 dspfunc(put_no_rnd, 0, 16);
3602 dspfunc(put, 1, 8);
3603 dspfunc(put_no_rnd, 1, 8);
669ac79c
MN
3604 dspfunc(put, 2, 4);
3605 dspfunc(put, 3, 2);
45553457
ZK
3606
3607 dspfunc(avg, 0, 16);
3608 dspfunc(avg_no_rnd, 0, 16);
3609 dspfunc(avg, 1, 8);
3610 dspfunc(avg_no_rnd, 1, 8);
da3b9756
MM
3611 dspfunc(avg, 2, 4);
3612 dspfunc(avg, 3, 2);
45553457
ZK
3613#undef dspfunc
3614
c0a0170c
MN
3615 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3616 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3617
669ac79c
MN
3618 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3619 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3620 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3621 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3622 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3623 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3624 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3625 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3626 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3627
da3b9756
MM
3628 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3629 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3630 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3631 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3632 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3633 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3634 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3635 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3636 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3637
45553457
ZK
3638#define dspfunc(PFX, IDX, NUM) \
3639 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3640 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3641 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3642 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3643 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3644 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3645 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3646 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3647 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3648 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3649 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3650 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3651 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3652 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3653 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3654 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3655
3656 dspfunc(put_qpel, 0, 16);
3657 dspfunc(put_no_rnd_qpel, 0, 16);
3658
3659 dspfunc(avg_qpel, 0, 16);
3660 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3661
3662 dspfunc(put_qpel, 1, 8);
3663 dspfunc(put_no_rnd_qpel, 1, 8);
3664
3665 dspfunc(avg_qpel, 1, 8);
3666 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
0da71265
MN
3667
3668 dspfunc(put_h264_qpel, 0, 16);
3669 dspfunc(put_h264_qpel, 1, 8);
3670 dspfunc(put_h264_qpel, 2, 4);
3671 dspfunc(avg_h264_qpel, 0, 16);
3672 dspfunc(avg_h264_qpel, 1, 8);
3673 dspfunc(avg_h264_qpel, 2, 4);
3674
45553457 3675#undef dspfunc
0da71265
MN
3676 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3677 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3678 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3679 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3680 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3681 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
c9a2ebc4 3682
9f2d1b4f
LM
3683 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3684 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3685 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3686 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3687 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3688 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3689 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3690 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3691 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3692 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3693 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3694 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3695 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3696 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3697 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3698 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3699 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3700 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3701 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3702 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3703
1457ab52
MN
3704 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3705 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3706 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3707 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3708 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3709 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3710 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3711 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
669ac79c 3712
bb198e19
MN
3713#define SET_CMP_FUNC(name) \
3714 c->name[0]= name ## 16_c;\
3715 c->name[1]= name ## 8x8_c;
3716
3717 SET_CMP_FUNC(hadamard8_diff)
622348f9 3718 c->hadamard8_diff[4]= hadamard8_intra16_c;
bb198e19 3719 SET_CMP_FUNC(dct_sad)
0fd6aea1 3720 SET_CMP_FUNC(dct_max)
bb198e19
MN
3721 c->sad[0]= pix_abs16_c;
3722 c->sad[1]= pix_abs8_c;
3723 c->sse[0]= sse16_c;
3724 c->sse[1]= sse8_c;
26efc54e 3725 c->sse[2]= sse4_c;
bb198e19
MN
3726 SET_CMP_FUNC(quant_psnr)
3727 SET_CMP_FUNC(rd)
3728 SET_CMP_FUNC(bit)
622348f9
MN
3729 c->vsad[0]= vsad16_c;
3730 c->vsad[4]= vsad_intra16_c;
3731 c->vsse[0]= vsse16_c;
3732 c->vsse[4]= vsse_intra16_c;
e6a2ac34
MN
3733 c->nsse[0]= nsse16_c;
3734 c->nsse[1]= nsse8_c;
26efc54e
MN
3735 c->w53[0]= w53_16_c;
3736 c->w53[1]= w53_8_c;
3737 c->w97[0]= w97_16_c;
3738 c->w97[1]= w97_8_c;
3739
11f18faf
MN
3740 c->add_bytes= add_bytes_c;