avoid unneeded clear_blocks()
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
8f2ab833 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
de6d9b64 5 *
ff4ec49e
FB
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
de6d9b64 10 *
ff4ec49e 11 * This library is distributed in the hope that it will be useful,
de6d9b64 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
de6d9b64 15 *
ff4ec49e
FB
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
7ff037e9 19 *
59fe111e 20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
de6d9b64 21 */
983e3246
MN
22
23/**
24 * @file dsputil.c
25 * DSP utils
26 */
27
de6d9b64
FB
28#include "avcodec.h"
29#include "dsputil.h"
1457ab52 30#include "mpegvideo.h"
b0368839 31#include "simple_idct.h"
65e4c8c9 32#include "faandct.h"
5596c60c 33
88730be6
MR
34/* snow.c */
35void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
36
8b69867f
MN
37uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
38uint32_t squareTbl[512] = {0, };
de6d9b64 39
/**
 * Zigzag scan order for an 8x8 block.
 * The 64 entries contain every index 0..63 exactly once; entry n is
 * presumably the raster position (row * 8 + column) of the n-th coefficient
 * in scan order.
 */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
50
10acc479
RS
/**
 * Zigzag scan used together with the 2-4-8 IDCT.
 * Unlike the layout given in the specification, the two fields are
 * interleaved here. The table holds every index 0..63 exactly once.
 */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
63
2f349de2 64/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
8b69867f 65uint16_t __align8 inv_zigzag_direct16[64] = {0, };
2f349de2 66
/**
 * Alternate (horizontal) coefficient scan order for an 8x8 block.
 * Contains every index 0..63 exactly once.
 */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
77
/**
 * Alternate (vertical) coefficient scan order for an 8x8 block.
 * Contains every index 0..63 exactly once.
 */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
88
/**
 * Fixed-point reciprocals for division by a small constant.
 *
 * For every 0 <= a <= 65536 and 2 <= b <= 255 the 64-bit product
 * (a * inverse[b]) >> 32 equals a / b. Entries 0 and 1 lie outside that
 * guarantee (inverse[0] is 0, inverse[1] is 2^32 - 1).
 */
const uint32_t inverse[256] = {
             0, 4294967295U, 2147483648U,  1431655766,  1073741824,   858993460,   715827883,   613566757,
     536870912,   477218589,   429496730,   390451573,   357913942,   330382100,   306783379,   286331154,
     268435456,   252645136,   238609295,   226050911,   214748365,   204522253,   195225787,   186737709,
     178956971,   171798692,   165191050,   159072863,   153391690,   148102321,   143165577,   138547333,
     134217728,   130150525,   126322568,   122713352,   119304648,   116080198,   113025456,   110127367,
     107374183,   104755300,   102261127,    99882961,    97612894,    95443718,    93368855,    91382283,
      89478486,    87652394,    85899346,    84215046,    82595525,    81037119,    79536432,    78090315,
      76695845,    75350304,    74051161,    72796056,    71582789,    70409300,    69273667,    68174085,
      67108864,    66076420,    65075263,    64103990,    63161284,    62245903,    61356676,    60492498,
      59652324,    58835169,    58040099,    57266231,    56512728,    55778797,    55063684,    54366675,
      53687092,    53024288,    52377650,    51746594,    51130564,    50529028,    49941481,    49367441,
      48806447,    48258060,    47721859,    47197443,    46684428,    46182445,    45691142,    45210183,
      44739243,    44278014,    43826197,    43383509,    42949673,    42524429,    42107523,    41698712,
      41297763,    40904451,    40518560,    40139882,    39768216,    39403370,    39045158,    38693400,
      38347923,    38008561,    37675152,    37347542,    37025581,    36709123,    36398028,    36092163,
      35791395,    35495598,    35204650,    34918434,    34636834,    34359739,    34087043,    33818641,
      33554432,    33294321,    33038210,    32786010,    32537632,    32292988,    32051995,    31814573,
      31580642,    31350127,    31122952,    30899046,    30678338,    30460761,    30246249,    30034737,
      29826162,    29620465,    29417585,    29217465,    29020050,    28825284,    28633116,    28443493,
      28256364,    28071682,    27889399,    27709467,    27531842,    27356480,    27183338,    27012373,
      26843546,    26676816,    26512144,    26349493,    26188825,    26030105,    25873297,    25718368,
      25565282,    25414008,    25264514,    25116768,    24970741,    24826401,    24683721,    24542671,
      24403224,    24265352,    24129030,    23994231,    23860930,    23729102,    23598722,    23469767,
      23342214,    23216040,    23091223,    22967740,    22845571,    22724695,    22605092,    22486740,
      22369622,    22253717,    22139007,    22025474,    21913099,    21801865,    21691755,    21582751,
      21474837,    21367997,    21262215,    21157475,    21053762,    20951060,    20849356,    20748635,
      20648882,    20550083,    20452226,    20355296,    20259280,    20164166,    20069941,    19976593,
      19884108,    19792477,    19701685,    19611723,    19522579,    19434242,    19346700,    19259944,
      19173962,    19088744,    19004281,    18920561,    18837576,    18755316,    18673771,    18592933,
      18512791,    18433337,    18354562,    18276457,    18199014,    18122225,    18046082,    17970575,
      17895698,    17821442,    17747799,    17674763,    17602325,    17530479,    17459217,    17388532,
      17318417,    17248865,    17179870,    17111424,    17043522,    16976156,    16909321,    16843010,
};
124
b0368839
MN
/**
 * Coefficient input permutation expected by the simple_idct_mmx
 * implementation. Holds every index 0x00..0x3F exactly once.
 */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
136
/**
 * Sum all samples of a 16x16 block.
 *
 * @param pix       top-left sample of the block
 * @param line_size offset between the starts of two consecutive rows
 * @return sum of the 256 sample values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        const uint8_t *line = pix + row * line_size;

        for (col = 0; col < 16; col++)
            total += line[col];
    }
    return total;
}
158
0c1a9eda 159static int pix_norm1_c(uint8_t * pix, int line_size)
3aa102be
MN
160{
161 int s, i, j;
0c1a9eda 162 uint32_t *sq = squareTbl + 256;
3aa102be
MN
163
164 s = 0;
165 for (i = 0; i < 16; i++) {
166 for (j = 0; j < 16; j += 8) {
2a006cd3 167#if 0
3aa102be
MN
168 s += sq[pix[0]];
169 s += sq[pix[1]];
170 s += sq[pix[2]];
171 s += sq[pix[3]];
172 s += sq[pix[4]];
173 s += sq[pix[5]];
174 s += sq[pix[6]];
175 s += sq[pix[7]];
2a006cd3
FL
176#else
177#if LONG_MAX > 2147483647
178 register uint64_t x=*(uint64_t*)pix;
179 s += sq[x&0xff];
180 s += sq[(x>>8)&0xff];
181 s += sq[(x>>16)&0xff];
182 s += sq[(x>>24)&0xff];
183 s += sq[(x>>32)&0xff];
184 s += sq[(x>>40)&0xff];
185 s += sq[(x>>48)&0xff];
186 s += sq[(x>>56)&0xff];
187#else
188 register uint32_t x=*(uint32_t*)pix;
189 s += sq[x&0xff];
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 x=*(uint32_t*)(pix+4);
194 s += sq[x&0xff];
195 s += sq[(x>>8)&0xff];
196 s += sq[(x>>16)&0xff];
197 s += sq[(x>>24)&0xff];
198#endif
199#endif
3aa102be
MN
200 pix += 8;
201 }
202 pix += line_size - 16;
203 }
204 return s;
205}
206
3d2e8cce
MN
/**
 * Byte-swap an array of 32-bit words.
 *
 * @param dst destination array, at least w words
 * @param src source array, at least w words
 * @param w   number of 32-bit words to process
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int pos = 0;

    /* bulk part: groups of eight words */
    while (pos + 8 <= w) {
        int k;

        for (k = 0; k < 8; k++)
            dst[pos + k] = bswap_32(src[pos + k]);
        pos += 8;
    }
    /* remaining words, one at a time */
    while (pos < w) {
        dst[pos] = bswap_32(src[pos]);
        pos++;
    }
}
3aa102be 224
26efc54e
MN
225static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
226{
227 int s, i;
228 uint32_t *sq = squareTbl + 256;
229
230 s = 0;
231 for (i = 0; i < h; i++) {
232 s += sq[pix1[0] - pix2[0]];
233 s += sq[pix1[1] - pix2[1]];
234 s += sq[pix1[2] - pix2[2]];
235 s += sq[pix1[3] - pix2[3]];
236 pix1 += line_size;
237 pix2 += line_size;
238 }
239 return s;
240}
241
bb198e19 242static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
1457ab52
MN
243{
244 int s, i;
0c1a9eda 245 uint32_t *sq = squareTbl + 256;
1457ab52
MN
246
247 s = 0;
bb198e19 248 for (i = 0; i < h; i++) {
1457ab52
MN
249 s += sq[pix1[0] - pix2[0]];
250 s += sq[pix1[1] - pix2[1]];
251 s += sq[pix1[2] - pix2[2]];
252 s += sq[pix1[3] - pix2[3]];
253 s += sq[pix1[4] - pix2[4]];
254 s += sq[pix1[5] - pix2[5]];
255 s += sq[pix1[6] - pix2[6]];
256 s += sq[pix1[7] - pix2[7]];
257 pix1 += line_size;
258 pix2 += line_size;
259 }
260 return s;
261}
262
bb198e19 263static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
9c76bd48 264{
6b026927
FH
265 int s, i;
266 uint32_t *sq = squareTbl + 256;
9c76bd48
BF
267
268 s = 0;
bb198e19 269 for (i = 0; i < h; i++) {
6b026927
FH
270 s += sq[pix1[ 0] - pix2[ 0]];
271 s += sq[pix1[ 1] - pix2[ 1]];
272 s += sq[pix1[ 2] - pix2[ 2]];
273 s += sq[pix1[ 3] - pix2[ 3]];
274 s += sq[pix1[ 4] - pix2[ 4]];
275 s += sq[pix1[ 5] - pix2[ 5]];
276 s += sq[pix1[ 6] - pix2[ 6]];
277 s += sq[pix1[ 7] - pix2[ 7]];
278 s += sq[pix1[ 8] - pix2[ 8]];
279 s += sq[pix1[ 9] - pix2[ 9]];
280 s += sq[pix1[10] - pix2[10]];
281 s += sq[pix1[11] - pix2[11]];
282 s += sq[pix1[12] - pix2[12]];
283 s += sq[pix1[13] - pix2[13]];
284 s += sq[pix1[14] - pix2[14]];
285 s += sq[pix1[15] - pix2[15]];
2a006cd3 286
6b026927
FH
287 pix1 += line_size;
288 pix2 += line_size;
9c76bd48
BF
289 }
290 return s;
291}
292
26efc54e
MN
293
/**
 * Wavelet-domain comparison of two blocks.
 *
 * The sample differences are scaled by 16, stored in a scratch buffer with
 * a row stride of 16, transformed in place by ff_spatial_dwt() and the
 * absolute values of the w x h transformed region are summed.
 *
 * The scaling uses a multiplication instead of "<< 4": the difference may
 * be negative and left-shifting a negative int is undefined behaviour.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size row offset, shared by both blocks
 * @param w         block width; handled in groups of four columns and
 *                  limited to 16 by the scratch buffer (8 selects three
 *                  decomposition levels, anything else four)
 * @param h         block height; limited to 16 by the scratch buffer
 * @param type      transform selector handed to ff_spatial_dwt()
 * @return sum of absolute transformed values, shifted right by 2
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[16*16];
#if 0
    /* disabled experiment: per-subband weighting of the coefficients */
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            //8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            //16x16 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {//FIXME 5/3
            //8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            //16x16 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };
#endif

    /* scaled differences into the 16-stride scratch buffer */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0]) * 16;
            tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1]) * 16;
            tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2]) * 16;
            tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3]) * 16;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);

    s=0;
#if 0
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int sx= (ori&1) ? 1<<level: 0;
            int stride= 16<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;
            int size= 1<<level;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += ABS(v);
                }
            }
        }
    }
#endif
    /* unweighted sum of absolute transformed values */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            s+= ABS(tmp[16*i+j+0]);
            s+= ABS(tmp[16*i+j+1]);
            s+= ABS(tmp[16*i+j+2]);
            s+= ABS(tmp[16*i+j+3]);
        }
    }
    assert(s>=0);

    return s>>2;
}
373
/** 8-wide wavelet comparison: forwards to w_c() with w = 8 and type = 1
 *  (presumably the 5/3 wavelet, going by the name). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 8;
    const int dwt_type = 1;

    return w_c(v, pix1, pix2, line_size, width, h, dwt_type);
}
377
/** 8-wide wavelet comparison: forwards to w_c() with w = 8 and type = 0
 *  (presumably the 9/7 wavelet, going by the name). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 8;
    const int dwt_type = 0;

    return w_c(v, pix1, pix2, line_size, width, h, dwt_type);
}
381
/** 16-wide wavelet comparison: forwards to w_c() with w = 16 and type = 1
 *  (presumably the 5/3 wavelet, going by the name). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 16;
    const int dwt_type = 1;

    return w_c(v, pix1, pix2, line_size, width, h, dwt_type);
}
385
/** 16-wide wavelet comparison: forwards to w_c() with w = 16 and type = 0
 *  (presumably the 9/7 wavelet, going by the name). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    const int width = 16;
    const int dwt_type = 0;

    return w_c(v, pix1, pix2, line_size, width, h, dwt_type);
}
389
0c1a9eda 390static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
de6d9b64 391{
de6d9b64
FB
392 int i;
393
394 /* read the pixels */
de6d9b64 395 for(i=0;i<8;i++) {
c13e1abd
FH
396 block[0] = pixels[0];
397 block[1] = pixels[1];
398 block[2] = pixels[2];
399 block[3] = pixels[3];
400 block[4] = pixels[4];
401 block[5] = pixels[5];
402 block[6] = pixels[6];
403 block[7] = pixels[7];
404 pixels += line_size;
405 block += 8;
de6d9b64
FB
406 }
407}
408
0c1a9eda
ZK
409static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
410 const uint8_t *s2, int stride){
9dbcbd92
MN
411 int i;
412
413 /* read the pixels */
9dbcbd92 414 for(i=0;i<8;i++) {
c13e1abd
FH
415 block[0] = s1[0] - s2[0];
416 block[1] = s1[1] - s2[1];
417 block[2] = s1[2] - s2[2];
418 block[3] = s1[3] - s2[3];
419 block[4] = s1[4] - s2[4];
420 block[5] = s1[5] - s2[5];
421 block[6] = s1[6] - s2[6];
422 block[7] = s1[7] - s2[7];
9dbcbd92
MN
423 s1 += stride;
424 s2 += stride;
c13e1abd 425 block += 8;
9dbcbd92
MN
426 }
427}
428
429
0c1a9eda 430static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
eb4b3dd3 431 int line_size)
de6d9b64 432{
de6d9b64 433 int i;
0c1a9eda 434 uint8_t *cm = cropTbl + MAX_NEG_CROP;
de6d9b64
FB
435
436 /* read the pixels */
de6d9b64 437 for(i=0;i<8;i++) {
c13e1abd
FH
438 pixels[0] = cm[block[0]];
439 pixels[1] = cm[block[1]];
440 pixels[2] = cm[block[2]];
441 pixels[3] = cm[block[3]];
442 pixels[4] = cm[block[4]];
443 pixels[5] = cm[block[5]];
444 pixels[6] = cm[block[6]];
445 pixels[7] = cm[block[7]];
446
447 pixels += line_size;
448 block += 8;
de6d9b64
FB
449 }
450}
451
178fcca8
MN
452static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
453 int line_size)
454{
455 int i;
456 uint8_t *cm = cropTbl + MAX_NEG_CROP;
457
458 /* read the pixels */
459 for(i=0;i<4;i++) {
460 pixels[0] = cm[block[0]];
461 pixels[1] = cm[block[1]];
462 pixels[2] = cm[block[2]];
463 pixels[3] = cm[block[3]];
464
465 pixels += line_size;
466 block += 8;
467 }
468}
469
9ca358b9
MN
470static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
471 int line_size)
472{
473 int i;
474 uint8_t *cm = cropTbl + MAX_NEG_CROP;
475
476 /* read the pixels */
477 for(i=0;i<2;i++) {
478 pixels[0] = cm[block[0]];
479 pixels[1] = cm[block[1]];
480
481 pixels += line_size;
482 block += 8;
483 }
484}
485
f9ed9d85
MM
486static void put_signed_pixels_clamped_c(const DCTELEM *block,
487 uint8_t *restrict pixels,
488 int line_size)
489{
490 int i, j;
491
492 for (i = 0; i < 8; i++) {
493 for (j = 0; j < 8; j++) {
494 if (*block < -128)
495 *pixels = 0;
496 else if (*block > 127)
497 *pixels = 255;
498 else
499 *pixels = (uint8_t)(*block + 128);
500 block++;
501 pixels++;
502 }
503 pixels += (line_size - 8);
504 }
505}
506
0c1a9eda 507static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
c13e1abd 508 int line_size)
de6d9b64 509{
de6d9b64 510 int i;
0c1a9eda 511 uint8_t *cm = cropTbl + MAX_NEG_CROP;
de6d9b64
FB
512
513 /* read the pixels */
de6d9b64 514 for(i=0;i<8;i++) {
c13e1abd
FH
515 pixels[0] = cm[pixels[0] + block[0]];
516 pixels[1] = cm[pixels[1] + block[1]];
517 pixels[2] = cm[pixels[2] + block[2]];
518 pixels[3] = cm[pixels[3] + block[3]];
519 pixels[4] = cm[pixels[4] + block[4]];
520 pixels[5] = cm[pixels[5] + block[5]];
521 pixels[6] = cm[pixels[6] + block[6]];
522 pixels[7] = cm[pixels[7] + block[7]];
523 pixels += line_size;
524 block += 8;
de6d9b64
FB
525 }
526}
178fcca8
MN
527
528static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
529 int line_size)
530{
531 int i;
532 uint8_t *cm = cropTbl + MAX_NEG_CROP;
533
534 /* read the pixels */
535 for(i=0;i<4;i++) {
536 pixels[0] = cm[pixels[0] + block[0]];
537 pixels[1] = cm[pixels[1] + block[1]];
538 pixels[2] = cm[pixels[2] + block[2]];
539 pixels[3] = cm[pixels[3] + block[3]];
540 pixels += line_size;
541 block += 8;
542 }
543}
9ca358b9
MN
544
545static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
546 int line_size)
547{
548 int i;
549 uint8_t *cm = cropTbl + MAX_NEG_CROP;
550
551 /* read the pixels */
552 for(i=0;i<2;i++) {
553 pixels[0] = cm[pixels[0] + block[0]];
554 pixels[1] = cm[pixels[1] + block[1]];
555 pixels += line_size;
556 block += 8;
557 }
558}
59fe111e
MN
559#if 0
560
561#define PIXOP2(OPNAME, OP) \
b3184779 562static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
563{\
564 int i;\
565 for(i=0; i<h; i++){\
566 OP(*((uint64_t*)block), LD64(pixels));\
567 pixels+=line_size;\
568 block +=line_size;\
569 }\
570}\
571\
45553457 572static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
573{\
574 int i;\
575 for(i=0; i<h; i++){\
576 const uint64_t a= LD64(pixels );\
577 const uint64_t b= LD64(pixels+1);\
578 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
579 pixels+=line_size;\
580 block +=line_size;\
581 }\
582}\
583\
45553457 584static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
585{\
586 int i;\
587 for(i=0; i<h; i++){\
588 const uint64_t a= LD64(pixels );\
589 const uint64_t b= LD64(pixels+1);\
590 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
591 pixels+=line_size;\
592 block +=line_size;\
593 }\
594}\
595\
45553457 596static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
597{\
598 int i;\
599 for(i=0; i<h; i++){\
600 const uint64_t a= LD64(pixels );\
601 const uint64_t b= LD64(pixels+line_size);\
602 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
603 pixels+=line_size;\
604 block +=line_size;\
605 }\
606}\
607\
45553457 608static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
609{\
610 int i;\
611 for(i=0; i<h; i++){\
612 const uint64_t a= LD64(pixels );\
613 const uint64_t b= LD64(pixels+line_size);\
614 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
615 pixels+=line_size;\
616 block +=line_size;\
617 }\
618}\
619\
45553457 620static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
621{\
622 int i;\
623 const uint64_t a= LD64(pixels );\
624 const uint64_t b= LD64(pixels+1);\
625 uint64_t l0= (a&0x0303030303030303ULL)\
626 + (b&0x0303030303030303ULL)\
627 + 0x0202020202020202ULL;\
628 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
629 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
630 uint64_t l1,h1;\
631\
632 pixels+=line_size;\
633 for(i=0; i<h; i+=2){\
634 uint64_t a= LD64(pixels );\
635 uint64_t b= LD64(pixels+1);\
636 l1= (a&0x0303030303030303ULL)\
637 + (b&0x0303030303030303ULL);\
638 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
639 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
640 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
641 pixels+=line_size;\
642 block +=line_size;\
643 a= LD64(pixels );\
644 b= LD64(pixels+1);\
645 l0= (a&0x0303030303030303ULL)\
646 + (b&0x0303030303030303ULL)\
647 + 0x0202020202020202ULL;\
648 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
649 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
650 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
651 pixels+=line_size;\
652 block +=line_size;\
653 }\
654}\
655\
45553457 656static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
657{\
658 int i;\
659 const uint64_t a= LD64(pixels );\
660 const uint64_t b= LD64(pixels+1);\
661 uint64_t l0= (a&0x0303030303030303ULL)\
662 + (b&0x0303030303030303ULL)\
663 + 0x0101010101010101ULL;\
664 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
665 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
666 uint64_t l1,h1;\
667\
668 pixels+=line_size;\
669 for(i=0; i<h; i+=2){\
670 uint64_t a= LD64(pixels );\
671 uint64_t b= LD64(pixels+1);\
672 l1= (a&0x0303030303030303ULL)\
673 + (b&0x0303030303030303ULL);\
674 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
675 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
676 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
677 pixels+=line_size;\
678 block +=line_size;\
679 a= LD64(pixels );\
680 b= LD64(pixels+1);\
681 l0= (a&0x0303030303030303ULL)\
682 + (b&0x0303030303030303ULL)\
683 + 0x0101010101010101ULL;\
684 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
685 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
686 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
687 pixels+=line_size;\
688 block +=line_size;\
689 }\
690}\
691\
45553457
ZK
692CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
693CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
694CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
695CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
696CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
697CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
698CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
59fe111e
MN
699
700#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
701#else // 64 bit variant
702
703#define PIXOP2(OPNAME, OP) \
669ac79c
MN
704static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
705 int i;\
706 for(i=0; i<h; i++){\
707 OP(*((uint16_t*)(block )), LD16(pixels ));\
708 pixels+=line_size;\
709 block +=line_size;\
710 }\
711}\
0da71265
MN
712static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
713 int i;\
714 for(i=0; i<h; i++){\
715 OP(*((uint32_t*)(block )), LD32(pixels ));\
716 pixels+=line_size;\
717 block +=line_size;\
718 }\
719}\
45553457 720static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
59fe111e
MN
721 int i;\
722 for(i=0; i<h; i++){\
723 OP(*((uint32_t*)(block )), LD32(pixels ));\
724 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
725 pixels+=line_size;\
726 block +=line_size;\
727 }\
728}\
45553457
ZK
729static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
730 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
b3184779 731}\
59fe111e 732\
b3184779
MN
733static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
734 int src_stride1, int src_stride2, int h){\
59fe111e
MN
735 int i;\
736 for(i=0; i<h; i++){\
b3184779
MN
737 uint32_t a,b;\
738 a= LD32(&src1[i*src_stride1 ]);\
739 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 740 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
b3184779
MN
741 a= LD32(&src1[i*src_stride1+4]);\
742 b= LD32(&src2[i*src_stride2+4]);\
d8085ea7 743 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
59fe111e
MN
744 }\
745}\
746\
b3184779
MN
747static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
748 int src_stride1, int src_stride2, int h){\
59fe111e
MN
749 int i;\
750 for(i=0; i<h; i++){\
b3184779
MN
751 uint32_t a,b;\
752 a= LD32(&src1[i*src_stride1 ]);\
753 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 754 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
b3184779
MN
755 a= LD32(&src1[i*src_stride1+4]);\
756 b= LD32(&src2[i*src_stride2+4]);\
d8085ea7 757 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
59fe111e
MN
758 }\
759}\
760\
0da71265
MN
761static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
762 int src_stride1, int src_stride2, int h){\
763 int i;\
764 for(i=0; i<h; i++){\
765 uint32_t a,b;\
766 a= LD32(&src1[i*src_stride1 ]);\
767 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 768 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
0da71265
MN
769 }\
770}\
771\
669ac79c
MN
772static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
773 int src_stride1, int src_stride2, int h){\
774 int i;\
775 for(i=0; i<h; i++){\
776 uint32_t a,b;\
777 a= LD16(&src1[i*src_stride1 ]);\
778 b= LD16(&src2[i*src_stride2 ]);\
779 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
780 }\
781}\
782\
b3184779
MN
783static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
784 int src_stride1, int src_stride2, int h){\
785 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
786 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
787}\
788\
789static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
790 int src_stride1, int src_stride2, int h){\
791 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
792 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
793}\
794\
45553457 795static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
796 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
797}\
798\
45553457 799static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
800 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
801}\
802\
45553457 803static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
804 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
805}\
806\
45553457 807static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
808 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
809}\
810\
811static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
812 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
813 int i;\
814 for(i=0; i<h; i++){\
b3184779
MN
815 uint32_t a, b, c, d, l0, l1, h0, h1;\
816 a= LD32(&src1[i*src_stride1]);\
817 b= LD32(&src2[i*src_stride2]);\
818 c= LD32(&src3[i*src_stride3]);\
819 d= LD32(&src4[i*src_stride4]);\
820 l0= (a&0x03030303UL)\
821 + (b&0x03030303UL)\
822 + 0x02020202UL;\
823 h0= ((a&0xFCFCFCFCUL)>>2)\
824 + ((b&0xFCFCFCFCUL)>>2);\
825 l1= (c&0x03030303UL)\
826 + (d&0x03030303UL);\
827 h1= ((c&0xFCFCFCFCUL)>>2)\
828 + ((d&0xFCFCFCFCUL)>>2);\
829 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
830 a= LD32(&src1[i*src_stride1+4]);\
831 b= LD32(&src2[i*src_stride2+4]);\
832 c= LD32(&src3[i*src_stride3+4]);\
833 d= LD32(&src4[i*src_stride4+4]);\
834 l0= (a&0x03030303UL)\
835 + (b&0x03030303UL)\
836 + 0x02020202UL;\
837 h0= ((a&0xFCFCFCFCUL)>>2)\
838 + ((b&0xFCFCFCFCUL)>>2);\
839 l1= (c&0x03030303UL)\
840 + (d&0x03030303UL);\
841 h1= ((c&0xFCFCFCFCUL)>>2)\
842 + ((d&0xFCFCFCFCUL)>>2);\
843 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
844 }\
845}\
669ac79c
MN
846\
847static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
848 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
849}\
850\
851static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
852 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
853}\
854\
855static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
856 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
857}\
858\
859static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
860 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
861}\
862\
b3184779
MN
863static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
864 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
865 int i;\
866 for(i=0; i<h; i++){\
b3184779
MN
867 uint32_t a, b, c, d, l0, l1, h0, h1;\
868 a= LD32(&src1[i*src_stride1]);\
869 b= LD32(&src2[i*src_stride2]);\
870 c= LD32(&src3[i*src_stride3]);\
871 d= LD32(&src4[i*src_stride4]);\
872 l0= (a&0x03030303UL)\
873 + (b&0x03030303UL)\
874 + 0x01010101UL;\
875 h0= ((a&0xFCFCFCFCUL)>>2)\
876 + ((b&0xFCFCFCFCUL)>>2);\
877 l1= (c&0x03030303UL)\
878 + (d&0x03030303UL);\
879 h1= ((c&0xFCFCFCFCUL)>>2)\
880 + ((d&0xFCFCFCFCUL)>>2);\
881 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
882 a= LD32(&src1[i*src_stride1+4]);\
883 b= LD32(&src2[i*src_stride2+4]);\
884 c= LD32(&src3[i*src_stride3+4]);\
885 d= LD32(&src4[i*src_stride4+4]);\
886 l0= (a&0x03030303UL)\
887 + (b&0x03030303UL)\
888 + 0x01010101UL;\
889 h0= ((a&0xFCFCFCFCUL)>>2)\
890 + ((b&0xFCFCFCFCUL)>>2);\
891 l1= (c&0x03030303UL)\
892 + (d&0x03030303UL);\
893 h1= ((c&0xFCFCFCFCUL)>>2)\
894 + ((d&0xFCFCFCFCUL)>>2);\
895 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
896 }\
897}\
b3184779
MN
898static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
899 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
900 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
901 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
902}\
903static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
904 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
905 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
906 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
907}\
59fe111e 908\
669ac79c
MN
909static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
910{\
911 int i, a0, b0, a1, b1;\
912 a0= pixels[0];\
913 b0= pixels[1] + 2;\
914 a0 += b0;\
915 b0 += pixels[2];\
916\
917 pixels+=line_size;\
918 for(i=0; i<h; i+=2){\
919 a1= pixels[0];\
920 b1= pixels[1];\
921 a1 += b1;\
922 b1 += pixels[2];\
923\
924 block[0]= (a1+a0)>>2; /* FIXME non put */\
925 block[1]= (b1+b0)>>2;\
926\
927 pixels+=line_size;\
928 block +=line_size;\
929\
930 a0= pixels[0];\
931 b0= pixels[1] + 2;\
932 a0 += b0;\
933 b0 += pixels[2];\
934\
935 block[0]= (a1+a0)>>2;\
936 block[1]= (b1+b0)>>2;\
937 pixels+=line_size;\
938 block +=line_size;\
939 }\
940}\
941\
942static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
943{\
944 int i;\
945 const uint32_t a= LD32(pixels );\
946 const uint32_t b= LD32(pixels+1);\
947 uint32_t l0= (a&0x03030303UL)\
948 + (b&0x03030303UL)\
949 + 0x02020202UL;\
950 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
951 + ((b&0xFCFCFCFCUL)>>2);\
952 uint32_t l1,h1;\
953\
954 pixels+=line_size;\
955 for(i=0; i<h; i+=2){\
956 uint32_t a= LD32(pixels );\
957 uint32_t b= LD32(pixels+1);\
958 l1= (a&0x03030303UL)\
959 + (b&0x03030303UL);\
960 h1= ((a&0xFCFCFCFCUL)>>2)\
961 + ((b&0xFCFCFCFCUL)>>2);\
962 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
963 pixels+=line_size;\
964 block +=line_size;\
965 a= LD32(pixels );\
966 b= LD32(pixels+1);\
967 l0= (a&0x03030303UL)\
968 + (b&0x03030303UL)\
969 + 0x02020202UL;\
970 h0= ((a&0xFCFCFCFCUL)>>2)\
971 + ((b&0xFCFCFCFCUL)>>2);\
972 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
973 pixels+=line_size;\
974 block +=line_size;\
975 }\
976}\
977\
45553457 978static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
979{\
980 int j;\
981 for(j=0; j<2; j++){\
982 int i;\
983 const uint32_t a= LD32(pixels );\
984 const uint32_t b= LD32(pixels+1);\
985 uint32_t l0= (a&0x03030303UL)\
986 + (b&0x03030303UL)\
987 + 0x02020202UL;\
988 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
989 + ((b&0xFCFCFCFCUL)>>2);\
990 uint32_t l1,h1;\
991\
992 pixels+=line_size;\
993 for(i=0; i<h; i+=2){\
994 uint32_t a= LD32(pixels );\
995 uint32_t b= LD32(pixels+1);\
996 l1= (a&0x03030303UL)\
997 + (b&0x03030303UL);\
998 h1= ((a&0xFCFCFCFCUL)>>2)\
999 + ((b&0xFCFCFCFCUL)>>2);\
1000 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1001 pixels+=line_size;\
1002 block +=line_size;\
1003 a= LD32(pixels );\
1004 b= LD32(pixels+1);\
1005 l0= (a&0x03030303UL)\
1006 + (b&0x03030303UL)\
1007 + 0x02020202UL;\
1008 h0= ((a&0xFCFCFCFCUL)>>2)\
1009 + ((b&0xFCFCFCFCUL)>>2);\
1010 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1011 pixels+=line_size;\
1012 block +=line_size;\
1013 }\
1014 pixels+=4-line_size*(h+1);\
1015 block +=4-line_size*h;\
1016 }\
1017}\
1018\
45553457 1019static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
1020{\
1021 int j;\
1022 for(j=0; j<2; j++){\
1023 int i;\
1024 const uint32_t a= LD32(pixels );\
1025 const uint32_t b= LD32(pixels+1);\
1026 uint32_t l0= (a&0x03030303UL)\
1027 + (b&0x03030303UL)\
1028 + 0x01010101UL;\
1029 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1030 + ((b&0xFCFCFCFCUL)>>2);\
1031 uint32_t l1,h1;\
1032\
1033 pixels+=line_size;\
1034 for(i=0; i<h; i+=2){\
1035 uint32_t a= LD32(pixels );\
1036 uint32_t b= LD32(pixels+1);\
1037 l1= (a&0x03030303UL)\
1038 + (b&0x03030303UL);\
1039 h1= ((a&0xFCFCFCFCUL)>>2)\
1040 + ((b&0xFCFCFCFCUL)>>2);\
1041 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1042 pixels+=line_size;\
1043 block +=line_size;\
1044 a= LD32(pixels );\
1045 b= LD32(pixels+1);\
1046 l0= (a&0x03030303UL)\
1047 + (b&0x03030303UL)\
1048 + 0x01010101UL;\
1049 h0= ((a&0xFCFCFCFCUL)>>2)\
1050 + ((b&0xFCFCFCFCUL)>>2);\
1051 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1052 pixels+=line_size;\
1053 block +=line_size;\
1054 }\
1055 pixels+=4-line_size*(h+1);\
1056 block +=4-line_size*h;\
1057 }\
1058}\
1059\
45553457
ZK
1060CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1061CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1062CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1063CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1064CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1065CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1066CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1067CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
b3184779 1068
d8085ea7 1069#define op_avg(a, b) a = rnd_avg32(a, b)
59fe111e 1070#endif
59fe111e
MN
1071#define op_put(a, b) a = b
1072
1073PIXOP2(avg, op_avg)
1074PIXOP2(put, op_put)
1075#undef op_avg
1076#undef op_put
1077
de6d9b64
FB
/*
 * Rounded averages: avg2 adds 1 before halving, avg4 adds 2 before dividing
 * by four.  Every argument is wrapped in parentheses so that arguments built
 * from lower-precedence operators (|, &, ?:, ...) expand as a single operand.
 * Each argument is still evaluated exactly once.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1080
c0a0170c
MN
/**
 * Fixed-stride front end for put_no_rnd_pixels16_l2(): the single stride
 * argument is forwarded as all three stride arguments of the callee.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride = stride;
    const int a_stride   = stride;
    const int b_stride   = stride;

    put_no_rnd_pixels16_l2(dst, a, b, dst_stride, a_stride, b_stride, h);
}
1084
/**
 * Fixed-stride front end for put_no_rnd_pixels8_l2(): the single stride
 * argument is forwarded as all three stride arguments of the callee.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride = stride;
    const int a_stride   = stride;
    const int b_stride   = stride;

    put_no_rnd_pixels8_l2(dst, a, b, dst_stride, a_stride, b_stride, h);
}
073b013d 1088
/**
 * Bilinear blend of an 8-pixel-wide block.
 *
 * Every output pixel is a weighted sum of the source pixel, its right
 * neighbour, the pixel one stride below and that pixel's right neighbour,
 * plus rounder, shifted right by 8.  The four weights are products of
 * (16-x16 | x16) and (16-y16 | y16).
 *
 * @param dst     destination, advanced by stride after each row
 * @param src     source; columns 0..8 of rows 0..h are read
 * @param stride  line size shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight
 * @param y16     vertical weight
 * @param rounder value added before the >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_top_left  = (16-x16)*(16-y16);
    const int w_top_right = (   x16)*(16-y16);
    const int w_bot_left  = (16-x16)*(   y16);
    const int w_bot_right = (   x16)*(   y16);
    int row;

    for (row = 0; row < h; row++) {
        int col;

        for (col = 0; col < 8; col++) {
            dst[col] = (  w_top_left  * src[col]
                        + w_top_right * src[col + 1]
                        + w_bot_left  * src[stride + col]
                        + w_bot_right * src[stride + col + 1]
                        + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1111
/**
 * Warped copy of an 8-pixel-wide, h-row block.
 *
 * For each output pixel a source position (vx, vy) is tracked; it advances by
 * (dxx, dyx) per column and its row start advances by (dxy, dyy) per row.
 * vx>>16 and vy>>16 are split into a shift-bit fraction and an integer
 * coordinate.  Four cases are handled, with limits width-1 and height-1:
 *   - both coordinates inside the limits: 2-D bilinear blend,
 *   - only x inside: horizontal blend on the row obtained by clipping y,
 *   - only y inside: vertical blend on the column obtained by clipping x,
 *   - neither inside: the pixel at the clipped position is copied.
 * Blends add r and shift right by 2*shift.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1<<shift;
    const int frac_mask = s - 1;
    const int max_x     = width  - 1;
    const int max_y     = height - 1;
    int row;

    for (row = 0; row < h; row++) {
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++) { //XXX FIXME optimize
            int src_x = vx>>16;
            int src_y = vy>>16;
            const int frac_x = src_x & frac_mask;
            const int frac_y = src_y & frac_mask;
            int x_inside, y_inside, index;

            src_x >>= shift;
            src_y >>= shift;

            /* the unsigned compare rejects negative coordinates as well */
            x_inside = (unsigned)src_x < (unsigned)max_x;
            y_inside = (unsigned)src_y < (unsigned)max_y;

            if (x_inside && y_inside) {
                index = src_x + src_y*stride;
                dst[row*stride + col] = (  (  src[index         ]*(s-frac_x)
                                            + src[index       +1]*   frac_x )*(s-frac_y)
                                         + (  src[index+stride  ]*(s-frac_x)
                                            + src[index+stride+1]*   frac_x )*   frac_y
                                         + r)>>(shift*2);
            } else if (x_inside) {
                index = src_x + clip(src_y, 0, max_y)*stride;
                dst[row*stride + col] = ( (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*s
                                         + r)>>(shift*2);
            } else if (y_inside) {
                index = clip(src_x, 0, max_x) + src_y*stride;
                dst[row*stride + col] = ( (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                         + r)>>(shift*2);
            } else {
                index = clip(src_x, 0, max_x) + clip(src_y, 0, max_y)*stride;
                dst[row*stride + col] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
669ac79c
MN
1169
/**
 * Dispatches to the put_pixelsN_c routine matching width (2, 4, 8 or 16).
 * Any other width does nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        put_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        put_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        put_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        put_pixels16_c(dst, src, stride, height);
    }
}
1178
/**
 * Writes (683*(2*p + right + 1)) >> 11 for each pixel p of a width x height
 * block, where right is the next pixel on the same row.  683/2048 is close to
 * 1/3, so this is a 2:1 weighting of the pixel and its right neighbour.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1189
/**
 * Writes (683*(p + 2*right + 1)) >> 11 for each pixel p of a width x height
 * block, where right is the next pixel on the same row: a 1:2 weighting of
 * the pixel and its right neighbour (683/2048 is close to 1/3).
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1200
/**
 * Writes (683*(2*p + below + 1)) >> 11 for each pixel p of a width x height
 * block, where below is the pixel one stride further: a 2:1 weighting of the
 * pixel and the one beneath it (683/2048 is close to 1/3).
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+stride] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1211
/**
 * Writes (2731*(4*p + 3*right + 3*below + 2*diag + 6)) >> 15 for each pixel p
 * of a width x height block.  right, below and diag are the neighbours at
 * +1, +stride and +stride+1; 2731/32768 is close to 1/12, the sum of the
 * weights.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 4*src[col]        + 3*src[col+1]
                               + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1222
/**
 * Writes (2731*(3*p + 2*right + 4*below + 3*diag + 6)) >> 15 for each pixel p
 * of a width x height block.  right, below and diag are the neighbours at
 * +1, +stride and +stride+1; 2731/32768 is close to 1/12, the sum of the
 * weights.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col]        + 2*src[col+1]
                               + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1233
/**
 * Writes (683*(p + 2*below + 1)) >> 11 for each pixel p of a width x height
 * block, where below is the pixel one stride further: a 1:2 weighting of the
 * pixel and the one beneath it (683/2048 is close to 1/3).
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+stride] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
1244
/**
 * Writes (2731*(3*p + 4*right + 2*below + 3*diag + 6)) >> 15 for each pixel p
 * of a width x height block.  right, below and diag are the neighbours at
 * +1, +stride and +stride+1; 2731/32768 is close to 1/12, the sum of the
 * weights.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col]        + 4*src[col+1]
                               + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
1255
/**
 * Writes (2731*(2*p + 3*right + 3*below + 4*diag + 6)) >> 15 for each pixel p
 * of a width x height block.  right, below and diag are the neighbours at
 * +1, +stride and +stride+1; 2731/32768 is close to 1/12, the sum of the
 * weights.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col]        + 3*src[col+1]
                               + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
da3b9756
MM
1266
/**
 * Dispatches to the avg_pixelsN_c routine matching width (2, 4, 8 or 16).
 * Any other width does nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2) {
        avg_pixels2_c (dst, src, stride, height);
    } else if (width == 4) {
        avg_pixels4_c (dst, src, stride, height);
    } else if (width == 8) {
        avg_pixels8_c (dst, src, stride, height);
    } else if (width == 16) {
        avg_pixels16_c(dst, src, stride, height);
    }
}
1275
/**
 * Computes (683*(2*p + right + 1)) >> 11 for each source pixel p (right being
 * the next pixel on the row) and replaces dst with the rounded-up mean of its
 * previous value and that result.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1286
/**
 * Computes (683*(p + 2*right + 1)) >> 11 for each source pixel p (right being
 * the next pixel on the row) and replaces dst with the rounded-up mean of its
 * previous value and that result.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1297
/**
 * Computes (683*(2*p + below + 1)) >> 11 for each source pixel p (below being
 * the pixel one stride further) and replaces dst with the rounded-up mean of
 * its previous value and that result.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1308
/**
 * Computes (2731*(4*p + 3*right + 3*below + 2*diag + 6)) >> 15 for each source
 * pixel p (neighbours at +1, +stride, +stride+1) and replaces dst with the
 * rounded-up mean of its previous value and that result.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 4*src[col]        + 3*src[col+1]
                               + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            const int interp   = (2731*weighted) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1319
/**
 * Computes (2731*(3*p + 2*right + 4*below + 3*diag + 6)) >> 15 for each source
 * pixel p (neighbours at +1, +stride, +stride+1) and replaces dst with the
 * rounded-up mean of its previous value and that result.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col]        + 2*src[col+1]
                               + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            const int interp   = (2731*weighted) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1330
/**
 * Computes (683*(p + 2*below + 1)) >> 11 for each source pixel p (below being
 * the pixel one stride further) and replaces dst with the rounded-up mean of
 * its previous value and that result.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1341
/**
 * Computes (2731*(3*p + 4*right + 2*below + 3*diag + 6)) >> 15 for each source
 * pixel p (neighbours at +1, +stride, +stride+1) and replaces dst with the
 * rounded-up mean of its previous value and that result.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col]        + 4*src[col+1]
                               + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            const int interp   = (2731*weighted) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1352
/**
 * Computes (2731*(2*p + 3*right + 3*below + 4*diag + 6)) >> 15 for each source
 * pixel p (neighbours at +1, +stride, +stride+1) and replaces dst with the
 * rounded-up mean of its previous value and that result.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col]        + 3*src[col+1]
                               + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            const int interp   = (2731*weighted) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
669ac79c
MN
#if 0
/*
 * Disabled generator for fixed-width put_tpel wrappers: TPEL_WIDTH(w) would
 * define put_tpel_pixels<w>_mcXY_c(dst, src, stride, height), each forwarding
 * to the generic put_tpel_pixels_mcXY_c() with width = w.
 * The forwarding statements are plain calls; a leading "void" would turn them
 * into malformed declarations and keep the macro from compiling if enabled.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1384
0da71265
MN
/*
 * H264_CHROMA_MC(OPNAME, OP) defines OPNAME##h264_chroma_mc{2,4,8}_c.
 * Each function processes h rows of 2, 4 or 8 pixels.  For every pixel it
 * forms the weighted sum of the source pixel and its neighbours at +1,
 * +stride and +stride+1, with weights (8-x)(8-y), x(8-y), (8-x)y and xy
 * (which add up to 64), and hands the unscaled sum to OP(dst_pixel, sum).
 * x and y must lie in 0..7 (asserted).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int wTL=(8-x)*(8-y);\
    const int wTR=(  x)*(8-y);\
    const int wBL=(8-x)*(  y);\
    const int wBR=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<2; col++){\
            OP(dst[col], (wTL*src[col] + wTR*src[col+1] + wBL*src[stride+col] + wBR*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int wTL=(8-x)*(8-y);\
    const int wTR=(  x)*(8-y);\
    const int wBL=(8-x)*(  y);\
    const int wBR=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<4; col++){\
            OP(dst[col], (wTL*src[col] + wTR*src[col+1] + wBL*src[stride+col] + wBR*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int wTL=(8-x)*(8-y);\
    const int wTR=(  x)*(8-y);\
    const int wBL=(8-x)*(  y);\
    const int wBR=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<8; col++){\
            OP(dst[col], (wTL*src[col] + wTR*src[col+1] + wBL*src[stride+col] + wBR*src[stride+col+1]));\
        }\
    }\
}

/* op_put stores the sum scaled back by 64 with rounding (+32, >>6);
 * op_avg additionally takes the rounded-up mean with the existing value. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1455
/**
 * Copies h rows of 4 bytes from src to dst, one 32-bit LD32/ST32 transfer per
 * row.  dst advances by dstStride and src by srcStride after each row.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1466
/**
 * Copies h rows of 8 bytes from src to dst as two 32-bit LD32/ST32 transfers
 * per row.  dst advances by dstStride and src by srcStride after each row.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++) {
        int off;

        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst += dstStride;
        src += srcStride;
    }
}
1478
/**
 * Copies h rows of 16 bytes from src to dst as four 32-bit LD32/ST32
 * transfers per row.  dst advances by dstStride and src by srcStride after
 * each row.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++) {
        int off;

        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst += dstStride;
        src += srcStride;
    }
}
073b013d 1492
/**
 * Copies h rows of 17 bytes from src to dst: four 32-bit LD32/ST32 transfers
 * followed by a single byte copy for column 16.  dst advances by dstStride
 * and src by srcStride after each row.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++) {
        int off;

        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1507
/**
 * Copies h rows of 9 bytes from src to dst: two 32-bit LD32/ST32 transfers
 * followed by a single byte copy for column 8.  dst advances by dstStride
 * and src by srcStride after each row.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = 0; row < h; row++) {
        int off;

        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1520
826f429a 1521
b3184779 1522#define QPEL_MC(r, OPNAME, RND, OP) \
0c1a9eda
ZK
1523static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1524 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779
MN
1525 int i;\
1526 for(i=0; i<h; i++)\
1527 {\
1528 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1529 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1530 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1531 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1532 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1533 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1534 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1535 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1536 dst+=dstStride;\
1537 src+=srcStride;\
1538 }\
44eb4951
MN
1539}\
1540\
0c1a9eda 1541static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
db794953 1542 const int w=8;\
0c1a9eda 1543 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779
MN
1544 int i;\
1545 for(i=0; i<w; i++)\
1546 {\
1547 const int src0= src[0*srcStride];\
1548 const int src1= src[1*srcStride];\
1549 const int src2= src[2*srcStride];\
1550 const int src3= src[3*srcStride];\
1551 const int src4= src[4*srcStride];\
1552 const int src5= src[5*srcStride];\
1553 const int src6= src[6*srcStride];\
1554 const int src7= src[7*srcStride];\
1555 const int src8= src[8*srcStride];\
1556 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1557 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1558 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1559 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1560 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1561 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1562 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1563 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1564 dst++;\
1565 src++;\
1566 }\
1567}\
1568\
0c1a9eda
ZK
1569static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1570 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779 1571 int i;\
826f429a 1572 \
b3184779
MN
1573 for(i=0; i<h; i++)\
1574 {\
1575 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1576 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1577 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1578 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1579 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1580 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1581 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1582 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1583 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1584 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1585 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1586 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1587 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1588 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1589 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1590 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1591 dst+=dstStride;\
1592 src+=srcStride;\
1593 }\
1594}\
1595\
0c1a9eda
ZK
1596static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1597 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779 1598 int i;\
826f429a 1599 const int w=16;\
b3184779
MN
1600 for(i=0; i<w; i++)\
1601 {\
1602 const int src0= src[0*srcStride];\
1603 const int src1= src[1*srcStride];\
1604 const int src2= src[2*srcStride];\
1605 const int src3= src[3*srcStride];\
1606 const int src4= src[4*srcStride];\
1607 const int src5= src[5*srcStride];\
1608 const int src6= src[6*srcStride];\
1609 const int src7= src[7*srcStride];\
1610 const int src8= src[8*srcStride];\
1611 const int src9= src[9*srcStride];\
1612 const int src10= src[10*srcStride];\
1613 const int src11= src[11*srcStride];\
1614 const int src12= src[12*srcStride];\
1615 const int src13= src[13*srcStride];\
1616 const int src14= src[14*srcStride];\
1617 const int src15= src[15*srcStride];\
1618 const int src16= src[16*srcStride];\
1619 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1620 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1621 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1622 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1623 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1624 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1625 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1626 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1627 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1628 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1629 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1630 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1631 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1632 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1633 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1634 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1635 dst++;\
1636 src++;\
1637 }\
1638}\
1639\
0c1a9eda 1640static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1641 OPNAME ## pixels8_c(dst, src, stride, 8);\
b3184779
MN
1642}\
1643\
0c1a9eda
ZK
1644static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1645 uint8_t half[64];\
b3184779
MN
1646 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1647 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
44eb4951
MN
1648}\
1649\
0c1a9eda 1650static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1651 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
44eb4951
MN
1652}\
1653\
0c1a9eda
ZK
1654static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1655 uint8_t half[64];\
b3184779
MN
1656 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1657 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
44eb4951
MN
1658}\
1659\
0c1a9eda
ZK
1660static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1661 uint8_t full[16*9];\
1662 uint8_t half[64];\
b3184779 1663 copy_block9(full, src, 16, stride, 9);\
db794953 1664 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1665 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
44eb4951
MN
1666}\
1667\
0c1a9eda
ZK
1668static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1669 uint8_t full[16*9];\
b3184779 1670 copy_block9(full, src, 16, stride, 9);\
db794953 1671 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
44eb4951
MN
1672}\
1673\
0c1a9eda
ZK
1674static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1675 uint8_t full[16*9];\
1676 uint8_t half[64];\
b3184779 1677 copy_block9(full, src, 16, stride, 9);\
db794953 1678 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1679 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
44eb4951 1680}\
0c1a9eda
ZK
1681void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1682 uint8_t full[16*9];\
1683 uint8_t halfH[72];\
1684 uint8_t halfV[64];\
1685 uint8_t halfHV[64];\
b3184779
MN
1686 copy_block9(full, src, 16, stride, 9);\
1687 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1688 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1689 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1690 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1691}\
0c1a9eda
ZK
1692static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1693 uint8_t full[16*9];\
1694 uint8_t halfH[72];\
1695 uint8_t halfHV[64];\
db794953
MN
1696 copy_block9(full, src, 16, stride, 9);\
1697 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1698 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1699 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1700 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1701}\
0c1a9eda
ZK
1702void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1703 uint8_t full[16*9];\
1704 uint8_t halfH[72];\
1705 uint8_t halfV[64];\
1706 uint8_t halfHV[64];\
b3184779
MN
1707 copy_block9(full, src, 16, stride, 9);\
1708 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1709 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1710 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1711 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1712}\
0c1a9eda
ZK
1713static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1714 uint8_t full[16*9];\
1715 uint8_t halfH[72];\
1716 uint8_t halfHV[64];\
db794953
MN
1717 copy_block9(full, src, 16, stride, 9);\
1718 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1719 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1720 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1721 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1722}\
0c1a9eda
ZK
1723void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1724 uint8_t full[16*9];\
1725 uint8_t halfH[72];\
1726 uint8_t halfV[64];\
1727 uint8_t halfHV[64];\
b3184779
MN
1728 copy_block9(full, src, 16, stride, 9);\
1729 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1730 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1731 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1732 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1733}\
0c1a9eda
ZK
1734static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1735 uint8_t full[16*9];\
1736 uint8_t halfH[72];\
1737 uint8_t halfHV[64];\
db794953
MN
1738 copy_block9(full, src, 16, stride, 9);\
1739 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1740 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1741 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1742 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1743}\
0c1a9eda
ZK
1744void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1745 uint8_t full[16*9];\
1746 uint8_t halfH[72];\
1747 uint8_t halfV[64];\
1748 uint8_t halfHV[64];\
b3184779
MN
1749 copy_block9(full, src, 16, stride, 9);\
1750 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
db794953
MN
1751 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1752 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1753 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1754}\
0c1a9eda
ZK
1755static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1756 uint8_t full[16*9];\
1757 uint8_t halfH[72];\
1758 uint8_t halfHV[64];\
db794953
MN
1759 copy_block9(full, src, 16, stride, 9);\
1760 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1761 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1762 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1763 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1764}\
0c1a9eda
ZK
1765static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1766 uint8_t halfH[72];\
1767 uint8_t halfHV[64];\
b3184779 1768 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1769 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1770 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
44eb4951 1771}\
0c1a9eda
ZK
1772static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1773 uint8_t halfH[72];\
1774 uint8_t halfHV[64];\
b3184779 1775 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1776 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1777 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
44eb4951 1778}\
0c1a9eda
ZK
1779void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1780 uint8_t full[16*9];\
1781 uint8_t halfH[72];\
1782 uint8_t halfV[64];\
1783 uint8_t halfHV[64];\
b3184779
MN
1784 copy_block9(full, src, 16, stride, 9);\
1785 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1786 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1787 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1788 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1789}\
0c1a9eda
ZK
1790static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1791 uint8_t full[16*9];\
1792 uint8_t halfH[72];\
db794953
MN
1793 copy_block9(full, src, 16, stride, 9);\
1794 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1795 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1796 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1797}\
0c1a9eda
ZK
1798void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1799 uint8_t full[16*9];\
1800 uint8_t halfH[72];\
1801 uint8_t halfV[64];\
1802 uint8_t halfHV[64];\
b3184779
MN
1803 copy_block9(full, src, 16, stride, 9);\
1804 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1805 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1806 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1807 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1808}\
0c1a9eda
ZK
1809static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1810 uint8_t full[16*9];\
1811 uint8_t halfH[72];\
db794953
MN
1812 copy_block9(full, src, 16, stride, 9);\
1813 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1814 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1815 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1816}\
0c1a9eda
ZK
1817static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1818 uint8_t halfH[72];\
b3184779 1819 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1820 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
b3184779 1821}\
0c1a9eda 1822static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1823 OPNAME ## pixels16_c(dst, src, stride, 16);\
b3184779
MN
1824}\
1825\
0c1a9eda
ZK
1826static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1827 uint8_t half[256];\
b3184779
MN
1828 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1829 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1830}\
1831\
0c1a9eda 1832static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1833 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
44eb4951 1834}\
b3184779 1835\
0c1a9eda
ZK
1836static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1837 uint8_t half[256];\
b3184779
MN
1838 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1839 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1840}\
1841\
0c1a9eda
ZK
1842static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1843 uint8_t full[24*17];\
1844 uint8_t half[256];\
b3184779 1845 copy_block17(full, src, 24, stride, 17);\
826f429a 1846 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1847 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1848}\
1849\
0c1a9eda
ZK
1850static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1851 uint8_t full[24*17];\
b3184779 1852 copy_block17(full, src, 24, stride, 17);\
826f429a 1853 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
b3184779
MN
1854}\
1855\
0c1a9eda
ZK
1856static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1857 uint8_t full[24*17];\
1858 uint8_t half[256];\
b3184779 1859 copy_block17(full, src, 24, stride, 17);\
826f429a 1860 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1861 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1862}\
0c1a9eda
ZK
1863void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1864 uint8_t full[24*17];\
1865 uint8_t halfH[272];\
1866 uint8_t halfV[256];\
1867 uint8_t halfHV[256];\
b3184779
MN
1868 copy_block17(full, src, 24, stride, 17);\
1869 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1870 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1871 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1872 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1873}\
0c1a9eda
ZK
1874static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1875 uint8_t full[24*17];\
1876 uint8_t halfH[272];\
1877 uint8_t halfHV[256];\
db794953
MN
1878 copy_block17(full, src, 24, stride, 17);\
1879 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1880 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1881 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1882 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1883}\
0c1a9eda
ZK
1884void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1885 uint8_t full[24*17];\
1886 uint8_t halfH[272];\
1887 uint8_t halfV[256];\
1888 uint8_t halfHV[256];\
b3184779
MN
1889 copy_block17(full, src, 24, stride, 17);\
1890 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1891 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1892 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1893 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1894}\
0c1a9eda
ZK
1895static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1896 uint8_t full[24*17];\
1897 uint8_t halfH[272];\
1898 uint8_t halfHV[256];\
db794953
MN
1899 copy_block17(full, src, 24, stride, 17);\
1900 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1901 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1902 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1903 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1904}\
0c1a9eda
ZK
1905void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1906 uint8_t full[24*17];\
1907 uint8_t halfH[272];\
1908 uint8_t halfV[256];\
1909 uint8_t halfHV[256];\
b3184779
MN
1910 copy_block17(full, src, 24, stride, 17);\
1911 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1912 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1913 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1914 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1915}\
0c1a9eda
ZK
1916static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1917 uint8_t full[24*17];\
1918 uint8_t halfH[272];\
1919 uint8_t halfHV[256];\
db794953
MN
1920 copy_block17(full, src, 24, stride, 17);\
1921 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1922 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1923 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1924 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1925}\
0c1a9eda
ZK
1926void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1927 uint8_t full[24*17];\
1928 uint8_t halfH[272];\
1929 uint8_t halfV[256];\
1930 uint8_t halfHV[256];\
b3184779
MN
1931 copy_block17(full, src, 24, stride, 17);\
1932 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
826f429a
MN
1933 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1934 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1935 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1936}\
0c1a9eda
ZK
1937static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1938 uint8_t full[24*17];\
1939 uint8_t halfH[272];\
1940 uint8_t halfHV[256];\
db794953
MN
1941 copy_block17(full, src, 24, stride, 17);\
1942 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1943 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1944 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1945 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1946}\
0c1a9eda
ZK
1947static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1948 uint8_t halfH[272];\
1949 uint8_t halfHV[256];\
b3184779 1950 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1951 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1952 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1953}\
0c1a9eda
ZK
1954static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1955 uint8_t halfH[272];\
1956 uint8_t halfHV[256];\
b3184779 1957 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1958 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1959 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1960}\
0c1a9eda
ZK
1961void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1962 uint8_t full[24*17];\
1963 uint8_t halfH[272];\
1964 uint8_t halfV[256];\
1965 uint8_t halfHV[256];\
b3184779
MN
1966 copy_block17(full, src, 24, stride, 17);\
1967 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1968 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1969 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1970 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1971}\
0c1a9eda
ZK
1972static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1973 uint8_t full[24*17];\
1974 uint8_t halfH[272];\
db794953
MN
1975 copy_block17(full, src, 24, stride, 17);\
1976 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1977 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1978 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1979}\
0c1a9eda
ZK
1980void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1981 uint8_t full[24*17];\
1982 uint8_t halfH[272];\
1983 uint8_t halfV[256];\
1984 uint8_t halfHV[256];\
b3184779
MN
1985 copy_block17(full, src, 24, stride, 17);\
1986 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1987 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1988 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1989 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1990}\
0c1a9eda
ZK
1991static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1992 uint8_t full[24*17];\
1993 uint8_t halfH[272];\
db794953
MN
1994 copy_block17(full, src, 24, stride, 17);\
1995 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1996 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1997 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1998}\
0c1a9eda
ZK
1999static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2000 uint8_t halfH[272];\
b3184779 2001 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 2002 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
45553457 2003}
44eb4951 2004
b3184779
MN
2005#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2006#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2007#define op_put(a, b) a = cm[((b) + 16)>>5]
2008#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2009
2010QPEL_MC(0, put_ , _ , op_put)
2011QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2012QPEL_MC(0, avg_ , _ , op_avg)
2013//QPEL_MC(1, avg_no_rnd , _ , op_avg)
2014#undef op_avg
2015#undef op_avg_no_rnd
2016#undef op_put
2017#undef op_put_no_rnd
44eb4951 2018
0da71265
MN
2019#if 1
/**
 * Generates the N x N (N = 4 or 8) six-tap lowpass kernels for one store
 * operator pair.  The taps are (1, -5, 20, 20, -5, 1).
 *
 *  - _h_lowpass : filters along a row; reads src[-2 .. N+2] on each of N rows.
 *  - _v_lowpass : filters along a column; reads rows -2 .. N+2 of each of
 *                 N columns.  A whole column is loaded before any of its
 *                 outputs is stored.
 *  - _hv_lowpass: horizontal pass into the 16 bit scratch buffer "tmp" for
 *                 N+5 rows starting two rows above src, then vertical pass
 *                 over tmp.
 *
 * OP receives the unnormalized sum of a single pass, OP2 the unnormalized
 * sum of both passes; normalization and clipping are done by the operator.
 */
#define H264_LOWPASS_KERNELS(OPNAME, OP, OP2, N) \
static void OPNAME ## h264_qpel ## N ## _h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int row, x;\
    for(row=0; row<N; row++){\
        for(x=0; x<N; x++){\
            OP(dst[x], (src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]));\
        }\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel ## N ## _v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int col, k;\
    for(col=0; col<N; col++){\
        /* tap[k] holds the sample k-2 rows below src */\
        int tap[N+5];\
        for(k=0; k<N+5; k++){\
            tap[k] = src[(k-2)*srcStride];\
        }\
        for(k=0; k<N; k++){\
            OP(dst[k*dstStride], (tap[k+2]+tap[k+3])*20 - (tap[k+1]+tap[k+4])*5 + (tap[k]+tap[k+5]));\
        }\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel ## N ## _hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int row, col, k;\
    /* horizontal pass: N+5 rows, beginning two rows above the block */\
    src -= 2*srcStride;\
    for(row=0; row<N+5; row++){\
        for(k=0; k<N; k++){\
            tmp[k] = (src[k]+src[k+1])*20 - (src[k-1]+src[k+2])*5 + (src[k-2]+src[k+3]);\
        }\
        tmp += tmpStride;\
        src += srcStride;\
    }\
    /* rewind tmp to the row that corresponds to the first output row */\
    tmp -= tmpStride*(N+5-2);\
    /* vertical pass over the intermediate sums */\
    for(col=0; col<N; col++){\
        int tap[N+5];\
        for(k=0; k<N+5; k++){\
            tap[k] = tmp[(k-2)*tmpStride];\
        }\
        for(k=0; k<N; k++){\
            OP2(dst[k*dstStride], (tap[k+2]+tap[k+3])*20 - (tap[k+1]+tap[k+4])*5 + (tap[k]+tap[k+5]));\
        }\
        dst++;\
        tmp++;\
    }\
}\

/**
 * Generates the complete set of H.264 lowpass helpers for one operator pair:
 * the 4x4 and 8x8 kernels, plus 16x16 versions that run the 8x8 kernel on
 * the four quadrants (top-left, top-right, bottom-left, bottom-right).
 * The 16x16 hv version passes the same tmp / tmp+8 scratch area to the top
 * and the bottom pair of calls.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
H264_LOWPASS_KERNELS(OPNAME, OP, OP2, 4)\
H264_LOWPASS_KERNELS(OPNAME, OP, OP2, 8)\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *dst_low = dst + 8*dstStride;\
    uint8_t *src_low = src + 8*srcStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst      , src      , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8    , src+8    , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst_low  , src_low  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst_low+8, src_low+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *dst_low = dst + 8*dstStride;\
    uint8_t *src_low = src + 8*srcStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst      , src      , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8    , src+8    , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst_low  , src_low  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst_low+8, src_low+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    uint8_t *dst_low = dst + 8*dstStride;\
    uint8_t *src_low = src + 8*srcStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst      , tmp  , src      , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8    , tmp+8, src+8    , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst_low  , tmp  , src_low  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst_low+8, tmp+8, src_low+8, dstStride, tmpStride, srcStride);\
}\

/**
 * Generates the sixteen SIZE x SIZE quarter-pel entry points
 * OPNAME h264_qpel SIZE _mcXY_c, where X is the horizontal and Y the
 * vertical quarter-sample offset (0..3).
 *
 * Intermediate planes are always produced with the put_ lowpass helpers;
 * OPNAME only selects how the final result is stored into dst.
 * Buffers named "column" hold SIZE+5 source rows starting two rows above
 * the block, with "center" pointing at the row that lines up with dst.
 */
#define H264_MC(OPNAME, SIZE) \
/* (0,0): plain copy / average of the source block */\
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* (1,0): mean of the source and the horizontal lowpass plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hpel, stride, stride, SIZE, SIZE);\
}\
\
/* (2,0): horizontal lowpass plane written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* (3,0): mean of the source one pixel to the right and the horizontal lowpass plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hpel, stride, stride, SIZE, SIZE);\
}\
\
/* (0,1): mean of the source and the vertical lowpass plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t vpel[SIZE*SIZE];\
    uint8_t * const center= column + SIZE*2;\
    copy_block ## SIZE (column, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, center, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (0,2): vertical lowpass plane written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const center= column + SIZE*2;\
    copy_block ## SIZE (column, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, center, stride, SIZE);\
}\
\
/* (0,3): mean of the source one row down and the vertical lowpass plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t vpel[SIZE*SIZE];\
    uint8_t * const center= column + SIZE*2;\
    copy_block ## SIZE (column, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, center+SIZE, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,1): mean of the horizontal plane (this row) and the vertical plane (this column) */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t vpel[SIZE*SIZE];\
    uint8_t * const center= column + SIZE*2;\
    copy_block ## SIZE (column, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, center, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,1): horizontal plane (this row) and vertical plane of the column one pixel right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t vpel[SIZE*SIZE];\
    uint8_t * const center= column + SIZE*2;\
    copy_block ## SIZE (column, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, center, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,3): horizontal plane of the row below and vertical plane (this column) */\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t vpel[SIZE*SIZE];\
    uint8_t * const center= column + SIZE*2;\
    copy_block ## SIZE (column, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, center, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,3): horizontal plane of the row below and vertical plane of the column one pixel right */\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t vpel[SIZE*SIZE];\
    uint8_t * const center= column + SIZE*2;\
    copy_block ## SIZE (column, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, center, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (2,2): two-dimensional lowpass plane written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, scratch, src, stride, SIZE, stride);\
}\
\
/* (2,1): mean of the horizontal plane (this row) and the two-dimensional plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t hvpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvpel, scratch, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, hvpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (2,3): mean of the horizontal plane of the row below and the two-dimensional plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t hvpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvpel, scratch, src, SIZE, SIZE, stride);\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src + stride, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, hvpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,2): mean of the vertical plane (this column) and the two-dimensional plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vpel[SIZE*SIZE];\
    uint8_t hvpel[SIZE*SIZE];\
    uint8_t * const center= column + SIZE*2;\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvpel, scratch, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (column, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vpel, hvpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,2): mean of the vertical plane of the column one pixel right and the two-dimensional plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    int16_t scratch[SIZE*(SIZE+5)];\
    uint8_t vpel[SIZE*SIZE];\
    uint8_t hvpel[SIZE*SIZE];\
    uint8_t * const center= column + SIZE*2;\
    put_h264_qpel ## SIZE ## _hv_lowpass(hvpel, scratch, src, SIZE, SIZE, stride);\
    copy_block ## SIZE (column, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, center, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vpel, hvpel, stride, SIZE, SIZE, SIZE);\
}\

2359#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2360//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2361#define op_put(a, b) a = cm[((b) + 16)>>5]
2362#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2363#define op2_put(a, b) a = cm[((b) + 512)>>10]
2364
2365H264_LOWPASS(put_ , op_put, op2_put)
2366H264_LOWPASS(avg_ , op_avg, op2_avg)
2367H264_MC(put_, 4)
2368H264_MC(put_, 8)
2369H264_MC(put_, 16)
2370H264_MC(avg_, 4)
2371H264_MC(avg_, 8)
2372H264_MC(avg_, 16)
2373
2374#undef op_avg
2375#undef op_put
2376#undef op2_avg
2377#undef op2_put
2378#endif
2379
91c56db6
MN
/* Per-pixel operators; they use the locals of the function they expand in. */
#define op_scale1(i)  block[i] = clip_uint8( (block[i]*weight + offset) >> log2_denom )
#define op_scale2(i)  dst[i] = clip_uint8( (src[i]*weights + dst[i]*weightd + offset) >> (log2_denom+1))

/**
 * Generates the in-place weighting function weight_h264_pixels<W>x<H>_c and
 * the two-source weighting function biweight_h264_pixels<W>x<H>_c for a
 * W x H block.
 *
 * weight:   block[i] = clip((block[i]*weight + offset') >> log2_denom), where
 *           offset' is offset << log2_denom plus a rounding term of
 *           1 << (log2_denom-1) when log2_denom is non-zero.
 * biweight: dst[i] = clip((src[i]*weights + dst[i]*weightd + offset')
 *                         >> (log2_denom+1)), where offset' is derived from
 *           the rounded-up mean of offsets and offsetd.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int row, col; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(row=0; row<H; row++){ \
        for(col=0; col<W; col++){ \
            op_scale1(col); \
        } \
        block += stride; \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
    int row, col; \
    int offset = (offsets + offsetd + 1) >> 1; \
    offset = ((offset << 1) + 1) << log2_denom; \
    for(row=0; row<H; row++){ \
        for(col=0; col<W; col++){ \
            op_scale2(col); \
        } \
        dst += stride; \
        src += stride; \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2450
1457ab52
MN
2451static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2452 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2453 int i;
2454
2455 for(i=0; i<h; i++){
2456 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2457 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2458 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2459 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2460 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2461 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2462 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2463 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2464 dst+=dstStride;
2465 src+=srcStride;
2466 }
2467}
2468
2469static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2470 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2471 int i;
2472
2473 for(i=0; i<w; i++){
2474 const int src_1= src[ -srcStride];
2475 const int src0 = src[0 ];
2476 const int src1 = src[ srcStride];
2477 const int src2 = src[2*srcStride];
2478 const int src3 = src[3*srcStride];
2479 const int src4 = src[4*srcStride];
2480 const int src5 = src[5*srcStride];
2481 const int src6 = src[6*srcStride];
2482 const int src7 = src[7*srcStride];
2483 const int src8 = src[8*srcStride];
2484 const int src9 = src[9*srcStride];
2485 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2486 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2487 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2488 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2489 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2490 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2491 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2492 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2493 src++;
2494 dst++;
2495 }
2496}
2497
/* No sub-pel filtering: forwards to put_pixels8_c() with a height of 8. */
static void put_mspel8_mc00_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
2501
/* Combines src with its horizontally low-passed version via put_pixels8_l2(). */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2507
/* Writes the horizontally low-passed 8x8 block straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2511
/* Like mc10, but the unfiltered operand is taken one sample to the right (src+1). */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2517
/* Writes the vertically low-passed 8x8 block straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2521
/*
 * Combines (via put_pixels8_l2) the vertically filtered source with the
 * block filtered in both directions.  The horizontal pass covers 11 rows
 * starting one row above src so that the following vertical pass has the
 * extra rows it reads.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rowsH[8*11];
    uint8_t onlyV[8*8];
    uint8_t bothHV[8*8];

    wmv2_mspel8_h_lowpass(rowsH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(onlyV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(bothHV, rowsH+8, 8, 8, 8);
    put_pixels8_l2(dst, onlyV, bothHV, stride, 8, 8, 8);
}
/*
 * Same as mc12 except that the vertically filtered operand is taken one
 * sample to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rowsH[8*11];
    uint8_t onlyV[8*8];
    uint8_t bothHV[8*8];

    wmv2_mspel8_h_lowpass(rowsH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(onlyV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(bothHV, rowsH+8, 8, 8, 8);
    put_pixels8_l2(dst, onlyV, bothHV, stride, 8, 8, 8);
}
/*
 * Horizontal low-pass over 11 rows (starting one row above src) followed by
 * a vertical low-pass of that intermediate, written directly into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rowsH[8*11];

    wmv2_mspel8_h_lowpass(rowsH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rowsH+8, stride, 8, 8);
}
2545
332f9ac4
MN
2546static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2547 int x;
2548 const int strength= ff_h263_loop_filter_strength[qscale];
2549
2550 for(x=0; x<8; x++){
2551 int d1, d2, ad1;
2552 int p0= src[x-2*stride];
2553 int p1= src[x-1*stride];
2554 int p2= src[x+0*stride];
2555 int p3= src[x+1*stride];
2556 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2557
2558 if (d<-2*strength) d1= 0;
2559 else if(d<- strength) d1=-2*strength - d;
2560 else if(d< strength) d1= d;
2561 else if(d< 2*strength) d1= 2*strength - d;
2562 else d1= 0;
2563
2564 p1 += d1;
2565 p2 -= d1;
2566 if(p1&256) p1= ~(p1>>31);
2567 if(p2&256) p2= ~(p2>>31);
2568
2569 src[x-1*stride] = p1;
2570 src[x+0*stride] = p2;
2571
5b5404e3 2572 ad1= ABS(d1)>>1;
332f9ac4
MN
2573
2574 d2= clip((p0-p3)/4, -ad1, ad1);
2575
2576 src[x-2*stride] = p0 - d2;
2577 src[x+ stride] = p3 + d2;
2578 }
2579}
2580
2581static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2582 int y;
2583 const int strength= ff_h263_loop_filter_strength[qscale];
2584
2585 for(y=0; y<8; y++){
2586 int d1, d2, ad1;
2587 int p0= src[y*stride-2];
2588 int p1= src[y*stride-1];
2589 int p2= src[y*stride+0];
2590 int p3= src[y*stride+1];
2591 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2592
2593 if (d<-2*strength) d1= 0;
2594 else if(d<- strength) d1=-2*strength - d;
2595 else if(d< strength) d1= d;
2596 else if(d< 2*strength) d1= 2*strength - d;
2597 else d1= 0;
2598
2599 p1 += d1;
2600 p2 -= d1;
2601 if(p1&256) p1= ~(p1>>31);
2602 if(p2&256) p2= ~(p2>>31);
2603
2604 src[y*stride-1] = p1;
2605 src[y*stride+0] = p2;
2606
2607 ad1= ABS(d1)>>1;
2608
2609 d2= clip((p0-p3)/4, -ad1, ad1);
2610
2611 src[y*stride-2] = p0 - d2;
2612 src[y*stride+1] = p3 + d2;
2613 }
2614}
1457ab52 2615
fdbbf2e0
MN
/**
 * Separable (1,2,1) smoothing of an 8x8 block, done in place.
 * The vertical pass leaves the first and last row unfiltered (scaled by 4
 * to match the filtered rows); the horizontal pass leaves the first and
 * last column unfiltered.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int vert[64]; /* result of the vertical pass, 4x the sample scale */
    int row, col;

    for(row=0; row<8; row++){
        const uint8_t *in= src + row*stride;
        int *out= vert + row*8;
        const int border= (row==0 || row==7);

        for(col=0; col<8; col++){
            if(border) out[col]= 4*in[col];
            else       out[col]= in[col - stride] + 2*in[col] + in[col + stride];
        }
    }

    for(row=0; row<8; row++){
        const int *in= vert + row*8;
        uint8_t *out= src + row*stride;

        out[0]= (in[0] + 2)>>2;
        out[7]= (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            out[col]= (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2642
/**
 * Sum of absolute differences between two 16-pixel-wide blocks of h rows.
 * @param v unused
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2670
/**
 * SAD of a 16-wide block against pix2 averaged (avg2) with its right-hand
 * neighbour; reads 17 samples per row of pix2.
 * @param v unused
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2698
/**
 * SAD of a 16-wide block against pix2 averaged (avg2) with the row below
 * it; reads h+1 rows of pix2.
 * @param v unused
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below= pix2 + line_size;
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2728
/**
 * SAD of a 16-wide block against the avg4 of each 2x2 neighbourhood of
 * pix2; reads 17 samples per row over h+1 rows of pix2.
 * @param v unused
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below= pix2 + line_size;
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2758
/**
 * Sum of absolute differences between two 8-pixel-wide blocks of h rows.
 * @param v unused
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2778
/**
 * SAD of an 8-wide block against pix2 averaged (avg2) with its right-hand
 * neighbour; reads 9 samples per row of pix2.
 * @param v unused
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2798
/**
 * SAD of an 8-wide block against pix2 averaged (avg2) with the row below
 * it; reads h+1 rows of pix2.
 * @param v unused
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below= pix2 + line_size;
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2820
/**
 * SAD of an 8-wide block against the avg4 of each 2x2 neighbourhood of
 * pix2; reads 9 samples per row over h+1 rows of pix2.
 * @param v unused
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below= pix2 + line_size;
    int sad= 0;
    int row, col;

    for(row=0; row<h; row++){
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2842
d4c5d2ad 2843static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
e6a2ac34
MN
2844 int score1=0;
2845 int score2=0;
2846 int x,y;
d4c5d2ad 2847
e6a2ac34
MN
2848 for(y=0; y<h; y++){
2849 for(x=0; x<16; x++){
2850 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2851 }
2852 if(y+1<h){
2853 for(x=0; x<15; x++){
2854 score2+= ABS( s1[x ] - s1[x +stride]
2855 - s1[x+1] + s1[x+1+stride])
2856 -ABS( s2[x ] - s2[x +stride]
2857 - s2[x+1] + s2[x+1+stride]);
2858 }
2859 }
2860 s1+= stride;
2861 s2+= stride;
2862 }
d4c5d2ad
MN
2863
2864 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2865 else return score1 + ABS(score2)*8;
e6a2ac34
MN
2866}
2867
d4c5d2ad 2868static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
e6a2ac34
MN
2869 int score1=0;
2870 int score2=0;
2871 int x,y;
2872
2873 for(y=0; y<h; y++){
2874 for(x=0; x<8; x++){
2875 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2876 }
2877 if(y+1<h){
2878 for(x=0; x<7; x++){
2879 score2+= ABS( s1[x ] - s1[x +stride]
2880 - s1[x+1] + s1[x+1+stride])
2881 -ABS( s2[x ] - s2[x +stride]
2882 - s2[x+1] + s2[x+1+stride]);
2883 }
2884 }
2885 s1+= stride;
2886 s2+= stride;
2887 }
2888
d4c5d2ad
MN
2889 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2890 else return score1 + ABS(score2)*8;
e6a2ac34
MN
2891}
2892
364a1797
MN
2893static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2894 int i;
2895 unsigned int sum=0;
2896
2897 for(i=0; i<8*8; i++){
2898 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2899 int w= weight[i];
2900 b>>= RECON_SHIFT;
2901 assert(-512<b && b<512);
2902
2903 sum += (w*b)*(w*b)>>4;
2904 }
2905 return sum>>2;
2906}
2907
2908static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2909 int i;
2910
2911 for(i=0; i<8*8; i++){
2912 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2913 }
2914}
2915
a9badb51
MN
2916/**
2917 * permutes an 8x8 block.
2a5700de 2918 * @param block the block which will be permuted according to the given permutation vector
a9badb51
MN
2919 * @param permutation the permutation vector
2920 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2a5700de
MN
2921 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2922 * (inverse) permutated to scantable order!
a9badb51 2923 */
0c1a9eda 2924void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
d962f6fd 2925{
7801d21d 2926 int i;
477ab036 2927 DCTELEM temp[64];
7801d21d
MN
2928
2929 if(last<=0) return;
9a7b310d 2930 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
d962f6fd 2931
7801d21d
MN
2932 for(i=0; i<=last; i++){
2933 const int j= scantable[i];
2934 temp[j]= block[j];
2935 block[j]=0;
2936 }
2937
2938 for(i=0; i<=last; i++){
2939 const int j= scantable[i];
2940 const int perm_j= permutation[j];
2941 block[perm_j]= temp[j];
2942 }
d962f6fd 2943}
e0eac44e 2944
622348f9
MN
/* Comparison function that ignores all of its arguments and scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
2948
2949void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2950 int i;
2951
2952 memset(cmp, 0, sizeof(void*)*5);
2953
2954 for(i=0; i<5; i++){
2955 switch(type&0xFF){
2956 case FF_CMP_SAD:
2957 cmp[i]= c->sad[i];
2958 break;
2959 case FF_CMP_SATD:
2960 cmp[i]= c->hadamard8_diff[i];
2961 break;
2962 case FF_CMP_SSE:
2963 cmp[i]= c->sse[i];
2964 break;
2965 case FF_CMP_DCT:
2966 cmp[i]= c->dct_sad[i];
2967 break;
0fd6aea1
MN
2968 case FF_CMP_DCTMAX:
2969 cmp[i]= c->dct_max[i];
2970 break;
622348f9
MN
2971 case FF_CMP_PSNR:
2972 cmp[i]= c->quant_psnr[i];
2973 break;
2974 case FF_CMP_BIT:
2975 cmp[i]= c->bit[i];
2976 break;
2977 case FF_CMP_RD:
2978 cmp[i]= c->rd[i];
2979 break;
2980 case FF_CMP_VSAD:
2981 cmp[i]= c->vsad[i];
2982 break;
2983 case FF_CMP_VSSE:
2984 cmp[i]= c->vsse[i];
2985 break;
2986 case FF_CMP_ZERO:
2987 cmp[i]= zero_cmp;
2988 break;
e6a2ac34
MN
2989 case FF_CMP_NSSE:
2990 cmp[i]= c->nsse[i];
2991 break;
26efc54e
MN
2992 case FF_CMP_W53:
2993 cmp[i]= c->w53[i];
2994 break;
2995 case FF_CMP_W97:
2996 cmp[i]= c->w97[i];
2997 break;
622348f9
MN
2998 default:
2999 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3000 }
3001 }
3002}
3003
2a5700de
MN
3004/**
3005 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3006 */
eb4b3dd3 3007static void clear_blocks_c(DCTELEM *blocks)
649c00c9
MN
3008{
3009 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3010}
3011
11f18faf
MN
/* dst[i] += src[i] for i in [0, w), with the usual 8-bit wrap-around. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i] += src[i];
}
3027
/* dst[i] = src1[i] - src2[i] for i in [0, w), with the usual 8-bit wrap-around. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i] = src1[i] - src2[i];
}
3043
84705403
MN
/**
 * Writes to dst the difference between src2 and a median prediction built
 * from the running left sample (taken from src2), the sample above (src1)
 * and their gradient left + above - above_left, kept to 8 bits.
 * @param left     in: initial left sample, out: last sample of src2
 * @param left_top in: initial above-left sample, out: last sample of src1
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left= *left;
    uint8_t cur_top_left= *left_top;
    int i;

    for(i=0; i<w; i++){
        const int top= src1[i];
        const int pred= mid_pred(cur_left, top, (cur_left + top - cur_top_left)&0xFF);

        cur_top_left= top;
        cur_left= src2[i];
        dst[i]= cur_left - pred;
    }

    *left= cur_left;
    *left_top= cur_top_left;
}
3061
1457ab52
MN
/*
 * Butterfly helpers for the Hadamard transforms.  The statement macros are
 * wrapped in do{}while(0) so they behave as a single statement (invoke them
 * with a trailing ';'), and their temporaries carry a prefix so they cannot
 * capture an argument named a or b.
 */

/* o1 = i1 + i2, o2 = i1 - i2; i1 and i2 are evaluated twice */
#define BUTTERFLY2(o1,o2,i1,i2) \
do{\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
}while(0)

/* in-place butterfly: (x, y) <- (x + y, x - y) */
#define BUTTERFLY1(x,y) \
do{\
    const int bfly_a= (x);\
    const int bfly_b= (y);\
    (x)= bfly_a+bfly_b;\
    (y)= bfly_a-bfly_b;\
}while(0)

/* |x + y| + |x - y|, i.e. the absolute sum of a final butterfly stage */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3076
/**
 * Sum of absolute 8x8 Hadamard-transformed differences between src and dst.
 * @param s unused
 * @param h must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* horizontal pass over the pixel differences, one row at a time */
    for(i=0; i<8; i++){
        int *const t= temp + 8*i;
        const uint8_t *const a= src + stride*i;
        const uint8_t *const b= dst + stride*i;

        BUTTERFLY2(t[0], t[1], a[0]-b[0], a[1]-b[1]);
        BUTTERFLY2(t[2], t[3], a[2]-b[2], a[3]-b[3]);
        BUTTERFLY2(t[4], t[5], a[4]-b[4], a[5]-b[5]);
        BUTTERFLY2(t[6], t[7], a[6]-b[6], a[7]-b[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass; the last stage is folded into the absolute sum */
    for(i=0; i<8; i++){
        int *const t= temp + i;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        sum += BUTTERFLYA(t[8*0], t[8*4])
              +BUTTERFLYA(t[8*1], t[8*5])
              +BUTTERFLYA(t[8*2], t[8*6])
              +BUTTERFLYA(t[8*3], t[8*7]);
    }
    return sum;
}
3128
/**
 * Sum of absolute 8x8 Hadamard coefficients of src itself, with the
 * magnitude of the DC coefficient removed.
 * @param s     unused
 * @param dummy unused
 * @param h     must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int i;

    assert(h==8);

    /* horizontal pass over the pixels, one row at a time */
    for(i=0; i<8; i++){
        int *const t= temp + 8*i;
        const uint8_t *const a= src + stride*i;

        BUTTERFLY2(t[0], t[1], a[0], a[1]);
        BUTTERFLY2(t[2], t[3], a[2], a[3]);
        BUTTERFLY2(t[4], t[5], a[4], a[5]);
        BUTTERFLY2(t[6], t[7], a[6], a[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass; the last stage is folded into the absolute sum */
    for(i=0; i<8; i++){
        int *const t= temp + i;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        sum += BUTTERFLYA(t[8*0], t[8*4])
              +BUTTERFLYA(t[8*1], t[8*5])
              +BUTTERFLYA(t[8*2], t[8*6])
              +BUTTERFLYA(t[8*3], t[8*7]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3176
bb198e19 3177static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 3178 MpegEncContext * const s= (MpegEncContext *)c;
76fbb024
MN
3179 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3180 DCTELEM * const temp= (DCTELEM*)aligned_temp;
1457ab52 3181 int sum=0, i;
bb198e19
MN
3182
3183 assert(h==8);
1457ab52
MN
3184
3185 s->dsp.diff_pixels(temp, src1, src2, stride);
b0368839 3186 s->dsp.fdct(temp);
1457ab52
MN
3187
3188 for(i=0; i<64; i++)
3189 sum+= ABS(temp[i]);
3190
3191 return sum;
3192}
3193
0fd6aea1
MN
3194static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3195 MpegEncContext * const s= (MpegEncContext *)c;
3196 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3197 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3198 int sum=0, i;
3199
3200 assert(h==8);
3201
3202 s->dsp.diff_pixels(temp, src1, src2, stride);
3203 s->dsp.fdct(temp);
3204
3205 for(i=0; i<64; i++)
3206 sum= FFMAX(sum, ABS(temp[i]));
3207
3208 return sum;
3209}
3210
0e15384d 3211void simple_idct(DCTELEM *block); //FIXME
1457ab52 3212
bb198e19 3213static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 3214 MpegEncContext * const s= (MpegEncContext *)c;
76fbb024
MN
3215 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
3216 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3217 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
1457ab52
MN
3218 int sum=0, i;
3219
bb198e19 3220 assert(h==8);
1457ab52
MN
3221 s->mb_intra=0;
3222
3223 s->dsp.diff_pixels(temp, src1, src2, stride);
3224
3225 memcpy(bak, temp, 64*sizeof(DCTELEM));
3226
67725183 3227 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
d50635cd 3228 s->dct_unquantize_inter(s, temp, 0, s->qscale);
1457ab52
MN
3229 simple_idct(temp); //FIXME
3230
3231 for(i=0; i<64; i++)
3232 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3233
3234 return sum;
3235}
3236
bb198e19 3237static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 3238 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 3239 const uint8_t *scantable= s->intra_scantable.permutated;
76fbb024
MN
3240 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3241 uint64_t __align8 aligned_bak[stride];
3242 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3243 uint8_t * const bak= (uint8_t*)aligned_bak;
3a87ac94
MN
3244 int i, last, run, bits, level, distoration, start_i;
3245 const int esc_length= s->ac_esc_length;
3246 uint8_t * length;
3247 uint8_t * last_length;
67725183 3248
bb198e19
MN
3249 assert(h==8);
3250
67725183
MN
3251 for(i=0; i<8; i++){
3252 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3253 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3254 }
3a87ac94 3255
67725183
MN
3256 s->dsp.diff_pixels(temp, src1, src2, stride);
3257
3258 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3259
3260 bits=0;
3a87ac94
MN
3261
3262 if (s->mb_intra) {
67725183 3263 start_i = 1;
3a87ac94
MN
3264 length = s->intra_ac_vlc_length;
3265 last_length= s->intra_ac_vlc_last_length;
67725183 3266 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
MN
3267 } else {
3268 start_i = 0;
3269 length = s->inter_ac_vlc_length;
3270 last_length= s->inter_ac_vlc_last_length;
3271 }
3a87ac94 3272
67725183 3273 if(last>=start_i){
3a87ac94
MN
3274 run=0;
3275 for(i=start_i; i<last; i++){
3276 int j= scantable[i];
3277 level= temp[j];
3278
3279 if(level){
3280 level+=64;
3281 if((level&(~127)) == 0){
3282 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3283 }else
3284 bits+= esc_length;
3285 run=0;
3286 }else
3287 run++;
3288 }
3289 i= scantable[last];
1d0eab1d 3290
3a87ac94 3291 level= temp[i] + 64;
1d0eab1d
MN
3292
3293 assert(level - 64);
3294
3a87ac94
MN
3295 if((level&(~127)) == 0){
3296 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3297 }else
3298 bits+= esc_length;
3299
67725183
MN
3300 }
3301
3302 if(last>=0){
d50635cd
MN
3303 if(s->mb_intra)
3304 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3305 else
3306 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3a87ac94
MN
3307 }
3308
b0368839 3309 s->dsp.idct_add(bak, stride, temp);
3a87ac94 3310
bb198e19 3311 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3a87ac94 3312
67725183 3313 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3a87ac94
MN
3314}
3315
bb198e19 3316static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 3317 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 3318 const uint8_t *scantable= s->intra_scantable.permutated;
76fbb024
MN
3319 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3320 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3a87ac94
MN
3321 int i, last, run, bits, level, start_i;
3322 const int esc_length= s->ac_esc_length;
3323 uint8_t * length;
3324 uint8_t * last_length;
bb198e19
MN
3325
3326 assert(h==8);
67725183
MN
3327
3328 s->dsp.diff_pixels(temp, src1, src2, stride);
3a87ac94 3329
67725183
MN
3330 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3331
3332 bits=0;
3a87ac94
MN
3333
3334 if (s->mb_intra) {
67725183 3335 start_i = 1;
3a87ac94
MN
3336 length = s->intra_ac_vlc_length;
3337 last_length= s->intra_ac_vlc_last_length;
67725183 3338 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
MN
3339 } else {
3340 start_i = 0;
3341 length = s->inter_ac_vlc_length;
3342 last_length= s->inter_ac_vlc_last_length;
3343 }
3a87ac94 3344
67725183 3345 if(last>=start_i){
3a87ac94
MN
3346 run=0;
3347 for(i=start_i; i<last; i++){
3348 int j= scantable[i];
3349 level= temp[j];
3350
3351 if(level){
3352 level+=64;
3353 if((level&(~127)) == 0){
3354 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3355 }else
3356 bits+= esc_length;
3357 run=0;
3358 }else
3359 run++;
3360 }
3361 i= scantable[last];
67725183
MN
3362
3363 level= temp[i] + 64;
3a87ac94 3364
67725183 3365 assert(level - 64);
3a87ac94 3366
3a87ac94
MN
3367 if((level&(~127)) == 0){
3368 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3369 }else
3370 bits+= esc_length;
3371 }
3372
3373 return bits;
3374}
3375
622348f9
MN
/**
 * Sum of absolute differences between vertically adjacent samples of a
 * 16-wide block with h rows.
 * @param c     unused
 * @param dummy unused
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score= 0;
    int x, y;

    for(y=1; y<h; y++){
        const uint8_t *const below= s + stride;

        for(x=0; x<16; x++)
            score+= ABS(s[x] - below[x]);
        s+= stride;
    }

    return score;
}
3390
/**
 * Sum of absolute vertical differences of the residual s1 - s2 over a
 * 16-wide block with h rows.
 * @param c unused
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score= 0;
    int x, y;

    for(y=1; y<h; y++){
        const uint8_t *const below1= s1 + stride;
        const uint8_t *const below2= s2 + stride;

        for(x=0; x<16; x++)
            score+= ABS(s1[x] - s2[x] - below1[x] + below2[x]);
        s1+= stride;
        s2+= stride;
    }

    return score;
}
3405
3406#define SQ(a) ((a)*(a))
/**
 * Sum of squared differences between vertically adjacent samples of a
 * 16-wide block with h rows.
 * @param c     unused
 * @param dummy unused
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score= 0;
    int x, y;

    for(y=1; y<h; y++){
        const uint8_t *const below= s + stride;

        for(x=0; x<16; x++){
            const int d= s[x] - below[x];
            score+= d*d;
        }
        s+= stride;
    }

    return score;
}
3421
/**
 * Sum of squared vertical differences of the residual s1 - s2 over a
 * 16-wide block with h rows.
 * @param c unused
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score= 0;
    int x, y;

    for(y=1; y<h; y++){
        const uint8_t *const below1= s1 + stride;
        const uint8_t *const below2= s2 + stride;

        for(x=0; x<16; x++){
            const int d= s1[x] - s2[x] - below1[x] + below2[x];
            score+= d*d;
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
3436
/* Each WARPER8_16_SQ(name8x8, name16) line names an 8x8 comparison routine
 * and a 16-wide counterpart. The *16_c names listed here are the ones
 * dsputil_init() installs via SET_CMP_FUNC and hadamard8_diff[4].
 * NOTE(review): the macro body is defined outside this part of the file;
 * presumably it defines name16 in terms of name8x8 - confirm against the
 * macro definition. */
bb198e19 3437WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
622348f9 3438WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
bb198e19 3439WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
0fd6aea1 3440WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
bb198e19
MN
3441WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3442WARPER8_16_SQ(rd8x8_c, rd16_c)
3443WARPER8_16_SQ(bit8x8_c, bit16_c)
1457ab52 3444
b0368839
MN
3445/* XXX: those functions should be suppressed ASAP when all IDCTs are
3446 converted */
3447static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3448{
3449 j_rev_dct (block);
3450 put_pixels_clamped_c(block, dest, line_size);
3451}
3452static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3453{
3454 j_rev_dct (block);
3455 add_pixels_clamped_c(block, dest, line_size);
3456}
3457
178fcca8
MN
3458static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3459{
3460 j_rev_dct4 (block);
3461 put_pixels_clamped4_c(block, dest, line_size);
3462}
3463static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3464{
3465 j_rev_dct4 (block);
3466 add_pixels_clamped4_c(block, dest, line_size);
3467}
3468
9ca358b9
MN
3469static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3470{
3471 j_rev_dct2 (block);
3472 put_pixels_clamped2_c(block, dest, line_size);
3473}
3474static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3475{
3476 j_rev_dct2 (block);
3477 add_pixels_clamped2_c(block, dest, line_size);
3478}
3479
1aa8c57b
MN
3480static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3481{
3482 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3483
3484 dest[0] = cm[(block[0] + 4)>>3];
3485}
3486static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3487{
3488 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3489
3490 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3491}
3492
59cf08ce
FB
3493/* init static data */
3494void dsputil_static_init(void)
e0eac44e 3495{
d2975f8d 3496 int i;
e0eac44e 3497
59cf08ce
FB
3498 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3499 for(i=0;i<MAX_NEG_CROP;i++) {
3500 cropTbl[i] = 0;
3501 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3502 }
3503
3504 for(i=0;i<512;i++) {
3505 squareTbl[i] = (i - 256) * (i - 256);
3506 }
3507
3508 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3509}
92ddb692 3510
92ddb692 3511
59cf08ce
FB
3512void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3513{
3514 int i;
de6d9b64 3515
b0368839 3516#ifdef CONFIG_ENCODERS
10acc479 3517 if(avctx->dct_algo==FF_DCT_FASTINT) {
b0368839 3518 c->fdct = fdct_ifast;
48b1f800 3519 c->fdct248 = fdct_ifast248;
10acc479
RS
3520 }
3521 else if(avctx->dct_algo==FF_DCT_FAAN) {
65e4c8c9 3522 c->fdct = ff_faandct;
48b1f800 3523 c->fdct248 = ff_faandct248;
10acc479
RS
3524 }
3525 else {
b0368839 3526 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
10acc479
RS
3527 c->fdct248 = ff_fdct248_islow;
3528 }
b0368839
MN
3529#endif //CONFIG_ENCODERS
3530
178fcca8 3531 if(avctx->lowres==1){
0fa8158d
MN
3532 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3533 c->idct_put= ff_jref_idct4_put;
3534 c->idct_add= ff_jref_idct4_add;
3535 }else{
3536 c->idct_put= ff_h264_lowres_idct_put_c;
3537 c->idct_add= ff_h264_lowres_idct_add_c;
3538 }
178fcca8 3539 c->idct = j_rev_dct4;
b0368839 3540 c->idct_permutation_type= FF_NO_IDCT_PERM;
9ca358b9
MN
3541 }else if(avctx->lowres==2){
3542 c->idct_put= ff_jref_idct2_put;
3543 c->idct_add= ff_jref_idct2_add;
3544 c->idct = j_rev_dct2;
3545 c->idct_permutation_type= FF_NO_IDCT_PERM;
1aa8c57b
MN
3546 }else if(avctx->lowres==3){
3547 c->idct_put= ff_jref_idct1_put;
3548 c->idct_add= ff_jref_idct1_add;
3549 c->idct = j_rev_dct1;
3550 c->idct_permutation_type= FF_NO_IDCT_PERM;
178fcca8
MN
3551 }else{
3552 if(avctx->idct_algo==FF_IDCT_INT){
3553 c->idct_put= ff_jref_idct_put;
3554 c->idct_add= ff_jref_idct_add;
3555 c->idct = j_rev_dct;
3556 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3557 }else{ //accurate/default
3558 c->idct_put= simple_idct_put;
3559 c->idct_add= simple_idct_add;
3560 c->idct = simple_idct;
3561 c->idct_permutation_type= FF_NO_IDCT_PERM;
3562 }
b0368839
MN
3563 }
3564
0fa8158d
MN
3565 c->h264_idct_add= ff_h264_idct_add_c;
3566
44cb64ee
MM
3567 /* VP3 DSP support */
3568 c->vp3_dsp_init = vp3_dsp_init_c;
116824d0 3569 c->vp3_idct = vp3_idct_c;
44cb64ee 3570
eb4b3dd3
ZK
3571 c->get_pixels = get_pixels_c;
3572 c->diff_pixels = diff_pixels_c;
3573 c->put_pixels_clamped = put_pixels_clamped_c;
f9ed9d85 3574 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
eb4b3dd3
ZK
3575 c->add_pixels_clamped = add_pixels_clamped_c;
3576 c->gmc1 = gmc1_c;
3577 c->gmc = gmc_c;
3578 c->clear_blocks = clear_blocks_c;
3579 c->pix_sum = pix_sum_c;
3580 c->pix_norm1 = pix_norm1_c;
3581
45553457 3582 /* TODO [0] 16 [1] 8 */
bb198e19
MN
3583 c->pix_abs[0][0] = pix_abs16_c;
3584 c->pix_abs[0][1] = pix_abs16_x2_c;
3585 c->pix_abs[0][2] = pix_abs16_y2_c;
3586 c->pix_abs[0][3] = pix_abs16_xy2_c;
3587 c->pix_abs[1][0] = pix_abs8_c;
3588 c->pix_abs[1][1] = pix_abs8_x2_c;
3589 c->pix_abs[1][2] = pix_abs8_y2_c;
3590 c->pix_abs[1][3] = pix_abs8_xy2_c;
eb4b3dd3 3591
45553457
ZK
3592#define dspfunc(PFX, IDX, NUM) \
3593 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3594 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3595 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3596 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3597
3598 dspfunc(put, 0, 16);
3599 dspfunc(put_no_rnd, 0, 16);
3600 dspfunc(put, 1, 8);
3601 dspfunc(put_no_rnd, 1, 8);
669ac79c
MN
3602 dspfunc(put, 2, 4);
3603 dspfunc(put, 3, 2);
45553457
ZK
3604
3605 dspfunc(avg, 0, 16);
3606 dspfunc(avg_no_rnd, 0, 16);
3607 dspfunc(avg, 1, 8);
3608 dspfunc(avg_no_rnd, 1, 8);
da3b9756
MM
3609 dspfunc(avg, 2, 4);
3610 dspfunc(avg, 3, 2);
45553457
ZK
3611#undef dspfunc
3612
c0a0170c
MN
3613 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3614 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3615
669ac79c
MN
3616 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3617 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3618 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3619 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3620 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3621 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3622 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3623 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3624 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3625
da3b9756
MM
3626 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3627 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3628 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3629 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3630 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3631 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3632 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3633 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3634 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3635
45553457
ZK
3636#define dspfunc(PFX, IDX, NUM) \
3637 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3638 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3639 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3640 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3641 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3642 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3643 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3644 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3645 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3646 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3647 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3648 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3649 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3650 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3651 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3652 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3653
3654 dspfunc(put_qpel, 0, 16);
3655 dspfunc(put_no_rnd_qpel, 0, 16);
3656
3657 dspfunc(avg_qpel, 0, 16);
3658 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3659
3660 dspfunc(put_qpel, 1, 8);
3661 dspfunc(put_no_rnd_qpel, 1, 8);
3662
3663 dspfunc(avg_qpel, 1, 8);
3664 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
0da71265
MN
3665
3666 dspfunc(put_h264_qpel, 0, 16);
3667 dspfunc(put_h264_qpel, 1, 8);
3668 dspfunc(put_h264_qpel, 2, 4);
3669 dspfunc(avg_h264_qpel, 0, 16);
3670 dspfunc(avg_h264_qpel, 1, 8);
3671 dspfunc(avg_h264_qpel, 2, 4);
3672
45553457 3673#undef dspfunc
0da71265
MN
3674 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3675 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3676 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3677 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3678 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3679 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
c9a2ebc4 3680
9f2d1b4f
LM
3681 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3682 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3683 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3684 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3685 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3686 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3687 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3688 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3689 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3690 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3691 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3692 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3693 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3694 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3695 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3696 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3697 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3698 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3699 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3700 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3701
1457ab52
MN
3702 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3703 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3704 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3705 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3706 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3707 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3708 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3709 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
669ac79c 3710
bb198e19
MN
3711#define SET_CMP_FUNC(name) \
3712 c->name[0]= name ## 16_c;\
3713 c->name[1]= name ## 8x8_c;
3714
3715 SET_CMP_FUNC(hadamard8_diff)
622348f9 3716 c->hadamard8_diff[4]= hadamard8_intra16_c;
bb198e19 3717 SET_CMP_FUNC(dct_sad)
0fd6aea1 3718 SET_CMP_FUNC(dct_max)
bb198e19
MN
3719 c->sad[0]= pix_abs16_c;
3720 c->sad[1]= pix_abs8_c;
3721 c->sse[0]= sse16_c;
3722 c->sse[1]= sse8_c;
26efc54e 3723 c->sse[2]= sse4_c;
bb198e19
MN
3724 SET_CMP_FUNC(quant_psnr)
3725 SET_CMP_FUNC(rd)
3726 SET_CMP_FUNC(bit)
622348f9
MN
3727 c->vsad[0]= vsad16_c;
3728 c->vsad[4]= vsad_intra16_c;
3729 c->vsse[0]= vsse16_c;
3730 c->vsse[4]= vsse_intra16_c;
e6a2ac34
MN
3731 c->nsse[0]= nsse16_c;
3732 c->nsse[1]= nsse8_c;
26efc54e
MN
3733 c->w53[0]= w53_16_c;
3734 c->w53[1]= w53_8_c;
3735 c->w97[0]= w97_16_c;
3736 c