wmv2: move IDCT to its own DSP context.
/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * DSP utils
 */

#include <assert.h> /* for assert() in try_8x8basis_c() */

#include "libavutil/imgutils.h"
#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "mpegvideo.h"
#include "config.h"

uint8_t  ff_cropTbl[256 + 2 * MAX_NEG_CROP] = { 0, };
uint32_t ff_squareTbl[512] = { 0, };
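/* Both tables are filled in at library init time. They are meant to be used
 * with an offset so that negative arguments index correctly: for instance
 * "uint32_t *sq = ff_squareTbl + 256;" makes sq[d] == d*d valid for d in
 * [-256, 255], which covers any difference of two 8-bit pixel values. */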

#define BIT_DEPTH 9
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 10
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 8
#include "dsputil_template.c"

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f, depending on the CPU's native word size
#define pb_7f (~0UL / 255 * 0x7f)
#define pb_80 (~0UL / 255 * 0x80)
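/* How these constants work: ~0UL/255 is 0x0101...01 at the machine's native
 * word size, so multiplying it by a byte value replicates that byte into
 * every lane. On a 32-bit machine, for example:
 *     ~0UL / 255        == 0x01010101
 *     ~0UL / 255 * 0x7f == 0x7f7f7f7f
 * This enables byte-wise SIMD-within-a-register tricks on plain integers. */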

const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
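/* A scan table maps scan order to raster position: ff_zigzag_direct[i] is
 * the raster index (row * 8 + column) of the i-th coefficient visited. For
 * example, ff_zigzag_direct[2] == 8, i.e. the third coefficient in zigzag
 * order lies at row 1, column 0 of the 8x8 block. */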

/* Specific zigzag scan for 248 IDCT. NOTE that unlike the
 * specification, we interleave the fields. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];

const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

void ff_init_scantable(uint8_t *permutation, ScanTable *st,
                       const uint8_t *src_scantable)
{
    int i;
    int end;

    st->scantable = src_scantable;

    for (i = 0; i < 64; i++) {
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
    }

    end = -1;
    for (i = 0; i < 64; i++) {
        int j;
        j = st->permutated[i];
        if (j > end)
            end = j;
        st->raster_end[i] = end;
    }
}

void ff_init_scantable_permutation(uint8_t *idct_permutation,
                                   int idct_permutation_type)
{
    int i;

    switch (idct_permutation_type) {
    case FF_NO_IDCT_PERM:
        for (i = 0; i < 64; i++)
            idct_permutation[i] = i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for (i = 0; i < 64; i++)
            idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for (i = 0; i < 64; i++)
            idct_permutation[i] = simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for (i = 0; i < 64; i++)
            idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for (i = 0; i < 64; i++)
            idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
        break;
    case FF_SSE2_IDCT_PERM:
        for (i = 0; i < 64; i++)
            idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
        break;
    default:
        av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
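/* A small worked example for FF_TRANSPOSE_IDCT_PERM: the expression
 * ((i & 7) << 3) | (i >> 3) swaps the row and column of raster index i, so
 * index 10 (row 1, column 2) maps to 17 (row 2, column 1). IDCTs that
 * operate on transposed input can then consume the coefficients without an
 * extra transpose pass. */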

static int pix_sum_c(uint8_t *pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t *pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if HAVE_FAST_64BIT
            register uint64_t x = *(uint64_t *) pix;
            s += sq[x         & 0xff];
            s += sq[(x >>  8) & 0xff];
            s += sq[(x >> 16) & 0xff];
            s += sq[(x >> 24) & 0xff];
            s += sq[(x >> 32) & 0xff];
            s += sq[(x >> 40) & 0xff];
            s += sq[(x >> 48) & 0xff];
            s += sq[(x >> 56) & 0xff];
#else
            register uint32_t x = *(uint32_t *) pix;
            s += sq[x         & 0xff];
            s += sq[(x >>  8) & 0xff];
            s += sq[(x >> 16) & 0xff];
            s += sq[(x >> 24) & 0xff];
            x = *(uint32_t *) (pix + 4);
            s += sq[x         & 0xff];
            s += sq[(x >>  8) & 0xff];
            s += sq[(x >> 16) & 0xff];
            s += sq[(x >> 24) & 0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
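/* The word-sized loads above fetch 8 pixels at once and the shifts peel off
 * one byte per square-table lookup, trading eight single-byte loads for one
 * wide load plus cheap register arithmetic. This assumes the pixel buffers
 * passed here are suitably aligned (or that the target tolerates the
 * resulting accesses), which holds in practice for the callers. */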

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int i;

    for (i = 0; i + 8 <= w; i += 8) {
        dst[i + 0] = av_bswap32(src[i + 0]);
        dst[i + 1] = av_bswap32(src[i + 1]);
        dst[i + 2] = av_bswap32(src[i + 2]);
        dst[i + 3] = av_bswap32(src[i + 3]);
        dst[i + 4] = av_bswap32(src[i + 4]);
        dst[i + 5] = av_bswap32(src[i + 5]);
        dst[i + 6] = av_bswap32(src[i + 6]);
        dst[i + 7] = av_bswap32(src[i + 7]);
    }
    for (; i < w; i++) {
        dst[i + 0] = av_bswap32(src[i + 0]);
    }
}

static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}

static int sse4_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride)
{
    int i;

    /* read the pixels */
    for (i = 0; i < 8; i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}

static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;

    /* copy the coefficients, clamped to the 0..255 pixel range */
    for (i = 0; i < 8; i++) {
        pixels[0] = av_clip_uint8(block[0]);
        pixels[1] = av_clip_uint8(block[1]);
        pixels[2] = av_clip_uint8(block[2]);
        pixels[3] = av_clip_uint8(block[3]);
        pixels[4] = av_clip_uint8(block[4]);
        pixels[5] = av_clip_uint8(block[5]);
        pixels[6] = av_clip_uint8(block[6]);
        pixels[7] = av_clip_uint8(block[7]);

        pixels += line_size;
        block  += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t) (*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;

    /* add the coefficients to the pixels, clamping to 0..255 */
    for (i = 0; i < 8; i++) {
        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
        pixels[4] = av_clip_uint8(pixels[4] + block[4]);
        pixels[5] = av_clip_uint8(pixels[5] + block[5]);
        pixels[6] = av_clip_uint8(pixels[6] + block[6]);
        pixels[7] = av_clip_uint8(pixels[7] + block[7]);
        pixels += line_size;
        block  += 8;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum = 0, i;
    for (i = 0; i < 64; i++)
        sum += FFABS(block[i]);
    return sum;
}

static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}

static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}

#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
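/* Rounding check for the averaging macros: avg2(a, b) computes
 * (a + b + 1) >> 1, i.e. the average with ties rounded up, so
 * avg2(1, 2) == 2. Likewise avg4() averages four samples with a +2 bias
 * before the shift by 2. These match the half-pel rounding used by the
 * motion-compensation and SAD helpers below. */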

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
                   int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int i;

    for (i = 0; i < h; i++) {
        dst[0] = (A * src[0] + B * src[1] + C * src[stride + 0] + D * src[stride + 1] + rounder) >> 8;
        dst[1] = (A * src[1] + B * src[2] + C * src[stride + 1] + D * src[stride + 2] + rounder) >> 8;
        dst[2] = (A * src[2] + B * src[3] + C * src[stride + 2] + D * src[stride + 3] + rounder) >> 8;
        dst[3] = (A * src[3] + B * src[4] + C * src[stride + 3] + D * src[stride + 4] + rounder) >> 8;
        dst[4] = (A * src[4] + B * src[5] + C * src[stride + 4] + D * src[stride + 5] + rounder) >> 8;
        dst[5] = (A * src[5] + B * src[6] + C * src[stride + 5] + D * src[stride + 6] + rounder) >> 8;
        dst[6] = (A * src[6] + B * src[7] + C * src[stride + 6] + D * src[stride + 7] + rounder) >> 8;
        dst[7] = (A * src[7] + B * src[8] + C * src[stride + 7] + D * src[stride + 8] + rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
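/* gmc1_c is bilinear interpolation at 1/16-pel precision: the four weights
 * satisfy A + B + C + D == (16 - x16 + x16) * (16 - y16 + y16) == 256, which
 * is why the weighted sum (plus the rounder) is normalized with ">> 8". */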

void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r,
              int width, int height)
{
    int y, vx, vy;
    const int s = 1 << shift;

    width--;
    height--;

    for (y = 0; y < h; y++) {
        int x;

        vx = ox;
        vy = oy;
        for (x = 0; x < 8; x++) { // XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x  = vx >> 16;
            src_y  = vy >> 16;
            frac_x = src_x & (s - 1);
            frac_y = src_y & (s - 1);
            src_x >>= shift;
            src_y >>= shift;

            if ((unsigned) src_x < width) {
                if ((unsigned) src_y < height) {
                    index = src_x + src_y * stride;
                    dst[y * stride + x] = ((src[index]              * (s - frac_x) +
                                            src[index + 1]          *      frac_x) * (s - frac_y) +
                                           (src[index + stride]     * (s - frac_x) +
                                            src[index + stride + 1] *      frac_x) *      frac_y +
                                           r) >> (shift * 2);
                } else {
                    index = src_x + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = ((src[index]     * (s - frac_x) +
                                            src[index + 1] *      frac_x) * s +
                                           r) >> (shift * 2);
                }
            } else {
                if ((unsigned) src_y < height) {
                    index = av_clip(src_x, 0, width) + src_y * stride;
                    dst[y * stride + x] = ((src[index]          * (s - frac_y) +
                                            src[index + stride] *      frac_y) * s +
                                           r) >> (shift * 2);
                } else {
                    index = av_clip(src_x, 0, width) + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = src[index];
                }
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
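/* ff_gmc_c evaluates an affine motion field in fixed point: within a row the
 * sampling position advances by (dxx, dyx) per pixel, and the row origin
 * (ox, oy) advances by (dxy, dyy) between rows. The clipping branches handle
 * samples falling outside the reference picture by clamping to the nearest
 * edge pixel, dropping the interpolation along the clamped axis. */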

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    switch (width) {
    case 2:  put_pixels2_8_c (dst, src, stride, height); break;
    case 4:  put_pixels4_8_c (dst, src, stride, height); break;
    case 8:  put_pixels8_8_c (dst, src, stride, height); break;
    case 16: put_pixels16_8_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683 * (2 * src[j] + src[j + 1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
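/* The constant 683 implements division by 3 in fixed point:
 * 683 == round(2048 / 3), so (683 * (2*a + b + 1)) >> 11 closely
 * approximates (2*a + b) / 3 with rounding - the third-pel ("tpel")
 * interpolation used by SVQ3. The 2731 constant in the two-dimensional
 * cases below is the analogous round(32768 / 12). */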

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683 * (src[j] + 2 * src[j + 1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683 * (2 * src[j] + src[j + stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731 * (4 * src[j] + 3 * src[j + 1] + 3 * src[j + stride] + 2 * src[j + stride + 1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731 * (3 * src[j] + 2 * src[j + 1] + 4 * src[j + stride] + 3 * src[j + stride + 1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683 * (src[j] + 2 * src[j + stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731 * (3 * src[j] + 4 * src[j + 1] + 2 * src[j + stride] + 3 * src[j + stride + 1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731 * (2 * src[j] + 3 * src[j + 1] + 3 * src[j + stride] + 4 * src[j + stride + 1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    switch (width) {
    case 2:  avg_pixels2_8_c (dst, src, stride, height); break;
    case 4:  avg_pixels4_8_c (dst, src, stride, height); break;
    case 8:  avg_pixels8_8_c (dst, src, stride, height); break;
    case 16: avg_pixels16_8_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683 * (2 * src[j] + src[j + 1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683 * (src[j] + 2 * src[j + 1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683 * (2 * src[j] + src[j + stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731 * (4 * src[j] + 3 * src[j + 1] + 3 * src[j + stride] + 2 * src[j + stride + 1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731 * (3 * src[j] + 2 * src[j + 1] + 4 * src[j + stride] + 3 * src[j + stride + 1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683 * (src[j] + 2 * src[j + stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731 * (3 * src[j] + 4 * src[j + 1] + 2 * src[j + stride] + 3 * src[j + stride + 1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;
    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731 * (2 * src[j] + 3 * src[j + 1] + 3 * src[j + stride] + 4 * src[j + stride + 1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

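/* The macro below expands the MPEG-4 quarter-pel 8-tap half-pel filter with
 * coefficients {-1, 3, -6, 20, 20, -6, 3, -1}. The taps sum to 32, and the
 * OP macros normalize with "(b + 16) >> 5" (or "+ 15" for the no-rounding
 * variants). At block edges the missing samples are mirrored, which is why
 * the last few lines of each expansion reuse interior source indices. */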
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}

#define op_avg(a, b)        a = (((a) + cm[((b) + 16) >> 5] + 1) >> 1)
#define op_avg_no_rnd(a, b) a = (((a) + cm[((b) + 15) >> 5]) >> 1)
#define op_put(a, b)        a = cm[((b) + 16) >> 5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15) >> 5]
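/* The two rounding variants differ only in the bias before the final shift:
 * "(b + 16) >> 5" rounds to nearest while "(b + 15) >> 5" rounds ties
 * downward, matching the no-rounding mode of MPEG-4 motion compensation. */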

QPEL_MC(0, put_,        _,        op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_,        _,        op_avg)
//QPEL_MC(1, avg_no_rnd, _, op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

#define put_qpel8_mc00_c         ff_put_pixels8x8_c
#define avg_qpel8_mc00_c         ff_avg_pixels8x8_c
#define put_qpel16_mc00_c        ff_put_pixels16x16_c
#define avg_qpel16_mc00_c        ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c

static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src,
                                  int dstStride, int srcStride, int h)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for (i = 0; i < h; i++) {
        dst[0] = cm[(9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4];
        dst[1] = cm[(9 * (src[1] + src[2]) - (src[ 0] + src[3]) + 8) >> 4];
        dst[2] = cm[(9 * (src[2] + src[3]) - (src[ 1] + src[4]) + 8) >> 4];
        dst[3] = cm[(9 * (src[3] + src[4]) - (src[ 2] + src[5]) + 8) >> 4];
        dst[4] = cm[(9 * (src[4] + src[5]) - (src[ 3] + src[6]) + 8) >> 4];
        dst[5] = cm[(9 * (src[5] + src[6]) - (src[ 4] + src[7]) + 8) >> 4];
        dst[6] = cm[(9 * (src[6] + src[7]) - (src[ 5] + src[8]) + 8) >> 4];
        dst[7] = cm[(9 * (src[7] + src[8]) - (src[ 6] + src[9]) + 8) >> 4];
        dst += dstStride;
        src += srcStride;
    }
}
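/* The WMV2 mspel filter is a 4-tap half-pel filter with coefficients
 * {-1, 9, 9, -1}; the taps sum to 16, hence the "+ 8) >> 4" normalization
 * with round-to-nearest, followed by the crop-table clamp to 0..255. */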

#if CONFIG_RV40_DECODER
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}

void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */

static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src,
                                  int dstStride, int srcStride, int w)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for (i = 0; i < w; i++) {
        const int src_1 = src[-srcStride];
        const int src0  = src[0];
        const int src1  = src[srcStride];
        const int src2  = src[2 * srcStride];
        const int src3  = src[3 * srcStride];
        const int src4  = src[4 * srcStride];
        const int src5  = src[5 * srcStride];
        const int src6  = src[6 * srcStride];
        const int src7  = src[7 * srcStride];
        const int src8  = src[8 * srcStride];
        const int src9  = src[9 * srcStride];
        dst[0 * dstStride] = cm[(9 * (src0 + src1) - (src_1 + src2) + 8) >> 4];
        dst[1 * dstStride] = cm[(9 * (src1 + src2) - (src0  + src3) + 8) >> 4];
        dst[2 * dstStride] = cm[(9 * (src2 + src3) - (src1  + src4) + 8) >> 4];
        dst[3 * dstStride] = cm[(9 * (src3 + src4) - (src2  + src5) + 8) >> 4];
        dst[4 * dstStride] = cm[(9 * (src4 + src5) - (src3  + src6) + 8) >> 4];
        dst[5 * dstStride] = cm[(9 * (src5 + src6) - (src4  + src7) + 8) >> 4];
        dst[6 * dstStride] = cm[(9 * (src6 + src7) - (src5  + src8) + 8) >> 4];
        dst[7 * dstStride] = cm[(9 * (src7 + src8) - (src6  + src9) + 8) >> 4];
        src++;
        dst++;
    }
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH + 8, stride, 8, 8);
}
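/* Composition note for the mc12/mc32/mc22 cases above: the horizontal pass
 * runs over 11 rows starting one row above the block (src - stride), so
 * halfH holds rows -1..9 of horizontally filtered data; "halfH + 8" then
 * addresses row 0 as input for the vertical pass, which needs one row of
 * context above and two below. */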

static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        int x;
        const int strength = ff_h263_loop_filter_strength[qscale];

        for (x = 0; x < 8; x++) {
            int d1, d2, ad1;
            int p0 = src[x - 2 * stride];
            int p1 = src[x - 1 * stride];
            int p2 = src[x + 0 * stride];
            int p3 = src[x + 1 * stride];
            int d  = (p0 - p3 + 4 * (p2 - p1)) / 8;

            if      (d < -2 * strength) d1 = 0;
            else if (d < -    strength) d1 = -2 * strength - d;
            else if (d <      strength) d1 = d;
            else if (d <  2 * strength) d1 = 2 * strength - d;
            else                        d1 = 0;

            p1 += d1;
            p2 -= d1;
            if (p1 & 256) p1 = ~(p1 >> 31);
            if (p2 & 256) p2 = ~(p2 >> 31);

            src[x - 1 * stride] = p1;
            src[x + 0 * stride] = p2;

            ad1 = FFABS(d1) >> 1;

            d2 = av_clip((p0 - p3) / 4, -ad1, ad1);

            src[x - 2 * stride] = p0 - d2;
            src[x + 1 * stride] = p3 + d2;
        }
    }
}
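/* Two tricks in the filter above: the piecewise-linear ramp for d1 first
 * follows d and then ramps back down to zero, so strong edges (large |d|)
 * are left untouched while blocking artifacts get smoothed. The clamp
 * "if (p1 & 256) p1 = ~(p1 >> 31);" relies on p1 only leaving the 0..255
 * range by a small amount: bit 8 is set both for 256..511 and (via two's
 * complement) for small negative values, and ~(p1 >> 31) yields all-ones
 * (255 once stored to a byte) for positive overflow and 0 for negatives. */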

static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale)
{
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        int y;
        const int strength = ff_h263_loop_filter_strength[qscale];

        for (y = 0; y < 8; y++) {
            int d1, d2, ad1;
            int p0 = src[y * stride - 2];
            int p1 = src[y * stride - 1];
            int p2 = src[y * stride + 0];
            int p3 = src[y * stride + 1];
            int d  = (p0 - p3 + 4 * (p2 - p1)) / 8;

            if      (d < -2 * strength) d1 = 0;
            else if (d < -    strength) d1 = -2 * strength - d;
            else if (d <      strength) d1 = d;
            else if (d <  2 * strength) d1 = 2 * strength - d;
            else                        d1 = 0;

            p1 += d1;
            p2 -= d1;
            if (p1 & 256) p1 = ~(p1 >> 31);
            if (p2 & 256) p2 = ~(p2 >> 31);

            src[y * stride - 1] = p1;
            src[y * stride + 0] = p2;

            ad1 = FFABS(d1) >> 1;

            d2 = av_clip((p0 - p3) / 4, -ad1, ad1);

            src[y * stride - 2] = p0 - d2;
            src[y * stride + 1] = p3 + d2;
        }
    }
}

static void h261_loop_filter_c(uint8_t *src, int stride)
{
    int x, y, xy, yz;
    int temp[64];

    for (x = 0; x < 8; x++) {
        temp[x        ] = 4 * src[x];
        temp[x + 7 * 8] = 4 * src[x + 7 * stride];
    }
    for (y = 1; y < 7; y++) {
        for (x = 0; x < 8; x++) {
            xy = y * stride + x;
            yz = y * 8      + x;
            temp[yz] = src[xy - stride] + 2 * src[xy] + src[xy + stride];
        }
    }

    for (y = 0; y < 8; y++) {
        src[    y * stride] = (temp[    y * 8] + 2) >> 2;
        src[7 + y * stride] = (temp[7 + y * 8] + 2) >> 2;
        for (x = 1; x < 7; x++) {
            xy = y * stride + x;
            yz = y * 8      + x;
            src[xy] = (temp[yz - 1] + 2 * temp[yz] + temp[yz + 1] + 8) >> 4;
        }
    }
}
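/* h261_loop_filter_c applies a separable [1 2 1]/4 smoothing filter inside
 * the 8x8 block: the first two loops run it vertically into temp[] (the top
 * and bottom rows are passed through, pre-scaled by 4), and the final loop
 * runs it horizontally while normalizing - ">> 2" on the pass-through
 * columns and ">> 4" where both passes applied. */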

static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for (i = 0; i < h; i++) {
        s += abs(pix1[0]  - pix2[0]);
        s += abs(pix1[1]  - pix2[1]);
        s += abs(pix1[2]  - pix2[2]);
        s += abs(pix1[3]  - pix2[3]);
        s += abs(pix1[4]  - pix2[4]);
        s += abs(pix1[5]  - pix2[5]);
        s += abs(pix1[6]  - pix2[6]);
        s += abs(pix1[7]  - pix2[7]);
        s += abs(pix1[8]  - pix2[8]);
        s += abs(pix1[9]  - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
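/* pix_abs16_c is a plain 16-wide SAD (sum of absolute differences). The
 * _x2, _y2 and _xy2 variants below compare against the reference shifted by
 * half a pixel horizontally, vertically or both, using the avg2()/avg4()
 * interpolation defined above - these are the comparison functions used for
 * half-pel motion estimation. */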

static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for (i = 0; i < h; i++) {
        s += abs(pix1[0]  - avg2(pix2[0],  pix2[1]));
        s += abs(pix1[1]  - avg2(pix2[1],  pix2[2]));
        s += abs(pix1[2]  - avg2(pix2[2],  pix2[3]));
        s += abs(pix1[3]  - avg2(pix2[3],  pix2[4]));
        s += abs(pix1[4]  - avg2(pix2[4],  pix2[5]));
        s += abs(pix1[5]  - avg2(pix2[5],  pix2[6]));
        s += abs(pix1[6]  - avg2(pix2[6],  pix2[7]));
        s += abs(pix1[7]  - avg2(pix2[7],  pix2[8]));
        s += abs(pix1[8]  - avg2(pix2[8],  pix2[9]));
        s += abs(pix1[9]  - avg2(pix2[9],  pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for (i = 0; i < h; i++) {
        s += abs(pix1[0]  - avg2(pix2[0],  pix3[0]));
        s += abs(pix1[1]  - avg2(pix2[1],  pix3[1]));
        s += abs(pix1[2]  - avg2(pix2[2],  pix3[2]));
        s += abs(pix1[3]  - avg2(pix2[3],  pix3[3]));
        s += abs(pix1[4]  - avg2(pix2[4],  pix3[4]));
        s += abs(pix1[5]  - avg2(pix2[5],  pix3[5]));
        s += abs(pix1[6]  - avg2(pix2[6],  pix3[6]));
        s += abs(pix1[7]  - avg2(pix2[7],  pix3[7]));
        s += abs(pix1[8]  - avg2(pix2[8],  pix3[8]));
        s += abs(pix1[9]  - avg2(pix2[9],  pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for (i = 0; i < h; i++) {
        s += abs(pix1[0]  - avg4(pix2[0],  pix2[1],  pix3[0],  pix3[1]));
        s += abs(pix1[1]  - avg4(pix2[1],  pix2[2],  pix3[1],  pix3[2]));
        s += abs(pix1[2]  - avg4(pix2[2],  pix2[3],  pix3[2],  pix3[3]));
        s += abs(pix1[3]  - avg4(pix2[3],  pix2[4],  pix3[3],  pix3[4]));
        s += abs(pix1[4]  - avg4(pix2[4],  pix2[5],  pix3[4],  pix3[5]));
        s += abs(pix1[5]  - avg4(pix2[5],  pix2[6],  pix3[5],  pix3[6]));
        s += abs(pix1[6]  - avg4(pix2[6],  pix2[7],  pix3[6],  pix3[7]));
        s += abs(pix1[7]  - avg4(pix2[7],  pix2[8],  pix3[7],  pix3[8]));
        s += abs(pix1[8]  - avg4(pix2[8],  pix2[9],  pix3[8],  pix3[9]));
        s += abs(pix1[9]  - avg4(pix2[9],  pix2[10], pix3[9],  pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
1631
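/* NSSE ("noise preserving" SSE, FF_CMP_NSSE): score1 is the plain sum of
 * squared errors, while score2 compares the 2x2 second-order differences
 * (local texture) of the two blocks. Penalizing a texture mismatch keeps
 * the encoder from smoothing noisy areas; avctx->nsse_weight balances the
 * two terms (8 is the fallback weight when no context is available). */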
1632 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1633 MpegEncContext *c = v;
1634 int score1=0;
1635 int score2=0;
1636 int x,y;
1637
1638 for(y=0; y<h; y++){
1639 for(x=0; x<16; x++){
1640 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1641 }
1642 if(y+1<h){
1643 for(x=0; x<15; x++){
1644 score2+= FFABS( s1[x ] - s1[x +stride]
1645 - s1[x+1] + s1[x+1+stride])
1646 -FFABS( s2[x ] - s2[x +stride]
1647 - s2[x+1] + s2[x+1+stride]);
1648 }
1649 }
1650 s1+= stride;
1651 s2+= stride;
1652 }
1653
1654 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1655 else return score1 + FFABS(score2)*8;
1656 }
1657
1658 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1659 MpegEncContext *c = v;
1660 int score1=0;
1661 int score2=0;
1662 int x,y;
1663
1664 for(y=0; y<h; y++){
1665 for(x=0; x<8; x++){
1666 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1667 }
1668 if(y+1<h){
1669 for(x=0; x<7; x++){
1670 score2+= FFABS( s1[x ] - s1[x +stride]
1671 - s1[x+1] + s1[x+1+stride])
1672 -FFABS( s2[x ] - s2[x +stride]
1673 - s2[x+1] + s2[x+1+stride]);
1674 }
1675 }
1676 s1+= stride;
1677 s2+= stride;
1678 }
1679
1680 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1681 else return score1 + FFABS(score2)*8;
1682 }
1683
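/* Basis-function helpers for the encoder's quantization refinement:
 * try_8x8basis_c() returns an approximation of the weighted squared error
 * that adding `scale` times a basis vector to the residual would produce,
 * without modifying it; add_8x8basis_c() actually applies the change. The
 * BASIS_SHIFT/RECON_SHIFT constants set the fixed-point scaling. */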
1684 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1685 int i;
1686 unsigned int sum=0;
1687
1688 for(i=0; i<8*8; i++){
1689 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1690 int w= weight[i];
1691 b>>= RECON_SHIFT;
1692 assert(-512<b && b<512);
1693
1694 sum += (w*b)*(w*b)>>4;
1695 }
1696 return sum>>2;
1697 }
1698
1699 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1700 int i;
1701
1702 for(i=0; i<8*8; i++){
1703 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1704 }
1705 }
1706
1707 /**
1708 * Permute an 8x8 block according to the given permutation vector.
1709 * @param block the block to be permuted
1710 * @param permutation the permutation vector
1711 * @param last the index of the last non-zero coefficient in scantable order; used to speed the permutation up
1712 * @param scantable the scantable in use; it is only used to speed the permutation up, the block is not
1713 *                  (inverse) permuted to scantable order!
1714 */
1715 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1716 {
1717 int i;
1718 DCTELEM temp[64];
1719
1720 if(last<=0) return;
1721 //if(permutation[1]==1) return; //FIXME this shortcut is OK but not clean, and might fail for some permutations
1722
1723 for(i=0; i<=last; i++){
1724 const int j= scantable[i];
1725 temp[j]= block[j];
1726 block[j]=0;
1727 }
1728
1729 for(i=0; i<=last; i++){
1730 const int j= scantable[i];
1731 const int perm_j= permutation[j];
1732 block[perm_j]= temp[j];
1733 }
1734 }
1735
1736 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
1737 return 0;
1738 }
1739
1740 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1741 int i;
1742
1743 memset(cmp, 0, sizeof(void*)*6);
1744
1745 for(i=0; i<6; i++){
1746 switch(type&0xFF){
1747 case FF_CMP_SAD:
1748 cmp[i]= c->sad[i];
1749 break;
1750 case FF_CMP_SATD:
1751 cmp[i]= c->hadamard8_diff[i];
1752 break;
1753 case FF_CMP_SSE:
1754 cmp[i]= c->sse[i];
1755 break;
1756 case FF_CMP_DCT:
1757 cmp[i]= c->dct_sad[i];
1758 break;
1759 case FF_CMP_DCT264:
1760 cmp[i]= c->dct264_sad[i];
1761 break;
1762 case FF_CMP_DCTMAX:
1763 cmp[i]= c->dct_max[i];
1764 break;
1765 case FF_CMP_PSNR:
1766 cmp[i]= c->quant_psnr[i];
1767 break;
1768 case FF_CMP_BIT:
1769 cmp[i]= c->bit[i];
1770 break;
1771 case FF_CMP_RD:
1772 cmp[i]= c->rd[i];
1773 break;
1774 case FF_CMP_VSAD:
1775 cmp[i]= c->vsad[i];
1776 break;
1777 case FF_CMP_VSSE:
1778 cmp[i]= c->vsse[i];
1779 break;
1780 case FF_CMP_ZERO:
1781 cmp[i]= zero_cmp;
1782 break;
1783 case FF_CMP_NSSE:
1784 cmp[i]= c->nsse[i];
1785 break;
1786 default:
1787 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1788 }
1789 }
1790 }
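/* Illustrative use (hypothetical buffers): select SATD as the metric and
 * score one 16x16 candidate; cmp[0] is the 16x16 function, cmp[1] the 8x8
 * one. Only the low byte of `type` selects the metric; higher bits such as
 * FF_CMP_CHROMA are handled by the caller.
 *
 *   me_cmp_func cmp[6];
 *   ff_set_cmp(&s->dsp, cmp, FF_CMP_SATD);
 *   score = cmp[0](s, cur_block, ref_block, stride, 16);
 */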
1791
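/* add_bytes()/diff_bytes() process sizeof(long) pixels per iteration with a
 * SWAR trick: the pb_7f/pb_80 masks confine the per-byte sums (respectively
 * differences) to the low 7 bits so no carry or borrow can cross a byte
 * boundary, and the true MSB of each byte is restored with a final XOR. */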
1792 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1793 long i;
1794 for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){ // the cast keeps the subtraction signed so small w cannot wrap
1795 long a = *(long*)(src+i);
1796 long b = *(long*)(dst+i);
1797 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1798 }
1799 for(; i<w; i++)
1800 dst[i+0] += src[i+0];
1801 }
1802
1803 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1804 long i;
1805 #if !HAVE_FAST_UNALIGNED
1806 if((long)src2 & (sizeof(long)-1)){
1807 for(i=0; i+7<w; i+=8){
1808 dst[i+0] = src1[i+0]-src2[i+0];
1809 dst[i+1] = src1[i+1]-src2[i+1];
1810 dst[i+2] = src1[i+2]-src2[i+2];
1811 dst[i+3] = src1[i+3]-src2[i+3];
1812 dst[i+4] = src1[i+4]-src2[i+4];
1813 dst[i+5] = src1[i+5]-src2[i+5];
1814 dst[i+6] = src1[i+6]-src2[i+6];
1815 dst[i+7] = src1[i+7]-src2[i+7];
1816 }
1817 }else
1818 #endif
1819 for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){ // the cast keeps the subtraction signed so small w cannot wrap
1820 long a = *(long*)(src1+i);
1821 long b = *(long*)(src2+i);
1822 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1823 }
1824 for(; i<w; i++)
1825 dst[i+0] = src1[i+0]-src2[i+0];
1826 }
1827
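/* HuffYUV median prediction: each sample is predicted as
 * mid_pred(left, top, left + top - topleft), the classic median/LOCO-I
 * predictor; src1 holds the row above. The add_ variant reconstructs
 * samples from residuals, the sub_ variant produces the residuals. */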
1828 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1829 int i;
1830 uint8_t l, lt;
1831
1832 l= *left;
1833 lt= *left_top;
1834
1835 for(i=0; i<w; i++){
1836 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1837 lt= src1[i];
1838 dst[i]= l;
1839 }
1840
1841 *left= l;
1842 *left_top= lt;
1843 }
1844
1845 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1846 int i;
1847 uint8_t l, lt;
1848
1849 l= *left;
1850 lt= *left_top;
1851
1852 for(i=0; i<w; i++){
1853 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
1854 lt= src1[i];
1855 l= src2[i];
1856 dst[i]= l - pred;
1857 }
1858
1859 *left= l;
1860 *left_top= lt;
1861 }
1862
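/* Left (previous-sample) prediction: dst is the running sum of src,
 * unrolled two samples per iteration. The final accumulator is returned
 * so the caller can continue seamlessly on the next slice. */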
1863 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1864 int i;
1865
1866 for(i=0; i<w-1; i++){
1867 acc+= src[i];
1868 dst[i]= acc;
1869 i++;
1870 acc+= src[i];
1871 dst[i]= acc;
1872 }
1873
1874 for(; i<w; i++){
1875 acc+= src[i];
1876 dst[i]= acc;
1877 }
1878
1879 return acc;
1880 }
1881
1882 #if HAVE_BIGENDIAN
1883 #define B 3
1884 #define G 2
1885 #define R 1
1886 #define A 0
1887 #else
1888 #define B 0
1889 #define G 1
1890 #define R 2
1891 #define A 3
1892 #endif
1893 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
1894 int i;
1895 int r,g,b,a;
1896 r= *red;
1897 g= *green;
1898 b= *blue;
1899 a= *alpha;
1900
1901 for(i=0; i<w; i++){
1902 b+= src[4*i+B];
1903 g+= src[4*i+G];
1904 r+= src[4*i+R];
1905 a+= src[4*i+A];
1906
1907 dst[4*i+B]= b;
1908 dst[4*i+G]= g;
1909 dst[4*i+R]= r;
1910 dst[4*i+A]= a;
1911 }
1912
1913 *red= r;
1914 *green= g;
1915 *blue= b;
1916 *alpha= a;
1917 }
1918 #undef B
1919 #undef G
1920 #undef R
1921 #undef A
1922
1923 #define BUTTERFLY2(o1,o2,i1,i2) \
1924 o1= (i1)+(i2);\
1925 o2= (i1)-(i2);
1926
1927 #define BUTTERFLY1(x,y) \
1928 {\
1929 int a,b;\
1930 a= x;\
1931 b= y;\
1932 x= a+b;\
1933 y= a-b;\
1934 }
1935
1936 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
1937
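/* SATD (FF_CMP_SATD): an 8x8 Walsh-Hadamard transform built from the
 * butterfly macros above is applied to the difference block (or to the
 * block itself in the _intra variant) and the absolute values of the
 * transformed coefficients are summed. */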
1938 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
1939 int i;
1940 int temp[64];
1941 int sum=0;
1942
1943 assert(h==8);
1944
1945 for(i=0; i<8; i++){
1946 //FIXME try pointer walks
1947 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
1948 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
1949 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
1950 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
1951
1952 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
1953 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
1954 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
1955 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
1956
1957 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
1958 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
1959 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
1960 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
1961 }
1962
1963 for(i=0; i<8; i++){
1964 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
1965 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
1966 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
1967 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
1968
1969 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
1970 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
1971 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
1972 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
1973
1974 sum +=
1975 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
1976 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
1977 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
1978 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
1979 }
1980 return sum;
1981 }
1982
1983 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
1984 int i;
1985 int temp[64];
1986 int sum=0;
1987
1988 assert(h==8);
1989
1990 for(i=0; i<8; i++){
1991 //FIXME try pointer walks
1992 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
1993 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
1994 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
1995 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
1996
1997 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
1998 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
1999 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2000 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2001
2002 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2003 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2004 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2005 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2006 }
2007
2008 for(i=0; i<8; i++){
2009 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2010 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2011 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2012 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2013
2014 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2015 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2016 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2017 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2018
2019 sum +=
2020 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2021 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2022 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2023 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2024 }
2025
2026 sum -= FFABS(temp[8*0] + temp[8*4]); // subtract the DC term so the score is independent of the block mean
2027
2028 return sum;
2029 }
2030
2031 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2032 MpegEncContext * const s= (MpegEncContext *)c;
2033 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2034
2035 assert(h==8);
2036
2037 s->dsp.diff_pixels(temp, src1, src2, stride);
2038 s->dsp.fdct(temp);
2039 return s->dsp.sum_abs_dctelem(temp);
2040 }
2041
2042 #if CONFIG_GPL
2043 #define DCT8_1D {\
2044 const int s07 = SRC(0) + SRC(7);\
2045 const int s16 = SRC(1) + SRC(6);\
2046 const int s25 = SRC(2) + SRC(5);\
2047 const int s34 = SRC(3) + SRC(4);\
2048 const int a0 = s07 + s34;\
2049 const int a1 = s16 + s25;\
2050 const int a2 = s07 - s34;\
2051 const int a3 = s16 - s25;\
2052 const int d07 = SRC(0) - SRC(7);\
2053 const int d16 = SRC(1) - SRC(6);\
2054 const int d25 = SRC(2) - SRC(5);\
2055 const int d34 = SRC(3) - SRC(4);\
2056 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2057 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2058 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2059 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2060 DST(0, a0 + a1 ) ;\
2061 DST(1, a4 + (a7>>2)) ;\
2062 DST(2, a2 + (a3>>1)) ;\
2063 DST(3, a5 + (a6>>2)) ;\
2064 DST(4, a0 - a1 ) ;\
2065 DST(5, a6 - (a5>>2)) ;\
2066 DST(6, (a2>>1) - a3 ) ;\
2067 DST(7, (a4>>2) - a7 ) ;\
2068 }
2069
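/* DCT8_1D above is a one-dimensional pass of the H.264/AVC high-profile
 * 8x8 integer transform; applying it to the rows and then the columns of
 * the difference block yields the FF_CMP_DCT264 metric below. */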
2070 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2071 MpegEncContext * const s= (MpegEncContext *)c;
2072 DCTELEM dct[8][8];
2073 int i;
2074 int sum=0;
2075
2076 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2077
2078 #define SRC(x) dct[i][x]
2079 #define DST(x,v) dct[i][x]= v
2080 for( i = 0; i < 8; i++ )
2081 DCT8_1D
2082 #undef SRC
2083 #undef DST
2084
2085 #define SRC(x) dct[x][i]
2086 #define DST(x,v) sum += FFABS(v)
2087 for( i = 0; i < 8; i++ )
2088 DCT8_1D
2089 #undef SRC
2090 #undef DST
2091 return sum;
2092 }
2093 #endif
2094
2095 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2096 MpegEncContext * const s= (MpegEncContext *)c;
2097 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2098 int sum=0, i;
2099
2100 assert(h==8);
2101
2102 s->dsp.diff_pixels(temp, src1, src2, stride);
2103 s->dsp.fdct(temp);
2104
2105 for(i=0; i<64; i++)
2106 sum= FFMAX(sum, FFABS(temp[i]));
2107
2108 return sum;
2109 }
2110
2111 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2112 MpegEncContext * const s= (MpegEncContext *)c;
2113 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2114 DCTELEM * const bak = temp+64;
2115 int sum=0, i;
2116
2117 assert(h==8);
2118 s->mb_intra=0;
2119
2120 s->dsp.diff_pixels(temp, src1, src2, stride);
2121
2122 memcpy(bak, temp, 64*sizeof(DCTELEM));
2123
2124 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2125 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2126 ff_simple_idct_8(temp); //FIXME
2127
2128 for(i=0; i<64; i++)
2129 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2130
2131 return sum;
2132 }
2133
2134 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2135 MpegEncContext * const s= (MpegEncContext *)c;
2136 const uint8_t *scantable= s->intra_scantable.permutated;
2137 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2138 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2139 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2140 int i, last, run, bits, level, distortion, start_i;
2141 const int esc_length= s->ac_esc_length;
2142 uint8_t * length;
2143 uint8_t * last_length;
2144
2145 assert(h==8);
2146
2147 copy_block8(lsrc1, src1, 8, stride, 8);
2148 copy_block8(lsrc2, src2, 8, stride, 8);
2149
2150 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2151
2152 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2153
2154 bits=0;
2155
2156 if (s->mb_intra) {
2157 start_i = 1;
2158 length = s->intra_ac_vlc_length;
2159 last_length= s->intra_ac_vlc_last_length;
2160 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2161 } else {
2162 start_i = 0;
2163 length = s->inter_ac_vlc_length;
2164 last_length= s->inter_ac_vlc_last_length;
2165 }
2166
2167 if(last>=start_i){
2168 run=0;
2169 for(i=start_i; i<last; i++){
2170 int j= scantable[i];
2171 level= temp[j];
2172
2173 if(level){
2174 level+=64;
2175 if((level&(~127)) == 0){
2176 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2177 }else
2178 bits+= esc_length;
2179 run=0;
2180 }else
2181 run++;
2182 }
2183 i= scantable[last];
2184
2185 level= temp[i] + 64;
2186
2187 assert(level - 64); // the last coded coefficient must be non-zero
2188
2189 if((level&(~127)) == 0){
2190 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2191 }else
2192 bits+= esc_length;
2193
2194 }
2195
2196 if(last>=0){
2197 if(s->mb_intra)
2198 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2199 else
2200 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2201 }
2202
2203 s->dsp.idct_add(lsrc2, 8, temp);
2204
2205 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2206
2207 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2208 }
2209
2210 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2211 MpegEncContext * const s= (MpegEncContext *)c;
2212 const uint8_t *scantable= s->intra_scantable.permutated;
2213 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2214 int i, last, run, bits, level, start_i;
2215 const int esc_length= s->ac_esc_length;
2216 uint8_t * length;
2217 uint8_t * last_length;
2218
2219 assert(h==8);
2220
2221 s->dsp.diff_pixels(temp, src1, src2, stride);
2222
2223 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2224
2225 bits=0;
2226
2227 if (s->mb_intra) {
2228 start_i = 1;
2229 length = s->intra_ac_vlc_length;
2230 last_length= s->intra_ac_vlc_last_length;
2231 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2232 } else {
2233 start_i = 0;
2234 length = s->inter_ac_vlc_length;
2235 last_length= s->inter_ac_vlc_last_length;
2236 }
2237
2238 if(last>=start_i){
2239 run=0;
2240 for(i=start_i; i<last; i++){
2241 int j= scantable[i];
2242 level= temp[j];
2243
2244 if(level){
2245 level+=64;
2246 if((level&(~127)) == 0){
2247 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2248 }else
2249 bits+= esc_length;
2250 run=0;
2251 }else
2252 run++;
2253 }
2254 i= scantable[last];
2255
2256 level= temp[i] + 64;
2257
2258 assert(level - 64); // the last coded coefficient must be non-zero
2259
2260 if((level&(~127)) == 0){
2261 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2262 }else
2263 bits+= esc_length;
2264 }
2265
2266 return bits;
2267 }
2268
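/* vsad/vsse (FF_CMP_VSAD, FF_CMP_VSSE): vertical activity metrics. The
 * _intra variants sum absolute (respectively squared) differences between
 * vertically adjacent pixels of one block; the inter variants below do the
 * same on the prediction error s1 - s2. */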
2269 #define VSAD_INTRA(size) \
2270 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2271 int score=0; \
2272 int x,y; \
2273 \
2274 for(y=1; y<h; y++){ \
2275 for(x=0; x<size; x+=4){ \
2276 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2277 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
2278 } \
2279 s+= stride; \
2280 } \
2281 \
2282 return score; \
2283 }
2284 VSAD_INTRA(8)
2285 VSAD_INTRA(16)
2286
2287 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2288 int score=0;
2289 int x,y;
2290
2291 for(y=1; y<h; y++){
2292 for(x=0; x<16; x++){
2293 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
2294 }
2295 s1+= stride;
2296 s2+= stride;
2297 }
2298
2299 return score;
2300 }
2301
2302 #define SQ(a) ((a)*(a))
2303 #define VSSE_INTRA(size) \
2304 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2305 int score=0; \
2306 int x,y; \
2307 \
2308 for(y=1; y<h; y++){ \
2309 for(x=0; x<size; x+=4){ \
2310 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2311 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
2312 } \
2313 s+= stride; \
2314 } \
2315 \
2316 return score; \
2317 }
2318 VSSE_INTRA(8)
2319 VSSE_INTRA(16)
2320
2321 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2322 int score=0;
2323 int x,y;
2324
2325 for(y=1; y<h; y++){
2326 for(x=0; x<16; x++){
2327 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
2328 }
2329 s1+= stride;
2330 s2+= stride;
2331 }
2332
2333 return score;
2334 }
2335
2336 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2337 int size){
2338 int score=0;
2339 int i;
2340 for(i=0; i<size; i++)
2341 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
2342 return score;
2343 }
2344
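/* WRAPPER8_16_SQ builds each 16x16 metric from its 8x8 kernel by summing
 * the kernel's result over the four 8x8 quadrants of the block. */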
2345 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2346 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2347 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2348 #if CONFIG_GPL
2349 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2350 #endif
2351 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2352 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2353 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2354 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2355
2356 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
2357 int i;
2358 src1 += len-1;
2359 for(i=0; i<len; i++)
2360 dst[i] = src0[i] * src1[-i];
2361 }
2362
2363 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
2364 int i;
2365 for(i=0; i<len; i++)
2366 dst[i] = src0[i] * src1[i] + src2[i];
2367 }
2368
2369 static void butterflies_float_c(float *restrict v1, float *restrict v2,
2370 int len)
2371 {
2372 int i;
2373 for (i = 0; i < len; i++) {
2374 float t = v1[i] - v2[i];
2375 v1[i] += v2[i];
2376 v2[i] = t;
2377 }
2378 }
2379
2380 float ff_scalarproduct_float_c(const float *v1, const float *v2, int len)
2381 {
2382 float p = 0.0;
2383 int i;
2384
2385 for (i = 0; i < len; i++)
2386 p += v1[i] * v2[i];
2387
2388 return p;
2389 }
2390
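/* Clip one float by comparing IEEE-754 bit patterns as unsigned integers.
 * This only works when min < 0 < max: positive floats order like unsigned
 * ints and negative floats order in reverse, so a single compare against
 * the bits of min (and against the sign-flipped bits of max) suffices. */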
2391 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2392 uint32_t maxi, uint32_t maxisign)
2393 {
2395 if(a > mini) return mini;
2396 else if((a^(1U<<31)) > maxisign) return maxi;
2397 else return a;
2398 }
2399
2400 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2401 int i;
2402 uint32_t mini = *(uint32_t*)min;
2403 uint32_t maxi = *(uint32_t*)max;
2404 uint32_t maxisign = maxi ^ (1U<<31);
2405 uint32_t *dsti = (uint32_t*)dst;
2406 const uint32_t *srci = (const uint32_t*)src;
2407 for(i=0; i<len; i+=8) {
2408 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2409 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2410 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2411 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2412 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2413 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2414 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2415 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
2416 }
2417 }
2418 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2419 int i;
2420 if(min < 0 && max > 0) {
2421 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2422 } else {
2423 for(i=0; i < len; i+=8) {
2424 dst[i ] = av_clipf(src[i ], min, max);
2425 dst[i + 1] = av_clipf(src[i + 1], min, max);
2426 dst[i + 2] = av_clipf(src[i + 2], min, max);
2427 dst[i + 3] = av_clipf(src[i + 3], min, max);
2428 dst[i + 4] = av_clipf(src[i + 4], min, max);
2429 dst[i + 5] = av_clipf(src[i + 5], min, max);
2430 dst[i + 6] = av_clipf(src[i + 6], min, max);
2431 dst[i + 7] = av_clipf(src[i + 7], min, max);
2432 }
2433 }
2434 }
2435
2436 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
2437 {
2438 int res = 0;
2439
2440 while (order--)
2441 res += *v1++ * *v2++;
2442
2443 return res;
2444 }
2445
2446 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2447 {
2448 int res = 0;
2449 while (order--) {
2450 res += *v1 * *v2++;
2451 *v1++ += mul * *v3++;
2452 }
2453 return res;
2454 }
2455
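/* Apply the first half of a symmetric Q15 window to both ends of the
 * input, with rounding: output[i] = (input[i]*w + (1<<14)) >> 15. len must
 * be even; only the first len/2 window coefficients are read. */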
2456 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2457 const int16_t *window, unsigned int len)
2458 {
2459 int i;
2460 int len2 = len >> 1;
2461
2462 for (i = 0; i < len2; i++) {
2463 int16_t w = window[i];
2464 output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
2465 output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
2466 }
2467 }
2468
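/* Clip a vector of int32s to [min, max]. The loop is unrolled by eight,
 * so len must be a positive multiple of 8. */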
2469 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2470 int32_t max, unsigned int len)
2471 {
2472 do {
2473 *dst++ = av_clip(*src++, min, max);
2474 *dst++ = av_clip(*src++, min, max);
2475 *dst++ = av_clip(*src++, min, max);
2476 *dst++ = av_clip(*src++, min, max);
2477 *dst++ = av_clip(*src++, min, max);
2478 *dst++ = av_clip(*src++, min, max);
2479 *dst++ = av_clip(*src++, min, max);
2480 *dst++ = av_clip(*src++, min, max);
2481 len -= 8;
2482 } while (len > 0);
2483 }
2484
2485 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2486 {
2487 ff_j_rev_dct (block);
2488 put_pixels_clamped_c(block, dest, line_size);
2489 }
2490 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2491 {
2492 ff_j_rev_dct (block);
2493 add_pixels_clamped_c(block, dest, line_size);
2494 }
2495
2496 /* Initialize the static lookup tables: the clipping table, the square table and the inverse zigzag table used by the MMX quantizer. */
2497 av_cold void ff_dsputil_static_init(void)
2498 {
2499 int i;
2500
2501 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2502 for(i=0;i<MAX_NEG_CROP;i++) {
2503 ff_cropTbl[i] = 0;
2504 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2505 }
2506
2507 for(i=0;i<512;i++) {
2508 ff_squareTbl[i] = (i - 256) * (i - 256);
2509 }
2510
2511 for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2512 }
2513
2514 int ff_check_alignment(void){
2515 static int did_fail=0;
2516 LOCAL_ALIGNED_16(int, aligned, [4]);
2517
2518 if((intptr_t)aligned & 15){
2519 if(!did_fail){
2520 #if HAVE_MMX || HAVE_ALTIVEC
2521 av_log(NULL, AV_LOG_ERROR,
2522 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2523 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2524 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2525 "Do not report crashes to Libav developers.\n");
2526 #endif
2527 did_fail=1;
2528 }
2529 return -1;
2530 }
2531 return 0;
2532 }
2533
2534 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2535 {
2536 ff_check_alignment();
2537
2538 #if CONFIG_ENCODERS
2539 if (avctx->bits_per_raw_sample == 10) {
2540 c->fdct = ff_jpeg_fdct_islow_10;
2541 c->fdct248 = ff_fdct248_islow_10;
2542 } else {
2543 if(avctx->dct_algo==FF_DCT_FASTINT) {
2544 c->fdct = ff_fdct_ifast;
2545 c->fdct248 = ff_fdct_ifast248;
2546 }
2547 else if(avctx->dct_algo==FF_DCT_FAAN) {
2548 c->fdct = ff_faandct;
2549 c->fdct248 = ff_faandct248;
2550 }
2551 else {
2552 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2553 c->fdct248 = ff_fdct248_islow_8;
2554 }
2555 }
2556 #endif //CONFIG_ENCODERS
2557
2558 if (avctx->bits_per_raw_sample == 10) {
2559 c->idct_put = ff_simple_idct_put_10;
2560 c->idct_add = ff_simple_idct_add_10;
2561 c->idct = ff_simple_idct_10;
2562 c->idct_permutation_type = FF_NO_IDCT_PERM;
2563 } else {
2564 if(avctx->idct_algo==FF_IDCT_INT){
2565 c->idct_put= ff_jref_idct_put;
2566 c->idct_add= ff_jref_idct_add;
2567 c->idct = ff_j_rev_dct;
2568 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2569 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2570 c->idct_put= ff_faanidct_put;
2571 c->idct_add= ff_faanidct_add;
2572 c->idct = ff_faanidct;
2573 c->idct_permutation_type= FF_NO_IDCT_PERM;
2574 }else{ //accurate/default
2575 c->idct_put = ff_simple_idct_put_8;
2576 c->idct_add = ff_simple_idct_add_8;
2577 c->idct = ff_simple_idct_8;
2578 c->idct_permutation_type= FF_NO_IDCT_PERM;
2579 }
2580 }
2581
2582 c->diff_pixels = diff_pixels_c;
2583 c->put_pixels_clamped = put_pixels_clamped_c;
2584 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2585 c->add_pixels_clamped = add_pixels_clamped_c;
2586 c->sum_abs_dctelem = sum_abs_dctelem_c;
2587 c->gmc1 = gmc1_c;
2588 c->gmc = ff_gmc_c;
2589 c->pix_sum = pix_sum_c;
2590 c->pix_norm1 = pix_norm1_c;
2591
2592 c->fill_block_tab[0] = fill_block16_c;
2593 c->fill_block_tab[1] = fill_block8_c;
2594
2595 /* pix_abs[0]: 16x16 blocks, pix_abs[1]: 8x8 blocks */
2596 c->pix_abs[0][0] = pix_abs16_c;
2597 c->pix_abs[0][1] = pix_abs16_x2_c;
2598 c->pix_abs[0][2] = pix_abs16_y2_c;
2599 c->pix_abs[0][3] = pix_abs16_xy2_c;
2600 c->pix_abs[1][0] = pix_abs8_c;
2601 c->pix_abs[1][1] = pix_abs8_x2_c;
2602 c->pix_abs[1][2] = pix_abs8_y2_c;
2603 c->pix_abs[1][3] = pix_abs8_xy2_c;
2604
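/* Third-pel ("tpel") motion compensation, used by the SVQ3 decoder; the
 * table index encodes the sub-pel position as x + 4*y with x,y in 0..2. */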
2605 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2606 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2607 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2608 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2609 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2610 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2611 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2612 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2613 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2614
2615 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2616 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2617 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2618 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2619 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2620 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2621 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2622 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2623 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2624
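/* Quarter-pel ("qpel") tables: the 16 entries encode the sub-pel position
 * as x + 4*y with x,y in 0..3, matching the _mcXY suffix of each function. */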
2625 #define dspfunc(PFX, IDX, NUM) \
2626 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2627 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2628 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2629 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2630 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2631 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2632 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2633 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2634 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2635 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2636 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2637 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2638 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2639 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2640 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2641 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2642
2643 dspfunc(put_qpel, 0, 16);
2644 dspfunc(put_no_rnd_qpel, 0, 16);
2645
2646 dspfunc(avg_qpel, 0, 16);
2647 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2648
2649 dspfunc(put_qpel, 1, 8);
2650 dspfunc(put_no_rnd_qpel, 1, 8);
2651
2652 dspfunc(avg_qpel, 1, 8);
2653 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2654
2655 #undef dspfunc
2656
2657 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2658 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2659 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2660 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2661 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2662 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2663 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2664 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2665
2666 #define SET_CMP_FUNC(name) \
2667 c->name[0]= name ## 16_c;\
2668 c->name[1]= name ## 8x8_c;
2669
2670 SET_CMP_FUNC(hadamard8_diff)
2671 c->hadamard8_diff[4]= hadamard8_intra16_c;
2672 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2673 SET_CMP_FUNC(dct_sad)
2674 SET_CMP_FUNC(dct_max)
2675 #if CONFIG_GPL
2676 SET_CMP_FUNC(dct264_sad)
2677 #endif
2678 c->sad[0]= pix_abs16_c;
2679 c->sad[1]= pix_abs8_c;
2680 c->sse[0]= sse16_c;
2681 c->sse[1]= sse8_c;
2682 c->sse[2]= sse4_c;
2683 SET_CMP_FUNC(quant_psnr)
2684 SET_CMP_FUNC(rd)
2685 SET_CMP_FUNC(bit)
2686 c->vsad[0]= vsad16_c;
2687 c->vsad[4]= vsad_intra16_c;
2688 c->vsad[5]= vsad_intra8_c;
2689 c->vsse[0]= vsse16_c;
2690 c->vsse[4]= vsse_intra16_c;
2691 c->vsse[5]= vsse_intra8_c;
2692 c->nsse[0]= nsse16_c;
2693 c->nsse[1]= nsse8_c;
2694
2695 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2696
2697 c->add_bytes= add_bytes_c;
2698 c->diff_bytes= diff_bytes_c;
2699 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2700 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2701 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2702 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2703 c->bswap_buf= bswap_buf;
2704 c->bswap16_buf = bswap16_buf;
2705
2706 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2707 c->h263_h_loop_filter= h263_h_loop_filter_c;
2708 c->h263_v_loop_filter= h263_v_loop_filter_c;
2709 }
2710
2711 c->h261_loop_filter= h261_loop_filter_c;
2712
2713 c->try_8x8basis= try_8x8basis_c;
2714 c->add_8x8basis= add_8x8basis_c;
2715
2716 c->vector_fmul_reverse = vector_fmul_reverse_c;
2717 c->vector_fmul_add = vector_fmul_add_c;
2718 c->vector_clipf = vector_clipf_c;
2719 c->scalarproduct_int16 = scalarproduct_int16_c;
2720 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2721 c->apply_window_int16 = apply_window_int16_c;
2722 c->vector_clip_int32 = vector_clip_int32_c;
2723 c->scalarproduct_float = ff_scalarproduct_float_c;
2724 c->butterflies_float = butterflies_float_c;
2725
2726 c->shrink[0]= av_image_copy_plane;
2727 c->shrink[1]= ff_shrink22;
2728 c->shrink[2]= ff_shrink44;
2729 c->shrink[3]= ff_shrink88;
2730
2731 #undef FUNC
2732 #undef FUNCC
2733 #define FUNC(f, depth) f ## _ ## depth
2734 #define FUNCC(f, depth) f ## _ ## depth ## _c
2735
2736 #define dspfunc1(PFX, IDX, NUM, depth)\
2737 c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM , depth);\
2738 c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
2739 c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
2740 c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
2741
2742 #define dspfunc2(PFX, IDX, NUM, depth)\
2743 c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
2744 c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
2745 c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
2746 c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
2747 c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
2748 c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
2749 c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
2750 c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
2751 c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
2752 c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
2753 c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
2754 c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
2755 c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
2756 c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
2757 c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
2758 c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
2759
2760
2761 #define BIT_DEPTH_FUNCS(depth, dct)\
2762 c->get_pixels = FUNCC(get_pixels ## dct , depth);\
2763 c->draw_edges = FUNCC(draw_edges , depth);\
2764 c->clear_block = FUNCC(clear_block ## dct , depth);\
2765 c->clear_blocks = FUNCC(clear_blocks ## dct , depth);\
2766 c->add_pixels8 = FUNCC(add_pixels8 ## dct , depth);\
2767 c->add_pixels4 = FUNCC(add_pixels4 ## dct , depth);\
2768 c->put_no_rnd_pixels_l2 = FUNCC(put_no_rnd_pixels8_l2 , depth);\
2769 \
2770 c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
2771 c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
2772 c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
2773 c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
2774 c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
2775 c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
2776 \
2777 dspfunc1(put , 0, 16, depth);\
2778 dspfunc1(put , 1, 8, depth);\
2779 dspfunc1(put , 2, 4, depth);\
2780 dspfunc1(put , 3, 2, depth);\
2781 dspfunc1(put_no_rnd, 0, 16, depth);\
2782 dspfunc1(put_no_rnd, 1, 8, depth);\
2783 dspfunc1(avg , 0, 16, depth);\
2784 dspfunc1(avg , 1, 8, depth);\
2785 dspfunc1(avg , 2, 4, depth);\
2786 dspfunc1(avg , 3, 2, depth);\
2787 dspfunc1(avg_no_rnd, 0, 16, depth);\
2788 dspfunc1(avg_no_rnd, 1, 8, depth);\
2789 \
2790 dspfunc2(put_h264_qpel, 0, 16, depth);\
2791 dspfunc2(put_h264_qpel, 1, 8, depth);\
2792 dspfunc2(put_h264_qpel, 2, 4, depth);\
2793 dspfunc2(put_h264_qpel, 3, 2, depth);\
2794 dspfunc2(avg_h264_qpel, 0, 16, depth);\
2795 dspfunc2(avg_h264_qpel, 1, 8, depth);\
2796 dspfunc2(avg_h264_qpel, 2, 4, depth);
2797
2798 switch (avctx->bits_per_raw_sample) {
2799 case 9:
2800 if (c->dct_bits == 32) {
2801 BIT_DEPTH_FUNCS(9, _32);
2802 } else {
2803 BIT_DEPTH_FUNCS(9, _16);
2804 }
2805 break;
2806 case 10:
2807 if (c->dct_bits == 32) {
2808 BIT_DEPTH_FUNCS(10, _32);
2809 } else {
2810 BIT_DEPTH_FUNCS(10, _16);
2811 }
2812 break;
2813 default:
2814 BIT_DEPTH_FUNCS(8, _16);
2815 break;
2816 }
2817
2818
2819 if (HAVE_MMX) ff_dsputil_init_mmx (c, avctx);
2820 if (ARCH_ARM) ff_dsputil_init_arm (c, avctx);
2821 if (HAVE_VIS) ff_dsputil_init_vis (c, avctx);
2822 if (ARCH_ALPHA) ff_dsputil_init_alpha (c, avctx);
2823 if (ARCH_PPC) ff_dsputil_init_ppc (c, avctx);
2824 if (ARCH_SH4) ff_dsputil_init_sh4 (c, avctx);
2825 if (ARCH_BFIN) ff_dsputil_init_bfin (c, avctx);
2826
2827 ff_init_scantable_permutation(c->idct_permutation,
2828 c->idct_permutation_type);
2829 }
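/* Typical use (sketch, hypothetical buffers): a codec fills in a DSPContext
 * once at init time and then calls through the function pointers, which by
 * this point may refer to C or architecture-specific implementations:
 *
 *   DSPContext dsp;
 *   ff_dsputil_init(&dsp, avctx);
 *   dsp.idct_put(dest, line_size, block);
 *   sad = dsp.pix_abs[0][0](NULL, cur, ref, line_size, 16);
 */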