f24985ce4c2e1d20aa1132ef4d4897c8a9edbf48
[libav.git] / libavcodec / dsputil.c
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7 *
8 * This file is part of Libav.
9 *
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 /**
26 * @file
27 * DSP utils
28 */
29
30 #include "libavutil/imgutils.h"
31 #include "avcodec.h"
32 #include "dsputil.h"
33 #include "simple_idct.h"
34 #include "faandct.h"
35 #include "faanidct.h"
36 #include "mathops.h"
37 #include "mpegvideo.h"
38 #include "config.h"
39 #include "ac3dec.h"
40 #include "vorbis.h"
41 #include "png.h"
42
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44 uint32_t ff_squareTbl[512] = {0, };
45
46 #define BIT_DEPTH 9
47 #include "dsputil_template.c"
48 #undef BIT_DEPTH
49
50 #define BIT_DEPTH 10
51 #include "dsputil_template.c"
52 #undef BIT_DEPTH
53
54 #define BIT_DEPTH 8
55 #include "dsputil_template.c"
56
57 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
58 #define pb_7f (~0UL/255 * 0x7f)
59 #define pb_80 (~0UL/255 * 0x80)
60
61 const uint8_t ff_zigzag_direct[64] = {
62 0, 1, 8, 16, 9, 2, 3, 10,
63 17, 24, 32, 25, 18, 11, 4, 5,
64 12, 19, 26, 33, 40, 48, 41, 34,
65 27, 20, 13, 6, 7, 14, 21, 28,
66 35, 42, 49, 56, 57, 50, 43, 36,
67 29, 22, 15, 23, 30, 37, 44, 51,
68 58, 59, 52, 45, 38, 31, 39, 46,
69 53, 60, 61, 54, 47, 55, 62, 63
70 };
71
72 /* Specific zigzag scan for 248 idct. NOTE that unlike the
73 specification, we interleave the fields */
74 const uint8_t ff_zigzag248_direct[64] = {
75 0, 8, 1, 9, 16, 24, 2, 10,
76 17, 25, 32, 40, 48, 56, 33, 41,
77 18, 26, 3, 11, 4, 12, 19, 27,
78 34, 42, 49, 57, 50, 58, 35, 43,
79 20, 28, 5, 13, 6, 14, 21, 29,
80 36, 44, 51, 59, 52, 60, 37, 45,
81 22, 30, 7, 15, 23, 31, 38, 46,
82 53, 61, 54, 62, 39, 47, 55, 63,
83 };
84
85 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
86 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
87
88 const uint8_t ff_alternate_horizontal_scan[64] = {
89 0, 1, 2, 3, 8, 9, 16, 17,
90 10, 11, 4, 5, 6, 7, 15, 14,
91 13, 12, 19, 18, 24, 25, 32, 33,
92 26, 27, 20, 21, 22, 23, 28, 29,
93 30, 31, 34, 35, 40, 41, 48, 49,
94 42, 43, 36, 37, 38, 39, 44, 45,
95 46, 47, 50, 51, 56, 57, 58, 59,
96 52, 53, 54, 55, 60, 61, 62, 63,
97 };
98
99 const uint8_t ff_alternate_vertical_scan[64] = {
100 0, 8, 16, 24, 1, 9, 2, 10,
101 17, 25, 32, 40, 48, 56, 57, 49,
102 41, 33, 26, 18, 3, 11, 4, 12,
103 19, 27, 34, 42, 50, 58, 35, 43,
104 51, 59, 20, 28, 5, 13, 6, 14,
105 21, 29, 36, 44, 52, 60, 37, 45,
106 53, 61, 22, 30, 7, 15, 23, 31,
107 38, 46, 54, 62, 39, 47, 55, 63,
108 };
109
110 /* Input permutation for the simple_idct_mmx */
111 static const uint8_t simple_mmx_permutation[64]={
112 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
113 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
114 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
115 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
116 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
117 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
118 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
119 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
120 };
121
122 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
123
124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
125 int i;
126 int end;
127
128 st->scantable= src_scantable;
129
130 for(i=0; i<64; i++){
131 int j;
132 j = src_scantable[i];
133 st->permutated[i] = permutation[j];
134 #if ARCH_PPC
135 st->inverse[j] = i;
136 #endif
137 }
138
139 end=-1;
140 for(i=0; i<64; i++){
141 int j;
142 j = st->permutated[i];
143 if(j>end) end=j;
144 st->raster_end[i]= end;
145 }
146 }
147
148 static int pix_sum_c(uint8_t * pix, int line_size)
149 {
150 int s, i, j;
151
152 s = 0;
153 for (i = 0; i < 16; i++) {
154 for (j = 0; j < 16; j += 8) {
155 s += pix[0];
156 s += pix[1];
157 s += pix[2];
158 s += pix[3];
159 s += pix[4];
160 s += pix[5];
161 s += pix[6];
162 s += pix[7];
163 pix += 8;
164 }
165 pix += line_size - 16;
166 }
167 return s;
168 }
169
170 static int pix_norm1_c(uint8_t * pix, int line_size)
171 {
172 int s, i, j;
173 uint32_t *sq = ff_squareTbl + 256;
174
175 s = 0;
176 for (i = 0; i < 16; i++) {
177 for (j = 0; j < 16; j += 8) {
178 #if 0
179 s += sq[pix[0]];
180 s += sq[pix[1]];
181 s += sq[pix[2]];
182 s += sq[pix[3]];
183 s += sq[pix[4]];
184 s += sq[pix[5]];
185 s += sq[pix[6]];
186 s += sq[pix[7]];
187 #else
188 #if LONG_MAX > 2147483647
189 register uint64_t x=*(uint64_t*)pix;
190 s += sq[x&0xff];
191 s += sq[(x>>8)&0xff];
192 s += sq[(x>>16)&0xff];
193 s += sq[(x>>24)&0xff];
194 s += sq[(x>>32)&0xff];
195 s += sq[(x>>40)&0xff];
196 s += sq[(x>>48)&0xff];
197 s += sq[(x>>56)&0xff];
198 #else
199 register uint32_t x=*(uint32_t*)pix;
200 s += sq[x&0xff];
201 s += sq[(x>>8)&0xff];
202 s += sq[(x>>16)&0xff];
203 s += sq[(x>>24)&0xff];
204 x=*(uint32_t*)(pix+4);
205 s += sq[x&0xff];
206 s += sq[(x>>8)&0xff];
207 s += sq[(x>>16)&0xff];
208 s += sq[(x>>24)&0xff];
209 #endif
210 #endif
211 pix += 8;
212 }
213 pix += line_size - 16;
214 }
215 return s;
216 }
217
218 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
219 int i;
220
221 for(i=0; i+8<=w; i+=8){
222 dst[i+0]= av_bswap32(src[i+0]);
223 dst[i+1]= av_bswap32(src[i+1]);
224 dst[i+2]= av_bswap32(src[i+2]);
225 dst[i+3]= av_bswap32(src[i+3]);
226 dst[i+4]= av_bswap32(src[i+4]);
227 dst[i+5]= av_bswap32(src[i+5]);
228 dst[i+6]= av_bswap32(src[i+6]);
229 dst[i+7]= av_bswap32(src[i+7]);
230 }
231 for(;i<w; i++){
232 dst[i+0]= av_bswap32(src[i+0]);
233 }
234 }
235
236 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
237 {
238 while (len--)
239 *dst++ = av_bswap16(*src++);
240 }
241
242 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
243 {
244 int s, i;
245 uint32_t *sq = ff_squareTbl + 256;
246
247 s = 0;
248 for (i = 0; i < h; i++) {
249 s += sq[pix1[0] - pix2[0]];
250 s += sq[pix1[1] - pix2[1]];
251 s += sq[pix1[2] - pix2[2]];
252 s += sq[pix1[3] - pix2[3]];
253 pix1 += line_size;
254 pix2 += line_size;
255 }
256 return s;
257 }
258
259 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
260 {
261 int s, i;
262 uint32_t *sq = ff_squareTbl + 256;
263
264 s = 0;
265 for (i = 0; i < h; i++) {
266 s += sq[pix1[0] - pix2[0]];
267 s += sq[pix1[1] - pix2[1]];
268 s += sq[pix1[2] - pix2[2]];
269 s += sq[pix1[3] - pix2[3]];
270 s += sq[pix1[4] - pix2[4]];
271 s += sq[pix1[5] - pix2[5]];
272 s += sq[pix1[6] - pix2[6]];
273 s += sq[pix1[7] - pix2[7]];
274 pix1 += line_size;
275 pix2 += line_size;
276 }
277 return s;
278 }
279
280 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
281 {
282 int s, i;
283 uint32_t *sq = ff_squareTbl + 256;
284
285 s = 0;
286 for (i = 0; i < h; i++) {
287 s += sq[pix1[ 0] - pix2[ 0]];
288 s += sq[pix1[ 1] - pix2[ 1]];
289 s += sq[pix1[ 2] - pix2[ 2]];
290 s += sq[pix1[ 3] - pix2[ 3]];
291 s += sq[pix1[ 4] - pix2[ 4]];
292 s += sq[pix1[ 5] - pix2[ 5]];
293 s += sq[pix1[ 6] - pix2[ 6]];
294 s += sq[pix1[ 7] - pix2[ 7]];
295 s += sq[pix1[ 8] - pix2[ 8]];
296 s += sq[pix1[ 9] - pix2[ 9]];
297 s += sq[pix1[10] - pix2[10]];
298 s += sq[pix1[11] - pix2[11]];
299 s += sq[pix1[12] - pix2[12]];
300 s += sq[pix1[13] - pix2[13]];
301 s += sq[pix1[14] - pix2[14]];
302 s += sq[pix1[15] - pix2[15]];
303
304 pix1 += line_size;
305 pix2 += line_size;
306 }
307 return s;
308 }
309
310 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
311 {
312 int i;
313
314 /* read the pixels */
315 for(i=0;i<8;i++) {
316 block[0] = pixels[0];
317 block[1] = pixels[1];
318 block[2] = pixels[2];
319 block[3] = pixels[3];
320 block[4] = pixels[4];
321 block[5] = pixels[5];
322 block[6] = pixels[6];
323 block[7] = pixels[7];
324 pixels += line_size;
325 block += 8;
326 }
327 }
328
329 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
330 const uint8_t *s2, int stride){
331 int i;
332
333 /* read the pixels */
334 for(i=0;i<8;i++) {
335 block[0] = s1[0] - s2[0];
336 block[1] = s1[1] - s2[1];
337 block[2] = s1[2] - s2[2];
338 block[3] = s1[3] - s2[3];
339 block[4] = s1[4] - s2[4];
340 block[5] = s1[5] - s2[5];
341 block[6] = s1[6] - s2[6];
342 block[7] = s1[7] - s2[7];
343 s1 += stride;
344 s2 += stride;
345 block += 8;
346 }
347 }
348
349
350 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
351 int line_size)
352 {
353 int i;
354 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
355
356 /* read the pixels */
357 for(i=0;i<8;i++) {
358 pixels[0] = cm[block[0]];
359 pixels[1] = cm[block[1]];
360 pixels[2] = cm[block[2]];
361 pixels[3] = cm[block[3]];
362 pixels[4] = cm[block[4]];
363 pixels[5] = cm[block[5]];
364 pixels[6] = cm[block[6]];
365 pixels[7] = cm[block[7]];
366
367 pixels += line_size;
368 block += 8;
369 }
370 }
371
372 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
373 int line_size)
374 {
375 int i;
376 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
377
378 /* read the pixels */
379 for(i=0;i<4;i++) {
380 pixels[0] = cm[block[0]];
381 pixels[1] = cm[block[1]];
382 pixels[2] = cm[block[2]];
383 pixels[3] = cm[block[3]];
384
385 pixels += line_size;
386 block += 8;
387 }
388 }
389
390 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
391 int line_size)
392 {
393 int i;
394 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
395
396 /* read the pixels */
397 for(i=0;i<2;i++) {
398 pixels[0] = cm[block[0]];
399 pixels[1] = cm[block[1]];
400
401 pixels += line_size;
402 block += 8;
403 }
404 }
405
406 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
407 uint8_t *restrict pixels,
408 int line_size)
409 {
410 int i, j;
411
412 for (i = 0; i < 8; i++) {
413 for (j = 0; j < 8; j++) {
414 if (*block < -128)
415 *pixels = 0;
416 else if (*block > 127)
417 *pixels = 255;
418 else
419 *pixels = (uint8_t)(*block + 128);
420 block++;
421 pixels++;
422 }
423 pixels += (line_size - 8);
424 }
425 }
426
427 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
428 int line_size)
429 {
430 int i;
431
432 /* read the pixels */
433 for(i=0;i<8;i++) {
434 pixels[0] = block[0];
435 pixels[1] = block[1];
436 pixels[2] = block[2];
437 pixels[3] = block[3];
438 pixels[4] = block[4];
439 pixels[5] = block[5];
440 pixels[6] = block[6];
441 pixels[7] = block[7];
442
443 pixels += line_size;
444 block += 8;
445 }
446 }
447
448 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
449 int line_size)
450 {
451 int i;
452 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
453
454 /* read the pixels */
455 for(i=0;i<8;i++) {
456 pixels[0] = cm[pixels[0] + block[0]];
457 pixels[1] = cm[pixels[1] + block[1]];
458 pixels[2] = cm[pixels[2] + block[2]];
459 pixels[3] = cm[pixels[3] + block[3]];
460 pixels[4] = cm[pixels[4] + block[4]];
461 pixels[5] = cm[pixels[5] + block[5]];
462 pixels[6] = cm[pixels[6] + block[6]];
463 pixels[7] = cm[pixels[7] + block[7]];
464 pixels += line_size;
465 block += 8;
466 }
467 }
468
469 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
470 int line_size)
471 {
472 int i;
473 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
474
475 /* read the pixels */
476 for(i=0;i<4;i++) {
477 pixels[0] = cm[pixels[0] + block[0]];
478 pixels[1] = cm[pixels[1] + block[1]];
479 pixels[2] = cm[pixels[2] + block[2]];
480 pixels[3] = cm[pixels[3] + block[3]];
481 pixels += line_size;
482 block += 8;
483 }
484 }
485
486 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
487 int line_size)
488 {
489 int i;
490 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
491
492 /* read the pixels */
493 for(i=0;i<2;i++) {
494 pixels[0] = cm[pixels[0] + block[0]];
495 pixels[1] = cm[pixels[1] + block[1]];
496 pixels += line_size;
497 block += 8;
498 }
499 }
500
501 static int sum_abs_dctelem_c(DCTELEM *block)
502 {
503 int sum=0, i;
504 for(i=0; i<64; i++)
505 sum+= FFABS(block[i]);
506 return sum;
507 }
508
509 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
510 {
511 int i;
512
513 for (i = 0; i < h; i++) {
514 memset(block, value, 16);
515 block += line_size;
516 }
517 }
518
519 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
520 {
521 int i;
522
523 for (i = 0; i < h; i++) {
524 memset(block, value, 8);
525 block += line_size;
526 }
527 }
528
529 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
530 {
531 int i, j;
532 uint16_t *dst1 = (uint16_t *) dst;
533 uint16_t *dst2 = (uint16_t *)(dst + linesize);
534
535 for (j = 0; j < 8; j++) {
536 for (i = 0; i < 8; i++) {
537 dst1[i] = dst2[i] = src[i] * 0x0101;
538 }
539 src += 8;
540 dst1 += linesize;
541 dst2 += linesize;
542 }
543 }
544
545 #define avg2(a,b) ((a+b+1)>>1)
546 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
547
548 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
549 {
550 const int A=(16-x16)*(16-y16);
551 const int B=( x16)*(16-y16);
552 const int C=(16-x16)*( y16);
553 const int D=( x16)*( y16);
554 int i;
555
556 for(i=0; i<h; i++)
557 {
558 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
559 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
560 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
561 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
562 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
563 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
564 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
565 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
566 dst+= stride;
567 src+= stride;
568 }
569 }
570
571 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
572 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
573 {
574 int y, vx, vy;
575 const int s= 1<<shift;
576
577 width--;
578 height--;
579
580 for(y=0; y<h; y++){
581 int x;
582
583 vx= ox;
584 vy= oy;
585 for(x=0; x<8; x++){ //XXX FIXME optimize
586 int src_x, src_y, frac_x, frac_y, index;
587
588 src_x= vx>>16;
589 src_y= vy>>16;
590 frac_x= src_x&(s-1);
591 frac_y= src_y&(s-1);
592 src_x>>=shift;
593 src_y>>=shift;
594
595 if((unsigned)src_x < width){
596 if((unsigned)src_y < height){
597 index= src_x + src_y*stride;
598 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
599 + src[index +1]* frac_x )*(s-frac_y)
600 + ( src[index+stride ]*(s-frac_x)
601 + src[index+stride+1]* frac_x )* frac_y
602 + r)>>(shift*2);
603 }else{
604 index= src_x + av_clip(src_y, 0, height)*stride;
605 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
606 + src[index +1]* frac_x )*s
607 + r)>>(shift*2);
608 }
609 }else{
610 if((unsigned)src_y < height){
611 index= av_clip(src_x, 0, width) + src_y*stride;
612 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
613 + src[index+stride ]* frac_y )*s
614 + r)>>(shift*2);
615 }else{
616 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
617 dst[y*stride + x]= src[index ];
618 }
619 }
620
621 vx+= dxx;
622 vy+= dyx;
623 }
624 ox += dxy;
625 oy += dyy;
626 }
627 }
628
629 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
630 switch(width){
631 case 2: put_pixels2_8_c (dst, src, stride, height); break;
632 case 4: put_pixels4_8_c (dst, src, stride, height); break;
633 case 8: put_pixels8_8_c (dst, src, stride, height); break;
634 case 16:put_pixels16_8_c(dst, src, stride, height); break;
635 }
636 }
637
638 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
639 int i,j;
640 for (i=0; i < height; i++) {
641 for (j=0; j < width; j++) {
642 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
643 }
644 src += stride;
645 dst += stride;
646 }
647 }
648
649 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
650 int i,j;
651 for (i=0; i < height; i++) {
652 for (j=0; j < width; j++) {
653 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
654 }
655 src += stride;
656 dst += stride;
657 }
658 }
659
660 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
661 int i,j;
662 for (i=0; i < height; i++) {
663 for (j=0; j < width; j++) {
664 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
665 }
666 src += stride;
667 dst += stride;
668 }
669 }
670
671 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
672 int i,j;
673 for (i=0; i < height; i++) {
674 for (j=0; j < width; j++) {
675 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
676 }
677 src += stride;
678 dst += stride;
679 }
680 }
681
682 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
683 int i,j;
684 for (i=0; i < height; i++) {
685 for (j=0; j < width; j++) {
686 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
687 }
688 src += stride;
689 dst += stride;
690 }
691 }
692
693 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
694 int i,j;
695 for (i=0; i < height; i++) {
696 for (j=0; j < width; j++) {
697 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
698 }
699 src += stride;
700 dst += stride;
701 }
702 }
703
704 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
705 int i,j;
706 for (i=0; i < height; i++) {
707 for (j=0; j < width; j++) {
708 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
709 }
710 src += stride;
711 dst += stride;
712 }
713 }
714
715 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
716 int i,j;
717 for (i=0; i < height; i++) {
718 for (j=0; j < width; j++) {
719 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
720 }
721 src += stride;
722 dst += stride;
723 }
724 }
725
726 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
727 switch(width){
728 case 2: avg_pixels2_8_c (dst, src, stride, height); break;
729 case 4: avg_pixels4_8_c (dst, src, stride, height); break;
730 case 8: avg_pixels8_8_c (dst, src, stride, height); break;
731 case 16:avg_pixels16_8_c(dst, src, stride, height); break;
732 }
733 }
734
735 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
736 int i,j;
737 for (i=0; i < height; i++) {
738 for (j=0; j < width; j++) {
739 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
740 }
741 src += stride;
742 dst += stride;
743 }
744 }
745
746 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
747 int i,j;
748 for (i=0; i < height; i++) {
749 for (j=0; j < width; j++) {
750 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
751 }
752 src += stride;
753 dst += stride;
754 }
755 }
756
757 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
758 int i,j;
759 for (i=0; i < height; i++) {
760 for (j=0; j < width; j++) {
761 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
762 }
763 src += stride;
764 dst += stride;
765 }
766 }
767
768 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
769 int i,j;
770 for (i=0; i < height; i++) {
771 for (j=0; j < width; j++) {
772 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
773 }
774 src += stride;
775 dst += stride;
776 }
777 }
778
779 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
780 int i,j;
781 for (i=0; i < height; i++) {
782 for (j=0; j < width; j++) {
783 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
784 }
785 src += stride;
786 dst += stride;
787 }
788 }
789
790 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
791 int i,j;
792 for (i=0; i < height; i++) {
793 for (j=0; j < width; j++) {
794 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
795 }
796 src += stride;
797 dst += stride;
798 }
799 }
800
801 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
802 int i,j;
803 for (i=0; i < height; i++) {
804 for (j=0; j < width; j++) {
805 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
806 }
807 src += stride;
808 dst += stride;
809 }
810 }
811
812 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
813 int i,j;
814 for (i=0; i < height; i++) {
815 for (j=0; j < width; j++) {
816 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
817 }
818 src += stride;
819 dst += stride;
820 }
821 }
822
823 #define QPEL_MC(r, OPNAME, RND, OP) \
824 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
825 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
826 int i;\
827 for(i=0; i<h; i++)\
828 {\
829 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
830 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
831 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
832 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
833 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
834 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
835 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
836 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
837 dst+=dstStride;\
838 src+=srcStride;\
839 }\
840 }\
841 \
842 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
843 const int w=8;\
844 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
845 int i;\
846 for(i=0; i<w; i++)\
847 {\
848 const int src0= src[0*srcStride];\
849 const int src1= src[1*srcStride];\
850 const int src2= src[2*srcStride];\
851 const int src3= src[3*srcStride];\
852 const int src4= src[4*srcStride];\
853 const int src5= src[5*srcStride];\
854 const int src6= src[6*srcStride];\
855 const int src7= src[7*srcStride];\
856 const int src8= src[8*srcStride];\
857 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
858 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
859 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
860 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
861 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
862 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
863 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
864 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
865 dst++;\
866 src++;\
867 }\
868 }\
869 \
870 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
871 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
872 int i;\
873 \
874 for(i=0; i<h; i++)\
875 {\
876 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
877 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
878 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
879 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
880 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
881 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
882 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
883 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
884 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
885 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
886 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
887 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
888 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
889 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
890 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
891 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
892 dst+=dstStride;\
893 src+=srcStride;\
894 }\
895 }\
896 \
897 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
898 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
899 int i;\
900 const int w=16;\
901 for(i=0; i<w; i++)\
902 {\
903 const int src0= src[0*srcStride];\
904 const int src1= src[1*srcStride];\
905 const int src2= src[2*srcStride];\
906 const int src3= src[3*srcStride];\
907 const int src4= src[4*srcStride];\
908 const int src5= src[5*srcStride];\
909 const int src6= src[6*srcStride];\
910 const int src7= src[7*srcStride];\
911 const int src8= src[8*srcStride];\
912 const int src9= src[9*srcStride];\
913 const int src10= src[10*srcStride];\
914 const int src11= src[11*srcStride];\
915 const int src12= src[12*srcStride];\
916 const int src13= src[13*srcStride];\
917 const int src14= src[14*srcStride];\
918 const int src15= src[15*srcStride];\
919 const int src16= src[16*srcStride];\
920 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
921 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
922 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
923 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
924 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
925 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
926 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
927 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
928 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
929 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
930 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
931 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
932 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
933 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
934 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
935 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
936 dst++;\
937 src++;\
938 }\
939 }\
940 \
941 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
942 uint8_t half[64];\
943 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
944 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
945 }\
946 \
947 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
948 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
949 }\
950 \
951 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
952 uint8_t half[64];\
953 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
954 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
955 }\
956 \
957 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
958 uint8_t full[16*9];\
959 uint8_t half[64];\
960 copy_block9(full, src, 16, stride, 9);\
961 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
962 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
963 }\
964 \
965 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
966 uint8_t full[16*9];\
967 copy_block9(full, src, 16, stride, 9);\
968 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
969 }\
970 \
971 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
972 uint8_t full[16*9];\
973 uint8_t half[64];\
974 copy_block9(full, src, 16, stride, 9);\
975 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
976 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
977 }\
978 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
979 uint8_t full[16*9];\
980 uint8_t halfH[72];\
981 uint8_t halfV[64];\
982 uint8_t halfHV[64];\
983 copy_block9(full, src, 16, stride, 9);\
984 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
985 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
986 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
987 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
988 }\
989 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
990 uint8_t full[16*9];\
991 uint8_t halfH[72];\
992 uint8_t halfHV[64];\
993 copy_block9(full, src, 16, stride, 9);\
994 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
995 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
996 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
997 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
998 }\
999 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1000 uint8_t full[16*9];\
1001 uint8_t halfH[72];\
1002 uint8_t halfV[64];\
1003 uint8_t halfHV[64];\
1004 copy_block9(full, src, 16, stride, 9);\
1005 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1006 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1007 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1008 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1009 }\
1010 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1011 uint8_t full[16*9];\
1012 uint8_t halfH[72];\
1013 uint8_t halfHV[64];\
1014 copy_block9(full, src, 16, stride, 9);\
1015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1019 }\
1020 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1021 uint8_t full[16*9];\
1022 uint8_t halfH[72];\
1023 uint8_t halfV[64];\
1024 uint8_t halfHV[64];\
1025 copy_block9(full, src, 16, stride, 9);\
1026 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1027 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1028 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1029 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1030 }\
1031 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1032 uint8_t full[16*9];\
1033 uint8_t halfH[72];\
1034 uint8_t halfHV[64];\
1035 copy_block9(full, src, 16, stride, 9);\
1036 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1037 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1038 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1039 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1040 }\
1041 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1042 uint8_t full[16*9];\
1043 uint8_t halfH[72];\
1044 uint8_t halfV[64];\
1045 uint8_t halfHV[64];\
1046 copy_block9(full, src, 16, stride, 9);\
1047 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1048 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1049 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1050 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1051 }\
1052 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1053 uint8_t full[16*9];\
1054 uint8_t halfH[72];\
1055 uint8_t halfHV[64];\
1056 copy_block9(full, src, 16, stride, 9);\
1057 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1058 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1059 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1060 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1061 }\
1062 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1063 uint8_t halfH[72];\
1064 uint8_t halfHV[64];\
1065 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1066 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1067 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1068 }\
1069 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1070 uint8_t halfH[72];\
1071 uint8_t halfHV[64];\
1072 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1073 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1074 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1075 }\
1076 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1077 uint8_t full[16*9];\
1078 uint8_t halfH[72];\
1079 uint8_t halfV[64];\
1080 uint8_t halfHV[64];\
1081 copy_block9(full, src, 16, stride, 9);\
1082 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1083 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1084 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1085 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1086 }\
1087 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1088 uint8_t full[16*9];\
1089 uint8_t halfH[72];\
1090 copy_block9(full, src, 16, stride, 9);\
1091 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1092 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1093 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1094 }\
1095 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1096 uint8_t full[16*9];\
1097 uint8_t halfH[72];\
1098 uint8_t halfV[64];\
1099 uint8_t halfHV[64];\
1100 copy_block9(full, src, 16, stride, 9);\
1101 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1102 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1103 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1104 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1105 }\
1106 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1107 uint8_t full[16*9];\
1108 uint8_t halfH[72];\
1109 copy_block9(full, src, 16, stride, 9);\
1110 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1111 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1112 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1113 }\
1114 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1115 uint8_t halfH[72];\
1116 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1117 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1118 }\
1119 \
1120 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1121 uint8_t half[256];\
1122 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1123 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1124 }\
1125 \
1126 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1127 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1128 }\
1129 \
1130 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1131 uint8_t half[256];\
1132 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1133 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1134 }\
1135 \
1136 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1137 uint8_t full[24*17];\
1138 uint8_t half[256];\
1139 copy_block17(full, src, 24, stride, 17);\
1140 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1141 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1142 }\
1143 \
1144 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1145 uint8_t full[24*17];\
1146 copy_block17(full, src, 24, stride, 17);\
1147 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1148 }\
1149 \
1150 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1151 uint8_t full[24*17];\
1152 uint8_t half[256];\
1153 copy_block17(full, src, 24, stride, 17);\
1154 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1155 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1156 }\
1157 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1158 uint8_t full[24*17];\
1159 uint8_t halfH[272];\
1160 uint8_t halfV[256];\
1161 uint8_t halfHV[256];\
1162 copy_block17(full, src, 24, stride, 17);\
1163 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1164 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1165 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1166 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1167 }\
1168 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1169 uint8_t full[24*17];\
1170 uint8_t halfH[272];\
1171 uint8_t halfHV[256];\
1172 copy_block17(full, src, 24, stride, 17);\
1173 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1174 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1175 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1176 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1177 }\
1178 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1179 uint8_t full[24*17];\
1180 uint8_t halfH[272];\
1181 uint8_t halfV[256];\
1182 uint8_t halfHV[256];\
1183 copy_block17(full, src, 24, stride, 17);\
1184 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1185 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1186 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1187 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1188 }\
1189 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1190 uint8_t full[24*17];\
1191 uint8_t halfH[272];\
1192 uint8_t halfHV[256];\
1193 copy_block17(full, src, 24, stride, 17);\
1194 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1195 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1196 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1197 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1198 }\
1199 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1200 uint8_t full[24*17];\
1201 uint8_t halfH[272];\
1202 uint8_t halfV[256];\
1203 uint8_t halfHV[256];\
1204 copy_block17(full, src, 24, stride, 17);\
1205 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1206 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1207 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1208 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1209 }\
1210 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1211 uint8_t full[24*17];\
1212 uint8_t halfH[272];\
1213 uint8_t halfHV[256];\
1214 copy_block17(full, src, 24, stride, 17);\
1215 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1217 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1219 }\
1220 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1221 uint8_t full[24*17];\
1222 uint8_t halfH[272];\
1223 uint8_t halfV[256];\
1224 uint8_t halfHV[256];\
1225 copy_block17(full, src, 24, stride, 17);\
1226 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1227 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1228 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1229 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1230 }\
1231 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1232 uint8_t full[24*17];\
1233 uint8_t halfH[272];\
1234 uint8_t halfHV[256];\
1235 copy_block17(full, src, 24, stride, 17);\
1236 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1237 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1238 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1239 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1240 }\
1241 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1242 uint8_t halfH[272];\
1243 uint8_t halfHV[256];\
1244 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1245 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1246 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1247 }\
1248 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1249 uint8_t halfH[272];\
1250 uint8_t halfHV[256];\
1251 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1252 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1253 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1254 }\
1255 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1256 uint8_t full[24*17];\
1257 uint8_t halfH[272];\
1258 uint8_t halfV[256];\
1259 uint8_t halfHV[256];\
1260 copy_block17(full, src, 24, stride, 17);\
1261 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1262 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1263 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1264 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1265 }\
1266 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1267 uint8_t full[24*17];\
1268 uint8_t halfH[272];\
1269 copy_block17(full, src, 24, stride, 17);\
1270 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1271 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1272 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1273 }\
1274 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1275 uint8_t full[24*17];\
1276 uint8_t halfH[272];\
1277 uint8_t halfV[256];\
1278 uint8_t halfHV[256];\
1279 copy_block17(full, src, 24, stride, 17);\
1280 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1281 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1282 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1283 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1284 }\
1285 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1286 uint8_t full[24*17];\
1287 uint8_t halfH[272];\
1288 copy_block17(full, src, 24, stride, 17);\
1289 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1290 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1291 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1292 }\
1293 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1294 uint8_t halfH[272];\
1295 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1296 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1297 }
1298
1299 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1300 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1301 #define op_put(a, b) a = cm[((b) + 16)>>5]
1302 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1303
1304 QPEL_MC(0, put_ , _ , op_put)
1305 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1306 QPEL_MC(0, avg_ , _ , op_avg)
1307 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1308 #undef op_avg
1309 #undef op_avg_no_rnd
1310 #undef op_put
1311 #undef op_put_no_rnd
1312
1313 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1314 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1315 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1316 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1317 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1318 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
1319
1320 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1321 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1322 int i;
1323
1324 for(i=0; i<h; i++){
1325 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1326 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1327 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1328 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1329 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1330 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1331 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1332 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1333 dst+=dstStride;
1334 src+=srcStride;
1335 }
1336 }
1337
1338 #if CONFIG_RV40_DECODER
1339 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1340 put_pixels16_xy2_8_c(dst, src, stride, 16);
1341 }
1342 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1343 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1344 }
1345 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1346 put_pixels8_xy2_8_c(dst, src, stride, 8);
1347 }
1348 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1349 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1350 }
1351 #endif /* CONFIG_RV40_DECODER */
1352
1353 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1354 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1355 int i;
1356
1357 for(i=0; i<w; i++){
1358 const int src_1= src[ -srcStride];
1359 const int src0 = src[0 ];
1360 const int src1 = src[ srcStride];
1361 const int src2 = src[2*srcStride];
1362 const int src3 = src[3*srcStride];
1363 const int src4 = src[4*srcStride];
1364 const int src5 = src[5*srcStride];
1365 const int src6 = src[6*srcStride];
1366 const int src7 = src[7*srcStride];
1367 const int src8 = src[8*srcStride];
1368 const int src9 = src[9*srcStride];
1369 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1370 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1371 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1372 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1373 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1374 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1375 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1376 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
1377 src++;
1378 dst++;
1379 }
1380 }
1381
1382 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1383 uint8_t half[64];
1384 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1385 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1386 }
1387
1388 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1389 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1390 }
1391
1392 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1393 uint8_t half[64];
1394 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1395 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1396 }
1397
1398 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1399 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1400 }
1401
1402 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1403 uint8_t halfH[88];
1404 uint8_t halfV[64];
1405 uint8_t halfHV[64];
1406 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1407 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1408 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1409 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1410 }
1411 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1412 uint8_t halfH[88];
1413 uint8_t halfV[64];
1414 uint8_t halfHV[64];
1415 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1416 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1417 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1418 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1419 }
1420 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1421 uint8_t halfH[88];
1422 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1423 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
1424 }
1425
1426 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1427 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1428 int x;
1429 const int strength= ff_h263_loop_filter_strength[qscale];
1430
1431 for(x=0; x<8; x++){
1432 int d1, d2, ad1;
1433 int p0= src[x-2*stride];
1434 int p1= src[x-1*stride];
1435 int p2= src[x+0*stride];
1436 int p3= src[x+1*stride];
1437 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1438
1439 if (d<-2*strength) d1= 0;
1440 else if(d<- strength) d1=-2*strength - d;
1441 else if(d< strength) d1= d;
1442 else if(d< 2*strength) d1= 2*strength - d;
1443 else d1= 0;
1444
1445 p1 += d1;
1446 p2 -= d1;
1447 if(p1&256) p1= ~(p1>>31);
1448 if(p2&256) p2= ~(p2>>31);
1449
1450 src[x-1*stride] = p1;
1451 src[x+0*stride] = p2;
1452
1453 ad1= FFABS(d1)>>1;
1454
1455 d2= av_clip((p0-p3)/4, -ad1, ad1);
1456
1457 src[x-2*stride] = p0 - d2;
1458 src[x+ stride] = p3 + d2;
1459 }
1460 }
1461 }
1462
1463 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1464 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1465 int y;
1466 const int strength= ff_h263_loop_filter_strength[qscale];
1467
1468 for(y=0; y<8; y++){
1469 int d1, d2, ad1;
1470 int p0= src[y*stride-2];
1471 int p1= src[y*stride-1];
1472 int p2= src[y*stride+0];
1473 int p3= src[y*stride+1];
1474 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1475
1476 if (d<-2*strength) d1= 0;
1477 else if(d<- strength) d1=-2*strength - d;
1478 else if(d< strength) d1= d;
1479 else if(d< 2*strength) d1= 2*strength - d;
1480 else d1= 0;
1481
1482 p1 += d1;
1483 p2 -= d1;
1484 if(p1&256) p1= ~(p1>>31);
1485 if(p2&256) p2= ~(p2>>31);
1486
1487 src[y*stride-1] = p1;
1488 src[y*stride+0] = p2;
1489
1490 ad1= FFABS(d1)>>1;
1491
1492 d2= av_clip((p0-p3)/4, -ad1, ad1);
1493
1494 src[y*stride-2] = p0 - d2;
1495 src[y*stride+1] = p3 + d2;
1496 }
1497 }
1498 }
1499
1500 static void h261_loop_filter_c(uint8_t *src, int stride){
1501 int x,y,xy,yz;
1502 int temp[64];
1503
1504 for(x=0; x<8; x++){
1505 temp[x ] = 4*src[x ];
1506 temp[x + 7*8] = 4*src[x + 7*stride];
1507 }
1508 for(y=1; y<7; y++){
1509 for(x=0; x<8; x++){
1510 xy = y * stride + x;
1511 yz = y * 8 + x;
1512 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1513 }
1514 }
1515
1516 for(y=0; y<8; y++){
1517 src[ y*stride] = (temp[ y*8] + 2)>>2;
1518 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1519 for(x=1; x<7; x++){
1520 xy = y * stride + x;
1521 yz = y * 8 + x;
1522 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
1523 }
1524 }
1525 }
1526
1527 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1528 {
1529 int s, i;
1530
1531 s = 0;
1532 for(i=0;i<h;i++) {
1533 s += abs(pix1[0] - pix2[0]);
1534 s += abs(pix1[1] - pix2[1]);
1535 s += abs(pix1[2] - pix2[2]);
1536 s += abs(pix1[3] - pix2[3]);
1537 s += abs(pix1[4] - pix2[4]);
1538 s += abs(pix1[5] - pix2[5]);
1539 s += abs(pix1[6] - pix2[6]);
1540 s += abs(pix1[7] - pix2[7]);
1541 s += abs(pix1[8] - pix2[8]);
1542 s += abs(pix1[9] - pix2[9]);
1543 s += abs(pix1[10] - pix2[10]);
1544 s += abs(pix1[11] - pix2[11]);
1545 s += abs(pix1[12] - pix2[12]);
1546 s += abs(pix1[13] - pix2[13]);
1547 s += abs(pix1[14] - pix2[14]);
1548 s += abs(pix1[15] - pix2[15]);
1549 pix1 += line_size;
1550 pix2 += line_size;
1551 }
1552 return s;
1553 }
1554
1555 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1556 {
1557 int s, i;
1558
1559 s = 0;
1560 for(i=0;i<h;i++) {
1561 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1562 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1563 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1564 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1565 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1566 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1567 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1568 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1569 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1570 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1571 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1572 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1573 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1574 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1575 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1576 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1577 pix1 += line_size;
1578 pix2 += line_size;
1579 }
1580 return s;
1581 }
1582
1583 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1584 {
1585 int s, i;
1586 uint8_t *pix3 = pix2 + line_size;
1587
1588 s = 0;
1589 for(i=0;i<h;i++) {
1590 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1591 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1592 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1593 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1594 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1595 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1596 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1597 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1598 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1599 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1600 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1601 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1602 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1603 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1604 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1605 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1606 pix1 += line_size;
1607 pix2 += line_size;
1608 pix3 += line_size;
1609 }
1610 return s;
1611 }
1612
1613 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1614 {
1615 int s, i;
1616 uint8_t *pix3 = pix2 + line_size;
1617
1618 s = 0;
1619 for(i=0;i<h;i++) {
1620 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1621 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1622 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1623 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1624 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1625 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1626 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1627 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1628 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1629 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1630 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1631 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1632 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1633 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1634 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1635 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1636 pix1 += line_size;
1637 pix2 += line_size;
1638 pix3 += line_size;
1639 }
1640 return s;
1641 }
1642
1643 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1644 {
1645 int s, i;
1646
1647 s = 0;
1648 for(i=0;i<h;i++) {
1649 s += abs(pix1[0] - pix2[0]);
1650 s += abs(pix1[1] - pix2[1]);
1651 s += abs(pix1[2] - pix2[2]);
1652 s += abs(pix1[3] - pix2[3]);
1653 s += abs(pix1[4] - pix2[4]);
1654 s += abs(pix1[5] - pix2[5]);
1655 s += abs(pix1[6] - pix2[6]);
1656 s += abs(pix1[7] - pix2[7]);
1657 pix1 += line_size;
1658 pix2 += line_size;
1659 }
1660 return s;
1661 }
1662
1663 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1664 {
1665 int s, i;
1666
1667 s = 0;
1668 for(i=0;i<h;i++) {
1669 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1670 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1671 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1672 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1673 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1674 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1675 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1676 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1677 pix1 += line_size;
1678 pix2 += line_size;
1679 }
1680 return s;
1681 }
1682
1683 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1684 {
1685 int s, i;
1686 uint8_t *pix3 = pix2 + line_size;
1687
1688 s = 0;
1689 for(i=0;i<h;i++) {
1690 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1691 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1692 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1693 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1694 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1695 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1696 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1697 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1698 pix1 += line_size;
1699 pix2 += line_size;
1700 pix3 += line_size;
1701 }
1702 return s;
1703 }
1704
1705 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1706 {
1707 int s, i;
1708 uint8_t *pix3 = pix2 + line_size;
1709
1710 s = 0;
1711 for(i=0;i<h;i++) {
1712 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1713 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1714 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1715 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1716 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1717 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1718 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1719 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1720 pix1 += line_size;
1721 pix2 += line_size;
1722 pix3 += line_size;
1723 }
1724 return s;
1725 }
1726
1727 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1728 MpegEncContext *c = v;
1729 int score1=0;
1730 int score2=0;
1731 int x,y;
1732
1733 for(y=0; y<h; y++){
1734 for(x=0; x<16; x++){
1735 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1736 }
1737 if(y+1<h){
1738 for(x=0; x<15; x++){
1739 score2+= FFABS( s1[x ] - s1[x +stride]
1740 - s1[x+1] + s1[x+1+stride])
1741 -FFABS( s2[x ] - s2[x +stride]
1742 - s2[x+1] + s2[x+1+stride]);
1743 }
1744 }
1745 s1+= stride;
1746 s2+= stride;
1747 }
1748
1749 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1750 else return score1 + FFABS(score2)*8;
1751 }
1752
1753 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1754 MpegEncContext *c = v;
1755 int score1=0;
1756 int score2=0;
1757 int x,y;
1758
1759 for(y=0; y<h; y++){
1760 for(x=0; x<8; x++){
1761 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1762 }
1763 if(y+1<h){
1764 for(x=0; x<7; x++){
1765 score2+= FFABS( s1[x ] - s1[x +stride]
1766 - s1[x+1] + s1[x+1+stride])
1767 -FFABS( s2[x ] - s2[x +stride]
1768 - s2[x+1] + s2[x+1+stride]);
1769 }
1770 }
1771 s1+= stride;
1772 s2+= stride;
1773 }
1774
1775 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1776 else return score1 + FFABS(score2)*8;
1777 }
1778
1779 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1780 int i;
1781 unsigned int sum=0;
1782
1783 for(i=0; i<8*8; i++){
1784 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1785 int w= weight[i];
1786 b>>= RECON_SHIFT;
1787 assert(-512<b && b<512);
1788
1789 sum += (w*b)*(w*b)>>4;
1790 }
1791 return sum>>2;
1792 }
1793
1794 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1795 int i;
1796
1797 for(i=0; i<8*8; i++){
1798 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1799 }
1800 }
1801
1802 /**
1803 * permutes an 8x8 block.
1804 * @param block the block which will be permuted according to the given permutation vector
1805 * @param permutation the permutation vector
1806 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1807 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1808 * (inverse) permutated to scantable order!
1809 */
1810 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1811 {
1812 int i;
1813 DCTELEM temp[64];
1814
1815 if(last<=0) return;
1816 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1817
1818 for(i=0; i<=last; i++){
1819 const int j= scantable[i];
1820 temp[j]= block[j];
1821 block[j]=0;
1822 }
1823
1824 for(i=0; i<=last; i++){
1825 const int j= scantable[i];
1826 const int perm_j= permutation[j];
1827 block[perm_j]= temp[j];
1828 }
1829 }
1830
1831 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
1832 return 0;
1833 }
1834
1835 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1836 int i;
1837
1838 memset(cmp, 0, sizeof(void*)*6);
1839
1840 for(i=0; i<6; i++){
1841 switch(type&0xFF){
1842 case FF_CMP_SAD:
1843 cmp[i]= c->sad[i];
1844 break;
1845 case FF_CMP_SATD:
1846 cmp[i]= c->hadamard8_diff[i];
1847 break;
1848 case FF_CMP_SSE:
1849 cmp[i]= c->sse[i];
1850 break;
1851 case FF_CMP_DCT:
1852 cmp[i]= c->dct_sad[i];
1853 break;
1854 case FF_CMP_DCT264:
1855 cmp[i]= c->dct264_sad[i];
1856 break;
1857 case FF_CMP_DCTMAX:
1858 cmp[i]= c->dct_max[i];
1859 break;
1860 case FF_CMP_PSNR:
1861 cmp[i]= c->quant_psnr[i];
1862 break;
1863 case FF_CMP_BIT:
1864 cmp[i]= c->bit[i];
1865 break;
1866 case FF_CMP_RD:
1867 cmp[i]= c->rd[i];
1868 break;
1869 case FF_CMP_VSAD:
1870 cmp[i]= c->vsad[i];
1871 break;
1872 case FF_CMP_VSSE:
1873 cmp[i]= c->vsse[i];
1874 break;
1875 case FF_CMP_ZERO:
1876 cmp[i]= zero_cmp;
1877 break;
1878 case FF_CMP_NSSE:
1879 cmp[i]= c->nsse[i];
1880 break;
1881 #if CONFIG_DWT
1882 case FF_CMP_W53:
1883 cmp[i]= c->w53[i];
1884 break;
1885 case FF_CMP_W97:
1886 cmp[i]= c->w97[i];
1887 break;
1888 #endif
1889 default:
1890 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1891 }
1892 }
1893 }
1894
1895 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1896 long i;
1897 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1898 long a = *(long*)(src+i);
1899 long b = *(long*)(dst+i);
1900 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1901 }
1902 for(; i<w; i++)
1903 dst[i+0] += src[i+0];
1904 }
1905
1906 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1907 long i;
1908 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1909 long a = *(long*)(src1+i);
1910 long b = *(long*)(src2+i);
1911 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1912 }
1913 for(; i<w; i++)
1914 dst[i] = src1[i]+src2[i];
1915 }
1916
1917 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1918 long i;
1919 #if !HAVE_FAST_UNALIGNED
1920 if((long)src2 & (sizeof(long)-1)){
1921 for(i=0; i+7<w; i+=8){
1922 dst[i+0] = src1[i+0]-src2[i+0];
1923 dst[i+1] = src1[i+1]-src2[i+1];
1924 dst[i+2] = src1[i+2]-src2[i+2];
1925 dst[i+3] = src1[i+3]-src2[i+3];
1926 dst[i+4] = src1[i+4]-src2[i+4];
1927 dst[i+5] = src1[i+5]-src2[i+5];
1928 dst[i+6] = src1[i+6]-src2[i+6];
1929 dst[i+7] = src1[i+7]-src2[i+7];
1930 }
1931 }else
1932 #endif
1933 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1934 long a = *(long*)(src1+i);
1935 long b = *(long*)(src2+i);
1936 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1937 }
1938 for(; i<w; i++)
1939 dst[i+0] = src1[i+0]-src2[i+0];
1940 }
1941
1942 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1943 int i;
1944 uint8_t l, lt;
1945
1946 l= *left;
1947 lt= *left_top;
1948
1949 for(i=0; i<w; i++){
1950 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1951 lt= src1[i];
1952 dst[i]= l;
1953 }
1954
1955 *left= l;
1956 *left_top= lt;
1957 }
1958
1959 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1960 int i;
1961 uint8_t l, lt;
1962
1963 l= *left;
1964 lt= *left_top;
1965
1966 for(i=0; i<w; i++){
1967 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
1968 lt= src1[i];
1969 l= src2[i];
1970 dst[i]= l - pred;
1971 }
1972
1973 *left= l;
1974 *left_top= lt;
1975 }
1976
1977 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1978 int i;
1979
1980 for(i=0; i<w-1; i++){
1981 acc+= src[i];
1982 dst[i]= acc;
1983 i++;
1984 acc+= src[i];
1985 dst[i]= acc;
1986 }
1987
1988 for(; i<w; i++){
1989 acc+= src[i];
1990 dst[i]= acc;
1991 }
1992
1993 return acc;
1994 }
1995
1996 #if HAVE_BIGENDIAN
1997 #define B 3
1998 #define G 2
1999 #define R 1
2000 #define A 0
2001 #else
2002 #define B 0
2003 #define G 1
2004 #define R 2
2005 #define A 3
2006 #endif
2007 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
2008 int i;
2009 int r,g,b,a;
2010 r= *red;
2011 g= *green;
2012 b= *blue;
2013 a= *alpha;
2014
2015 for(i=0; i<w; i++){
2016 b+= src[4*i+B];
2017 g+= src[4*i+G];
2018 r+= src[4*i+R];
2019 a+= src[4*i+A];
2020
2021 dst[4*i+B]= b;
2022 dst[4*i+G]= g;
2023 dst[4*i+R]= r;
2024 dst[4*i+A]= a;
2025 }
2026
2027 *red= r;
2028 *green= g;
2029 *blue= b;
2030 *alpha= a;
2031 }
2032 #undef B
2033 #undef G
2034 #undef R
2035 #undef A
2036
2037 #define BUTTERFLY2(o1,o2,i1,i2) \
2038 o1= (i1)+(i2);\
2039 o2= (i1)-(i2);
2040
2041 #define BUTTERFLY1(x,y) \
2042 {\
2043 int a,b;\
2044 a= x;\
2045 b= y;\
2046 x= a+b;\
2047 y= a-b;\
2048 }
2049
2050 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
2051
2052 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2053 int i;
2054 int temp[64];
2055 int sum=0;
2056
2057 assert(h==8);
2058
2059 for(i=0; i<8; i++){
2060 //FIXME try pointer walks
2061 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2062 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2063 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2064 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2065
2066 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2067 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2068 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2069 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2070
2071 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2072 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2073 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2074 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2075 }
2076
2077 for(i=0; i<8; i++){
2078 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2079 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2080 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2081 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2082
2083 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2084 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2085 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2086 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2087
2088 sum +=
2089 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2090 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2091 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2092 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2093 }
2094 return sum;
2095 }
2096
2097 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2098 int i;
2099 int temp[64];
2100 int sum=0;
2101
2102 assert(h==8);
2103
2104 for(i=0; i<8; i++){
2105 //FIXME try pointer walks
2106 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2107 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2108 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2109 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2110
2111 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2112 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2113 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2114 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2115
2116 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2117 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2118 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2119 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2120 }
2121
2122 for(i=0; i<8; i++){
2123 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2124 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2125 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2126 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2127
2128 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2129 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2130 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2131 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2132
2133 sum +=
2134 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2135 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2136 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2137 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2138 }
2139
2140 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
2141
2142 return sum;
2143 }
2144
2145 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2146 MpegEncContext * const s= (MpegEncContext *)c;
2147 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2148
2149 assert(h==8);
2150
2151 s->dsp.diff_pixels(temp, src1, src2, stride);
2152 s->dsp.fdct(temp);
2153 return s->dsp.sum_abs_dctelem(temp);
2154 }
2155
2156 #if CONFIG_GPL
2157 #define DCT8_1D {\
2158 const int s07 = SRC(0) + SRC(7);\
2159 const int s16 = SRC(1) + SRC(6);\
2160 const int s25 = SRC(2) + SRC(5);\
2161 const int s34 = SRC(3) + SRC(4);\
2162 const int a0 = s07 + s34;\
2163 const int a1 = s16 + s25;\
2164 const int a2 = s07 - s34;\
2165 const int a3 = s16 - s25;\
2166 const int d07 = SRC(0) - SRC(7);\
2167 const int d16 = SRC(1) - SRC(6);\
2168 const int d25 = SRC(2) - SRC(5);\
2169 const int d34 = SRC(3) - SRC(4);\
2170 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2171 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2172 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2173 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2174 DST(0, a0 + a1 ) ;\
2175 DST(1, a4 + (a7>>2)) ;\
2176 DST(2, a2 + (a3>>1)) ;\
2177 DST(3, a5 + (a6>>2)) ;\
2178 DST(4, a0 - a1 ) ;\
2179 DST(5, a6 - (a5>>2)) ;\
2180 DST(6, (a2>>1) - a3 ) ;\
2181 DST(7, (a4>>2) - a7 ) ;\
2182 }
2183
2184 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2185 MpegEncContext * const s= (MpegEncContext *)c;
2186 DCTELEM dct[8][8];
2187 int i;
2188 int sum=0;
2189
2190 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2191
2192 #define SRC(x) dct[i][x]
2193 #define DST(x,v) dct[i][x]= v
2194 for( i = 0; i < 8; i++ )
2195 DCT8_1D
2196 #undef SRC
2197 #undef DST
2198
2199 #define SRC(x) dct[x][i]
2200 #define DST(x,v) sum += FFABS(v)
2201 for( i = 0; i < 8; i++ )
2202 DCT8_1D
2203 #undef SRC
2204 #undef DST
2205 return sum;
2206 }
2207 #endif
2208
2209 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2210 MpegEncContext * const s= (MpegEncContext *)c;
2211 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2212 int sum=0, i;
2213
2214 assert(h==8);
2215
2216 s->dsp.diff_pixels(temp, src1, src2, stride);
2217 s->dsp.fdct(temp);
2218
2219 for(i=0; i<64; i++)
2220 sum= FFMAX(sum, FFABS(temp[i]));
2221
2222 return sum;
2223 }
2224
2225 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2226 MpegEncContext * const s= (MpegEncContext *)c;
2227 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2228 DCTELEM * const bak = temp+64;
2229 int sum=0, i;
2230
2231 assert(h==8);
2232 s->mb_intra=0;
2233
2234 s->dsp.diff_pixels(temp, src1, src2, stride);
2235
2236 memcpy(bak, temp, 64*sizeof(DCTELEM));
2237
2238 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2239 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2240 ff_simple_idct(temp); //FIXME
2241
2242 for(i=0; i<64; i++)
2243 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2244
2245 return sum;
2246 }
2247
2248 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2249 MpegEncContext * const s= (MpegEncContext *)c;
2250 const uint8_t *scantable= s->intra_scantable.permutated;
2251 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2252 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2253 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2254 int i, last, run, bits, level, distortion, start_i;
2255 const int esc_length= s->ac_esc_length;
2256 uint8_t * length;
2257 uint8_t * last_length;
2258
2259 assert(h==8);
2260
2261 copy_block8(lsrc1, src1, 8, stride, 8);
2262 copy_block8(lsrc2, src2, 8, stride, 8);
2263
2264 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2265
2266 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2267
2268 bits=0;
2269
2270 if (s->mb_intra) {
2271 start_i = 1;
2272 length = s->intra_ac_vlc_length;
2273 last_length= s->intra_ac_vlc_last_length;
2274 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2275 } else {
2276 start_i = 0;
2277 length = s->inter_ac_vlc_length;
2278 last_length= s->inter_ac_vlc_last_length;
2279 }
2280
2281 if(last>=start_i){
2282 run=0;
2283 for(i=start_i; i<last; i++){
2284 int j= scantable[i];
2285 level= temp[j];
2286
2287 if(level){
2288 level+=64;
2289 if((level&(~127)) == 0){
2290 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2291 }else
2292 bits+= esc_length;
2293 run=0;
2294 }else
2295 run++;
2296 }
2297 i= scantable[last];
2298
2299 level= temp[i] + 64;
2300
2301 assert(level - 64);
2302
2303 if((level&(~127)) == 0){
2304 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2305 }else
2306 bits+= esc_length;
2307
2308 }
2309
2310 if(last>=0){
2311 if(s->mb_intra)
2312 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2313 else
2314 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2315 }
2316
2317 s->dsp.idct_add(lsrc2, 8, temp);
2318
2319 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2320
2321 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2322 }
2323
2324 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2325 MpegEncContext * const s= (MpegEncContext *)c;
2326 const uint8_t *scantable= s->intra_scantable.permutated;
2327 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2328 int i, last, run, bits, level, start_i;
2329 const int esc_length= s->ac_esc_length;
2330 uint8_t * length;
2331 uint8_t * last_length;
2332
2333 assert(h==8);
2334
2335 s->dsp.diff_pixels(temp, src1, src2, stride);
2336
2337 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2338
2339 bits=0;
2340
2341 if (s->mb_intra) {
2342 start_i = 1;
2343 length = s->intra_ac_vlc_length;
2344 last_length= s->intra_ac_vlc_last_length;
2345 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2346 } else {
2347 start_i = 0;
2348 length = s->inter_ac_vlc_length;
2349 last_length= s->inter_ac_vlc_last_length;
2350 }
2351
2352 if(last>=start_i){
2353 run=0;
2354 for(i=start_i; i<last; i++){
2355 int j= scantable[i];
2356 level= temp[j];
2357
2358 if(level){
2359 level+=64;
2360 if((level&(~127)) == 0){
2361 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2362 }else
2363 bits+= esc_length;
2364 run=0;
2365 }else
2366 run++;
2367 }
2368 i= scantable[last];
2369
2370 level= temp[i] + 64;
2371
2372 assert(level - 64);
2373
2374 if((level&(~127)) == 0){
2375 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2376 }else
2377 bits+= esc_length;
2378 }
2379
2380 return bits;
2381 }
2382
2383 #define VSAD_INTRA(size) \
2384 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2385 int score=0; \
2386 int x,y; \
2387 \
2388 for(y=1; y<h; y++){ \
2389 for(x=0; x<size; x+=4){ \
2390 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2391 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
2392 } \
2393 s+= stride; \
2394 } \
2395 \
2396 return score; \
2397 }
2398 VSAD_INTRA(8)
2399 VSAD_INTRA(16)
2400
2401 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2402 int score=0;
2403 int x,y;
2404
2405 for(y=1; y<h; y++){
2406 for(x=0; x<16; x++){
2407 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
2408 }
2409 s1+= stride;
2410 s2+= stride;
2411 }
2412
2413 return score;
2414 }
2415
2416 #define SQ(a) ((a)*(a))
2417 #define VSSE_INTRA(size) \
2418 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2419 int score=0; \
2420 int x,y; \
2421 \
2422 for(y=1; y<h; y++){ \
2423 for(x=0; x<size; x+=4){ \
2424 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2425 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
2426 } \
2427 s+= stride; \
2428 } \
2429 \
2430 return score; \
2431 }
2432 VSSE_INTRA(8)
2433 VSSE_INTRA(16)
2434
2435 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2436 int score=0;
2437 int x,y;
2438
2439 for(y=1; y<h; y++){
2440 for(x=0; x<16; x++){
2441 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
2442 }
2443 s1+= stride;
2444 s2+= stride;
2445 }
2446
2447 return score;
2448 }
2449
2450 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2451 int size){
2452 int score=0;
2453 int i;
2454 for(i=0; i<size; i++)
2455 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
2456 return score;
2457 }
2458
2459 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2460 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2461 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2462 #if CONFIG_GPL
2463 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2464 #endif
2465 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2466 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2467 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2468 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2469
2470 static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
2471 int i;
2472 for(i=0; i<len; i++)
2473 dst[i] = src0[i] * src1[i];
2474 }
2475
2476 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
2477 int i;
2478 src1 += len-1;
2479 for(i=0; i<len; i++)
2480 dst[i] = src0[i] * src1[-i];
2481 }
2482
2483 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
2484 int i;
2485 for(i=0; i<len; i++)
2486 dst[i] = src0[i] * src1[i] + src2[i];
2487 }
2488
2489 static void vector_fmul_window_c(float *dst, const float *src0,
2490 const float *src1, const float *win, int len)
2491 {
2492 int i,j;
2493 dst += len;
2494 win += len;
2495 src0+= len;
2496 for(i=-len, j=len-1; i<0; i++, j--) {
2497 float s0 = src0[i];
2498 float s1 = src1[j];
2499 float wi = win[i];
2500 float wj = win[j];
2501 dst[i] = s0*wj - s1*wi;
2502 dst[j] = s0*wi + s1*wj;
2503 }
2504 }
2505
2506 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
2507 int len)
2508 {
2509 int i;
2510 for (i = 0; i < len; i++)
2511 dst[i] = src[i] * mul;
2512 }
2513
2514 static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
2515 const float **sv, float mul, int len)
2516 {
2517 int i;
2518 for (i = 0; i < len; i += 2, sv++) {
2519 dst[i ] = src[i ] * sv[0][0] * mul;
2520 dst[i+1] = src[i+1] * sv[0][1] * mul;
2521 }
2522 }
2523
2524 static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
2525 const float **sv, float mul, int len)
2526 {
2527 int i;
2528 for (i = 0; i < len; i += 4, sv++) {
2529 dst[i ] = src[i ] * sv[0][0] * mul;
2530 dst[i+1] = src[i+1] * sv[0][1] * mul;
2531 dst[i+2] = src[i+2] * sv[0][2] * mul;
2532 dst[i+3] = src[i+3] * sv[0][3] * mul;
2533 }
2534 }
2535
2536 static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
2537 int len)
2538 {
2539 int i;
2540 for (i = 0; i < len; i += 2, sv++) {
2541 dst[i ] = sv[0][0] * mul;
2542 dst[i+1] = sv[0][1] * mul;
2543 }
2544 }
2545
2546 static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
2547 int len)
2548 {
2549 int i;
2550 for (i = 0; i < len; i += 4, sv++) {
2551 dst[i ] = sv[0][0] * mul;
2552 dst[i+1] = sv[0][1] * mul;
2553 dst[i+2] = sv[0][2] * mul;
2554 dst[i+3] = sv[0][3] * mul;
2555 }
2556 }
2557
2558 static void butterflies_float_c(float *restrict v1, float *restrict v2,
2559 int len)
2560 {
2561 int i;
2562 for (i = 0; i < len; i++) {
2563 float t = v1[i] - v2[i];
2564 v1[i] += v2[i];
2565 v2[i] = t;
2566 }
2567 }
2568
2569 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
2570 {
2571 float p = 0.0;
2572 int i;
2573
2574 for (i = 0; i < len; i++)
2575 p += v1[i] * v2[i];
2576
2577 return p;
2578 }
2579
2580 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2581 uint32_t maxi, uint32_t maxisign)
2582 {
2583
2584 if(a > mini) return mini;
2585 else if((a^(1U<<31)) > maxisign) return maxi;
2586 else return a;
2587 }
2588
2589 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2590 int i;
2591 uint32_t mini = *(uint32_t*)min;
2592 uint32_t maxi = *(uint32_t*)max;
2593 uint32_t maxisign = maxi ^ (1U<<31);
2594 uint32_t *dsti = (uint32_t*)dst;
2595 const uint32_t *srci = (const uint32_t*)src;
2596 for(i=0; i<len; i+=8) {
2597 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2598 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2599 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2600 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2601 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2602 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2603 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2604 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
2605 }
2606 }
2607 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2608 int i;
2609 if(min < 0 && max > 0) {
2610 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2611 } else {
2612 for(i=0; i < len; i+=8) {
2613 dst[i ] = av_clipf(src[i ], min, max);
2614 dst[i + 1] = av_clipf(src[i + 1], min, max);
2615 dst[i + 2] = av_clipf(src[i + 2], min, max);
2616 dst[i + 3] = av_clipf(src[i + 3], min, max);
2617 dst[i + 4] = av_clipf(src[i + 4], min, max);
2618 dst[i + 5] = av_clipf(src[i + 5], min, max);
2619 dst[i + 6] = av_clipf(src[i + 6], min, max);
2620 dst[i + 7] = av_clipf(src[i + 7], min, max);
2621 }
2622 }
2623 }
2624
2625 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
2626 {
2627 int res = 0;
2628
2629 while (order--)
2630 res += (*v1++ * *v2++) >> shift;
2631
2632 return res;
2633 }
2634
2635 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2636 {
2637 int res = 0;
2638 while (order--) {
2639 res += *v1 * *v2++;
2640 *v1++ += mul * *v3++;
2641 }
2642 return res;
2643 }
2644
2645 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2646 const int16_t *window, unsigned int len)
2647 {
2648 int i;
2649 int len2 = len >> 1;
2650
2651 for (i = 0; i < len2; i++) {
2652 int16_t w = window[i];
2653 output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
2654 output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
2655 }
2656 }
2657
2658 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2659 int32_t max, unsigned int len)
2660 {
2661 do {
2662 *dst++ = av_clip(*src++, min, max);
2663 *dst++ = av_clip(*src++, min, max);
2664 *dst++ = av_clip(*src++, min, max);
2665 *dst++ = av_clip(*src++, min, max);
2666 *dst++ = av_clip(*src++, min, max);
2667 *dst++ = av_clip(*src++, min, max);
2668 *dst++ = av_clip(*src++, min, max);
2669 *dst++ = av_clip(*src++, min, max);
2670 len -= 8;
2671 } while (len > 0);
2672 }
2673
2674 #define W0 2048
2675 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2676 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2677 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2678 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2679 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2680 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2681 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
2682
2683 static void wmv2_idct_row(short * b)
2684 {
2685 int s1,s2;
2686 int a0,a1,a2,a3,a4,a5,a6,a7;
2687 /*step 1*/
2688 a1 = W1*b[1]+W7*b[7];
2689 a7 = W7*b[1]-W1*b[7];
2690 a5 = W5*b[5]+W3*b[3];
2691 a3 = W3*b[5]-W5*b[3];
2692 a2 = W2*b[2]+W6*b[6];
2693 a6 = W6*b[2]-W2*b[6];
2694 a0 = W0*b[0]+W0*b[4];
2695 a4 = W0*b[0]-W0*b[4];
2696 /*step 2*/
2697 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2698 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2699 /*step 3*/
2700 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2701 b[1] = (a4+a6 +s1 + (1<<7))>>8;
2702 b[2] = (a4-a6 +s2 + (1<<7))>>8;
2703 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2704 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2705 b[5] = (a4-a6 -s2 + (1<<7))>>8;
2706 b[6] = (a4+a6 -s1 + (1<<7))>>8;
2707 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
2708 }
2709 static void wmv2_idct_col(short * b)
2710 {
2711 int s1,s2;
2712 int a0,a1,a2,a3,a4,a5,a6,a7;
2713 /*step 1, with extended precision*/
2714 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2715 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2716 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2717 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2718 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2719 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2720 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
2721 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
2722 /*step 2*/
2723 s1 = (181*(a1-a5+a7-a3)+128)>>8;
2724 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2725 /*step 3*/
2726 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2727 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
2728 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
2729 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2730
2731 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2732 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
2733 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
2734 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
2735 }
2736 void ff_wmv2_idct_c(short * block){
2737 int i;
2738
2739 for(i=0;i<64;i+=8){
2740 wmv2_idct_row(block+i);
2741 }
2742 for(i=0;i<8;i++){
2743 wmv2_idct_col(block+i);
2744 }
2745 }
2746 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2747 converted */
2748 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2749 {
2750 ff_wmv2_idct_c(block);
2751 ff_put_pixels_clamped_c(block, dest, line_size);
2752 }
2753 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2754 {
2755 ff_wmv2_idct_c(block);
2756 ff_add_pixels_clamped_c(block, dest, line_size);
2757 }
2758 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2759 {
2760 j_rev_dct (block);
2761 ff_put_pixels_clamped_c(block, dest, line_size);
2762 }
2763 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2764 {
2765 j_rev_dct (block);
2766 ff_add_pixels_clamped_c(block, dest, line_size);
2767 }
2768
2769 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2770 {
2771 j_rev_dct4 (block);
2772 put_pixels_clamped4_c(block, dest, line_size);
2773 }
2774 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2775 {
2776 j_rev_dct4 (block);
2777 add_pixels_clamped4_c(block, dest, line_size);
2778 }
2779
2780 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2781 {
2782 j_rev_dct2 (block);
2783 put_pixels_clamped2_c(block, dest, line_size);
2784 }
2785 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2786 {
2787 j_rev_dct2 (block);
2788 add_pixels_clamped2_c(block, dest, line_size);
2789 }
2790
2791 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2792 {
2793 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2794
2795 dest[0] = cm[(block[0] + 4)>>3];
2796 }
2797 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2798 {
2799 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2800
2801 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2802 }
2803
2804 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2805
2806 /* init static data */
2807 av_cold void dsputil_static_init(void)
2808 {
2809 int i;
2810
2811 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2812 for(i=0;i<MAX_NEG_CROP;i++) {
2813 ff_cropTbl[i] = 0;
2814 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2815 }
2816
2817 for(i=0;i<512;i++) {
2818 ff_squareTbl[i] = (i - 256) * (i - 256);
2819 }
2820
2821 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2822 }
2823
2824 int ff_check_alignment(void){
2825 static int did_fail=0;
2826 LOCAL_ALIGNED_16(int, aligned);
2827
2828 if((intptr_t)&aligned & 15){
2829 if(!did_fail){
2830 #if HAVE_MMX || HAVE_ALTIVEC
2831 av_log(NULL, AV_LOG_ERROR,
2832 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2833 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2834 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2835 "Do not report crashes to Libav developers.\n");
2836 #endif
2837 did_fail=1;
2838 }
2839 return -1;
2840 }
2841 return 0;
2842 }
2843
2844 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2845 {
2846 int i;
2847
2848 ff_check_alignment();
2849
2850 #if CONFIG_ENCODERS
2851 if(avctx->dct_algo==FF_DCT_FASTINT) {
2852 c->fdct = fdct_ifast;
2853 c->fdct248 = fdct_ifast248;
2854 }
2855 else if(avctx->dct_algo==FF_DCT_FAAN) {
2856 c->fdct = ff_faandct;
2857 c->fdct248 = ff_faandct248;
2858 }
2859 else {
2860 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2861 c->fdct248 = ff_fdct248_islow;
2862 }
2863 #endif //CONFIG_ENCODERS
2864
2865 if(avctx->lowres==1){
2866 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
2867 c->idct_put= ff_jref_idct4_put;
2868 c->idct_add= ff_jref_idct4_add;
2869 }else{
2870 if (avctx->codec_id != CODEC_ID_H264) {
2871 c->idct_put= ff_h264_lowres_idct_put_8_c;
2872 c->idct_add= ff_h264_lowres_idct_add_8_c;
2873 } else {
2874 switch (avctx->bits_per_raw_sample) {
2875 case 9:
2876 c->idct_put= ff_h264_lowres_idct_put_9_c;
2877 c->idct_add= ff_h264_lowres_idct_add_9_c;
2878 break;
2879 case 10:
2880 c->idct_put= ff_h264_lowres_idct_put_10_c;
2881 c->idct_add= ff_h264_lowres_idct_add_10_c;
2882 break;
2883 default:
2884 c->idct_put= ff_h264_lowres_idct_put_8_c;
2885 c->idct_add= ff_h264_lowres_idct_add_8_c;
2886 }
2887 }
2888 }
2889 c->idct = j_rev_dct4;
2890 c->idct_permutation_type= FF_NO_IDCT_PERM;
2891 }else if(avctx->lowres==2){
2892 c->idct_put= ff_jref_idct2_put;
2893 c->idct_add= ff_jref_idct2_add;
2894 c->idct = j_rev_dct2;
2895 c->idct_permutation_type= FF_NO_IDCT_PERM;
2896 }else if(avctx->lowres==3){
2897 c->idct_put= ff_jref_idct1_put;
2898 c->idct_add= ff_jref_idct1_add;
2899 c->idct = j_rev_dct1;
2900 c->idct_permutation_type= FF_NO_IDCT_PERM;
2901 }else{
2902 if(avctx->idct_algo==FF_IDCT_INT){
2903 c->idct_put= ff_jref_idct_put;
2904 c->idct_add= ff_jref_idct_add;
2905 c->idct = j_rev_dct;
2906 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2907 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2908 avctx->idct_algo==FF_IDCT_VP3){
2909 c->idct_put= ff_vp3_idct_put_c;
2910 c->idct_add= ff_vp3_idct_add_c;
2911 c->idct = ff_vp3_idct_c;
2912 c->idct_permutation_type= FF_NO_IDCT_PERM;
2913 }else if(avctx->idct_algo==FF_IDCT_WMV2){
2914 c->idct_put= ff_wmv2_idct_put_c;
2915 c->idct_add= ff_wmv2_idct_add_c;
2916 c->idct = ff_wmv2_idct_c;
2917 c->idct_permutation_type= FF_NO_IDCT_PERM;
2918 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2919 c->idct_put= ff_faanidct_put;
2920 c->idct_add= ff_faanidct_add;
2921 c->idct = ff_faanidct;
2922 c->idct_permutation_type= FF_NO_IDCT_PERM;
2923 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2924 c->idct_put= ff_ea_idct_put_c;
2925 c->idct_permutation_type= FF_NO_IDCT_PERM;
2926 }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
2927 c->idct = ff_bink_idct_c;
2928 c->idct_add = ff_bink_idct_add_c;
2929 c->idct_put = ff_bink_idct_put_c;
2930 c->idct_permutation_type = FF_NO_IDCT_PERM;
2931 }else{ //accurate/default
2932 c->idct_put= ff_simple_idct_put;
2933 c->idct_add= ff_simple_idct_add;
2934 c->idct = ff_simple_idct;
2935 c->idct_permutation_type= FF_NO_IDCT_PERM;
2936 }
2937 }
2938
2939 c->get_pixels = get_pixels_c;
2940 c->diff_pixels = diff_pixels_c;
2941 c->put_pixels_clamped = ff_put_pixels_clamped_c;
2942 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2943 c->put_pixels_nonclamped = put_pixels_nonclamped_c;
2944 c->add_pixels_clamped = ff_add_pixels_clamped_c;
2945 c->sum_abs_dctelem = sum_abs_dctelem_c;
2946 c->gmc1 = gmc1_c;
2947 c->gmc = ff_gmc_c;
2948 c->pix_sum = pix_sum_c;
2949 c->pix_norm1 = pix_norm1_c;
2950
2951 c->fill_block_tab[0] = fill_block16_c;
2952 c->fill_block_tab[1] = fill_block8_c;
2953 c->scale_block = scale_block_c;
2954
2955 /* TODO [0] 16 [1] 8 */
2956 c->pix_abs[0][0] = pix_abs16_c;
2957 c->pix_abs[0][1] = pix_abs16_x2_c;
2958 c->pix_abs[0][2] = pix_abs16_y2_c;
2959 c->pix_abs[0][3] = pix_abs16_xy2_c;
2960 c->pix_abs[1][0] = pix_abs8_c;
2961 c->pix_abs[1][1] = pix_abs8_x2_c;
2962 c->pix_abs[1][2] = pix_abs8_y2_c;
2963 c->pix_abs[1][3] = pix_abs8_xy2_c;
2964
2965 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2966 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2967 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2968 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2969 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2970 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2971 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2972 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2973 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2974
2975 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2976 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2977 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2978 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2979 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2980 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2981 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2982 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2983 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2984
2985 #define dspfunc(PFX, IDX, NUM) \
2986 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2987 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2988 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2989 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2990 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2991 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2992 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2993 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2994 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2995 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2996 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2997 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2998 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2999 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3000 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3001 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3002
3003 dspfunc(put_qpel, 0, 16);
3004 dspfunc(put_no_rnd_qpel, 0, 16);
3005
3006 dspfunc(avg_qpel, 0, 16);
3007 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3008
3009 dspfunc(put_qpel, 1, 8);
3010 dspfunc(put_no_rnd_qpel, 1, 8);
3011
3012 dspfunc(avg_qpel, 1, 8);
3013 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3014
3015 #undef dspfunc
3016
3017 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
3018 ff_mlp_init(c, avctx);
3019 #endif
3020 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
3021 ff_intrax8dsp_init(c,avctx);
3022 #endif
3023 #if CONFIG_RV30_DECODER
3024 ff_rv30dsp_init(c,avctx);
3025 #endif
3026 #if CONFIG_RV40_DECODER
3027 ff_rv40dsp_init(c,avctx);
3028 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
3029 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
3030 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
3031 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
3032 #endif
3033
3034 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
3035 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3036 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3037 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3038 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3039 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3040 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3041 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3042
3043 #define SET_CMP_FUNC(name) \
3044 c->name[0]= name ## 16_c;\
3045 c->name[1]= name ## 8x8_c;
3046
3047 SET_CMP_FUNC(hadamard8_diff)
3048 c->hadamard8_diff[4]= hadamard8_intra16_c;
3049 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
3050 SET_CMP_FUNC(dct_sad)
3051 SET_CMP_FUNC(dct_max)
3052 #if CONFIG_GPL
3053 SET_CMP_FUNC(dct264_sad)
3054 #endif
3055 c->sad[0]= pix_abs16_c;
3056 c->sad[1]= pix_abs8_c;
3057 c->sse[0]= sse16_c;
3058 c->sse[1]= sse8_c;
3059 c->sse[2]= sse4_c;
3060 SET_CMP_FUNC(quant_psnr)
3061 SET_CMP_FUNC(rd)
3062 SET_CMP_FUNC(bit)
3063 c->vsad[0]= vsad16_c;
3064 c->vsad[4]= vsad_intra16_c;
3065 c->vsad[5]= vsad_intra8_c;
3066 c->vsse[0]= vsse16_c;
3067 c->vsse[4]= vsse_intra16_c;
3068 c->vsse[5]= vsse_intra8_c;
3069 c->nsse[0]= nsse16_c;
3070 c->nsse[1]= nsse8_c;
3071 #if CONFIG_DWT
3072 ff_dsputil_init_dwt(c);
3073 #endif
3074
3075 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
3076
3077 c->add_bytes= add_bytes_c;
3078 c->add_bytes_l2= add_bytes_l2_c;
3079 c->diff_bytes= diff_bytes_c;
3080 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
3081 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3082 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
3083 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3084 c->bswap_buf= bswap_buf;
3085 c->bswap16_buf = bswap16_buf;
3086 #if CONFIG_PNG_DECODER
3087 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
3088 #endif
3089
3090 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3091 c->h263_h_loop_filter= h263_h_loop_filter_c;
3092 c->h263_v_loop_filter= h263_v_loop_filter_c;
3093 }
3094
3095 if (CONFIG_VP3_DECODER) {
3096 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3097 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3098 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3099 }
3100
3101 c->h261_loop_filter= h261_loop_filter_c;
3102
3103 c->try_8x8basis= try_8x8basis_c;
3104 c->add_8x8basis= add_8x8basis_c;
3105
3106 #if CONFIG_VORBIS_DECODER
3107 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3108 #endif
3109 #if CONFIG_AC3_DECODER
3110 c->ac3_downmix = ff_ac3_downmix_c;
3111 #endif
3112 c->vector_fmul = vector_fmul_c;
3113 c->vector_fmul_reverse = vector_fmul_reverse_c;
3114 c->vector_fmul_add = vector_fmul_add_c;
3115 c->vector_fmul_window = vector_fmul_window_c;
3116 c->vector_clipf = vector_clipf_c;
3117 c->scalarproduct_int16 = scalarproduct_int16_c;
3118 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3119 c->apply_window_int16 = apply_window_int16_c;
3120 c->vector_clip_int32 = vector_clip_int32_c;
3121 c->scalarproduct_float = scalarproduct_float_c;
3122 c->butterflies_float = butterflies_float_c;
3123 c->vector_fmul_scalar = vector_fmul_scalar_c;
3124
3125 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
3126 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
3127
3128 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
3129 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
3130
3131 c->shrink[0]= av_image_copy_plane;
3132 c->shrink[1]= ff_shrink22;
3133 c->shrink[2]= ff_shrink44;
3134 c->shrink[3]= ff_shrink88;
3135
3136 c->prefetch= just_return;
3137
3138 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3139 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3140
3141 #undef FUNC
3142 #undef FUNCC
3143 #define FUNC(f, depth) f ## _ ## depth
3144 #define FUNCC(f, depth) f ## _ ## depth ## _c
3145
3146 #define dspfunc1(PFX, IDX, NUM, depth)\
3147 c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM , depth);\
3148 c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3149 c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3150 c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3151
3152 #define dspfunc2(PFX, IDX, NUM, depth)\
3153 c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3154 c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3155 c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3156 c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3157 c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3158 c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3159 c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3160 c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3161 c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3162 c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3163 c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3164 c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3165 c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3166 c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3167 c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3168 c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3169
3170
3171 #define BIT_DEPTH_FUNCS(depth)\
3172 c->draw_edges = FUNCC(draw_edges , depth);\
3173 c->emulated_edge_mc = FUNC (ff_emulated_edge_mc , depth);\
3174 c->clear_block = FUNCC(clear_block , depth);\
3175 c->clear_blocks = FUNCC(clear_blocks , depth);\
3176 c->add_pixels8 = FUNCC(add_pixels8 , depth);\
3177 c->add_pixels4 = FUNCC(add_pixels4 , depth);\
3178 c->put_no_rnd_pixels_l2[0] = FUNCC(put_no_rnd_pixels16_l2, depth);\
3179 c->put_no_rnd_pixels_l2[1] = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3180 \
3181 c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
3182 c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
3183 c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
3184 c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
3185 c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
3186 c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
3187 \
3188 dspfunc1(put , 0, 16, depth);\
3189 dspfunc1(put , 1, 8, depth);\
3190 dspfunc1(put , 2, 4, depth);\
3191 dspfunc1(put , 3, 2, depth);\
3192 dspfunc1(put_no_rnd, 0, 16, depth);\
3193 dspfunc1(put_no_rnd, 1, 8, depth);\
3194 dspfunc1(avg , 0, 16, depth);\
3195 dspfunc1(avg , 1, 8, depth);\
3196 dspfunc1(avg , 2, 4, depth);\
3197 dspfunc1(avg , 3, 2, depth);\
3198 dspfunc1(avg_no_rnd, 0, 16, depth);\
3199 dspfunc1(avg_no_rnd, 1, 8, depth);\
3200 \
3201 dspfunc2(put_h264_qpel, 0, 16, depth);\
3202 dspfunc2(put_h264_qpel, 1, 8, depth);\
3203 dspfunc2(put_h264_qpel, 2, 4, depth);\
3204 dspfunc2(put_h264_qpel, 3, 2, depth);\
3205 dspfunc2(avg_h264_qpel, 0, 16, depth);\
3206 dspfunc2(avg_h264_qpel, 1, 8, depth);\
3207 dspfunc2(avg_h264_qpel, 2, 4, depth);
3208
3209 if (avctx->codec_id != CODEC_ID_H264 || avctx->bits_per_raw_sample == 8) {
3210 BIT_DEPTH_FUNCS(8)
3211 } else {
3212 switch (avctx->bits_per_raw_sample) {
3213 case 9:
3214 BIT_DEPTH_FUNCS(9)
3215 break;
3216 case 10:
3217 BIT_DEPTH_FUNCS(10)
3218 break;
3219 default:
3220 av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3221 BIT_DEPTH_FUNCS(8)
3222 break;
3223 }
3224 }
3225
3226
3227 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
3228 if (ARCH_ARM) dsputil_init_arm (c, avctx);
3229 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
3230 if (HAVE_VIS) dsputil_init_vis (c, avctx);
3231 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
3232 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
3233 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
3234 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
3235 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
3236
3237 for(i=0; i<64; i++){
3238 if(!c->put_2tap_qpel_pixels_tab[0][i])
3239 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3240 if(!c->avg_2tap_qpel_pixels_tab[0][i])
3241 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3242 }
3243
3244 c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3245 c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3246 c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3247 c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3248
3249 c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3250 c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3251 c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3252 c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3253
3254 switch(c->idct_permutation_type){
3255 case FF_NO_IDCT_PERM:
3256 for(i=0; i<64; i++)
3257 c->idct_permutation[i]= i;
3258 break;
3259 case FF_LIBMPEG2_IDCT_PERM:
3260 for(i=0; i<64; i++)
3261 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3262 break;
3263 case FF_SIMPLE_IDCT_PERM:
3264 for(i