a7c69c0341d31e934593de790a9964b088e7da30
[libav.git] / libavcodec / dsputil.c
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7 *
8 * This file is part of Libav.
9 *
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25 /**
26 * @file
27 * DSP utils
28 */
29
30 #include "libavutil/imgutils.h"
31 #include "avcodec.h"
32 #include "dsputil.h"
33 #include "simple_idct.h"
34 #include "faandct.h"
35 #include "faanidct.h"
36 #include "mathops.h"
37 #include "mpegvideo.h"
38 #include "config.h"
39 #include "ac3dec.h"
40 #include "vorbis.h"
41
/* Clamping lookup table: indexed as ff_cropTbl + MAX_NEG_CROP so moderately
 * out-of-range values map to 0/255. Zero-initialized here; populated by init
 * code not visible in this excerpt. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square lookup table, accessed as (ff_squareTbl + 256)[x] for x in
 * [-256, 255]. Zero-initialized here; populated by init code not visible in
 * this excerpt. */
uint32_t ff_squareTbl[512] = {0, };
44
45 #define BIT_DEPTH 9
46 #include "dsputil_template.c"
47 #undef BIT_DEPTH
48
49 #define BIT_DEPTH 10
50 #include "dsputil_template.c"
51 #undef BIT_DEPTH
52
53 #define BIT_DEPTH 8
54 #include "dsputil_template.c"
55
/* Byte-replicated constants for word-at-a-time byte arithmetic:
 * 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's
 * native arithmetic size. */
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)

/* Classic zigzag coefficient scan order: entry i is the raster index of the
 * i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];

/* Alternative horizontal-first coefficient scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

/* Alternative vertical-first coefficient scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Within-row element order used by the SSE2 IDCT (see FF_SSE2_IDCT_PERM). */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
122
123 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
124 int i;
125 int end;
126
127 st->scantable= src_scantable;
128
129 for(i=0; i<64; i++){
130 int j;
131 j = src_scantable[i];
132 st->permutated[i] = permutation[j];
133 #if ARCH_PPC
134 st->inverse[j] = i;
135 #endif
136 }
137
138 end=-1;
139 for(i=0; i<64; i++){
140 int j;
141 j = st->permutated[i];
142 if(j>end) end=j;
143 st->raster_end[i]= end;
144 }
145 }
146
147 void ff_init_scantable_permutation(uint8_t *idct_permutation,
148 int idct_permutation_type)
149 {
150 int i;
151
152 switch(idct_permutation_type){
153 case FF_NO_IDCT_PERM:
154 for(i=0; i<64; i++)
155 idct_permutation[i]= i;
156 break;
157 case FF_LIBMPEG2_IDCT_PERM:
158 for(i=0; i<64; i++)
159 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
160 break;
161 case FF_SIMPLE_IDCT_PERM:
162 for(i=0; i<64; i++)
163 idct_permutation[i]= simple_mmx_permutation[i];
164 break;
165 case FF_TRANSPOSE_IDCT_PERM:
166 for(i=0; i<64; i++)
167 idct_permutation[i]= ((i&7)<<3) | (i>>3);
168 break;
169 case FF_PARTTRANS_IDCT_PERM:
170 for(i=0; i<64; i++)
171 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
172 break;
173 case FF_SSE2_IDCT_PERM:
174 for(i=0; i<64; i++)
175 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
176 break;
177 default:
178 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
179 }
180 }
181
/* Sum of all pixel values of a 16x16 block; rows are line_size bytes apart. */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
203
/* Sum of squared pixel values over a 16x16 block, via the ff_squareTbl
 * lookup table (offset by 256; only the non-negative half is used here). */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* Reference byte-wise version, kept for documentation. */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if HAVE_FAST_64BIT
            /* Load 8 pixels as one 64-bit word and square each byte.
             * NOTE(review): the cast assumes cheap unaligned loads and
             * ignores strict aliasing -- long-standing practice in this
             * file, but technically UB; confirm all targets tolerate it. */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* Same idea with two 32-bit loads. */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;  /* step to the next row */
    }
    return s;
}
251
/* Byte-swap w 32-bit words from src into dst (dst may equal src). */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i = 0;

    /* Bulk of the buffer in groups of eight, then the tail one by one. */
    while (i + 8 <= w) {
        int k;
        for (k = 0; k < 8; k++)
            dst[i + k] = av_bswap32(src[i + k]);
        i += 8;
    }
    while (i < w) {
        dst[i] = av_bswap32(src[i]);
        i++;
    }
}
269
/* Byte-swap len 16-bit values from src into dst. */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    int i;

    for (i = 0; i < len; i++)
        dst[i] = av_bswap16(src[i]);
}
275
276 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
277 {
278 int s, i;
279 uint32_t *sq = ff_squareTbl + 256;
280
281 s = 0;
282 for (i = 0; i < h; i++) {
283 s += sq[pix1[0] - pix2[0]];
284 s += sq[pix1[1] - pix2[1]];
285 s += sq[pix1[2] - pix2[2]];
286 s += sq[pix1[3] - pix2[3]];
287 pix1 += line_size;
288 pix2 += line_size;
289 }
290 return s;
291 }
292
293 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
294 {
295 int s, i;
296 uint32_t *sq = ff_squareTbl + 256;
297
298 s = 0;
299 for (i = 0; i < h; i++) {
300 s += sq[pix1[0] - pix2[0]];
301 s += sq[pix1[1] - pix2[1]];
302 s += sq[pix1[2] - pix2[2]];
303 s += sq[pix1[3] - pix2[3]];
304 s += sq[pix1[4] - pix2[4]];
305 s += sq[pix1[5] - pix2[5]];
306 s += sq[pix1[6] - pix2[6]];
307 s += sq[pix1[7] - pix2[7]];
308 pix1 += line_size;
309 pix2 += line_size;
310 }
311 return s;
312 }
313
314 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
315 {
316 int s, i;
317 uint32_t *sq = ff_squareTbl + 256;
318
319 s = 0;
320 for (i = 0; i < h; i++) {
321 s += sq[pix1[ 0] - pix2[ 0]];
322 s += sq[pix1[ 1] - pix2[ 1]];
323 s += sq[pix1[ 2] - pix2[ 2]];
324 s += sq[pix1[ 3] - pix2[ 3]];
325 s += sq[pix1[ 4] - pix2[ 4]];
326 s += sq[pix1[ 5] - pix2[ 5]];
327 s += sq[pix1[ 6] - pix2[ 6]];
328 s += sq[pix1[ 7] - pix2[ 7]];
329 s += sq[pix1[ 8] - pix2[ 8]];
330 s += sq[pix1[ 9] - pix2[ 9]];
331 s += sq[pix1[10] - pix2[10]];
332 s += sq[pix1[11] - pix2[11]];
333 s += sq[pix1[12] - pix2[12]];
334 s += sq[pix1[13] - pix2[13]];
335 s += sq[pix1[14] - pix2[14]];
336 s += sq[pix1[15] - pix2[15]];
337
338 pix1 += line_size;
339 pix2 += line_size;
340 }
341 return s;
342 }
343
344 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
345 const uint8_t *s2, int stride){
346 int i;
347
348 /* read the pixels */
349 for(i=0;i<8;i++) {
350 block[0] = s1[0] - s2[0];
351 block[1] = s1[1] - s2[1];
352 block[2] = s1[2] - s2[2];
353 block[3] = s1[3] - s2[3];
354 block[4] = s1[4] - s2[4];
355 block[5] = s1[5] - s2[5];
356 block[6] = s1[6] - s2[6];
357 block[7] = s1[7] - s2[7];
358 s1 += stride;
359 s2 += stride;
360 block += 8;
361 }
362 }
363
364
365 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
366 int line_size)
367 {
368 int i;
369 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
370
371 /* read the pixels */
372 for(i=0;i<8;i++) {
373 pixels[0] = cm[block[0]];
374 pixels[1] = cm[block[1]];
375 pixels[2] = cm[block[2]];
376 pixels[3] = cm[block[3]];
377 pixels[4] = cm[block[4]];
378 pixels[5] = cm[block[5]];
379 pixels[6] = cm[block[6]];
380 pixels[7] = cm[block[7]];
381
382 pixels += line_size;
383 block += 8;
384 }
385 }
386
387 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
388 int line_size)
389 {
390 int i;
391 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
392
393 /* read the pixels */
394 for(i=0;i<4;i++) {
395 pixels[0] = cm[block[0]];
396 pixels[1] = cm[block[1]];
397 pixels[2] = cm[block[2]];
398 pixels[3] = cm[block[3]];
399
400 pixels += line_size;
401 block += 8;
402 }
403 }
404
405 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
406 int line_size)
407 {
408 int i;
409 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
410
411 /* read the pixels */
412 for(i=0;i<2;i++) {
413 pixels[0] = cm[block[0]];
414 pixels[1] = cm[block[1]];
415
416 pixels += line_size;
417 block += 8;
418 }
419 }
420
421 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
422 uint8_t *restrict pixels,
423 int line_size)
424 {
425 int i, j;
426
427 for (i = 0; i < 8; i++) {
428 for (j = 0; j < 8; j++) {
429 if (*block < -128)
430 *pixels = 0;
431 else if (*block > 127)
432 *pixels = 255;
433 else
434 *pixels = (uint8_t)(*block + 128);
435 block++;
436 pixels++;
437 }
438 pixels += (line_size - 8);
439 }
440 }
441
442 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
443 int line_size)
444 {
445 int i;
446 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
447
448 /* read the pixels */
449 for(i=0;i<8;i++) {
450 pixels[0] = cm[pixels[0] + block[0]];
451 pixels[1] = cm[pixels[1] + block[1]];
452 pixels[2] = cm[pixels[2] + block[2]];
453 pixels[3] = cm[pixels[3] + block[3]];
454 pixels[4] = cm[pixels[4] + block[4]];
455 pixels[5] = cm[pixels[5] + block[5]];
456 pixels[6] = cm[pixels[6] + block[6]];
457 pixels[7] = cm[pixels[7] + block[7]];
458 pixels += line_size;
459 block += 8;
460 }
461 }
462
463 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
464 int line_size)
465 {
466 int i;
467 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
468
469 /* read the pixels */
470 for(i=0;i<4;i++) {
471 pixels[0] = cm[pixels[0] + block[0]];
472 pixels[1] = cm[pixels[1] + block[1]];
473 pixels[2] = cm[pixels[2] + block[2]];
474 pixels[3] = cm[pixels[3] + block[3]];
475 pixels += line_size;
476 block += 8;
477 }
478 }
479
480 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
481 int line_size)
482 {
483 int i;
484 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
485
486 /* read the pixels */
487 for(i=0;i<2;i++) {
488 pixels[0] = cm[pixels[0] + block[0]];
489 pixels[1] = cm[pixels[1] + block[1]];
490 pixels += line_size;
491 block += 8;
492 }
493 }
494
495 static int sum_abs_dctelem_c(DCTELEM *block)
496 {
497 int sum=0, i;
498 for(i=0; i<64; i++)
499 sum+= FFABS(block[i]);
500 return sum;
501 }
502
/* Fill h rows of 16 bytes each with `value`; rows are line_size apart. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 16);
        block += line_size;
    }
}
512
/* Fill h rows of 8 bytes each with `value`; rows are line_size apart. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 8);
        block += line_size;
    }
}
522
/* Rounded averages of 2 and 4 values. Arguments are evaluated more than
 * once and are not parenthesized, so pass only simple, side-effect-free
 * expressions. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
525
/* One-vector GMC: bilinear interpolation of an 8-wide, h-high block with
 * 1/16-pel phases (x16, y16 in 0..15); the four corner weights sum to 256
 * and `rounder` is added before the >>8 normalization. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = x16        * (16 - y16);
    const int C = (16 - x16) * y16;
    const int D = x16        * y16;
    int y, x;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (A * src[x] +
                      B * src[x + 1] +
                      C * src[stride + x] +
                      D * src[stride + x + 1] +
                      rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
548
/**
 * Global motion compensation: fill an 8-wide, h-high block of dst from src
 * under an affine transform. (ox,oy) is the fixed-point source position of
 * the first output pixel; (dxx,dyx) is its per-column increment and
 * (dxy,dyy) its per-row increment. shift sets the sub-pel precision
 * (1<<shift steps per pixel), r is the bilinear rounding constant, and
 * width/height delimit the valid source area (edges are clamped).
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* Turn width/height into the last valid column/row so the unsigned
     * comparisons below implement 0 <= coord < limit in one test. */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            /* Split the accumulated position into an integer pixel
             * coordinate and a sub-pel fraction (s steps per pixel). */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* Fully inside: bilinear interpolation of the 2x2
                     * source neighbourhood. */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* Vertically outside: clamp y, interpolate in x only. */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* Horizontally outside: clamp x, interpolate in y only. */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* Outside in both directions: nearest clamped pixel. */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            /* Advance the source position by one output column. */
            vx+= dxx;
            vy+= dyx;
        }
        /* Advance the row start position. */
        ox += dxy;
        oy += dyy;
    }
}
606
/* Thirdpel MC, zero phase: plain copy, dispatched on block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_8_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_8_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_8_c(dst, src, stride, height);
}
615
/* Thirdpel MC, horizontal 1/3 phase: (2*a + b)/3 with rounding
 * (683/2048 approximates 1/3). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t *d = dst + y * stride;
        int x;
        for (x = 0; x < width; x++)
            d[x] = (683 * (2 * s[x] + s[x + 1] + 1)) >> 11;
    }
}
626
/* Thirdpel MC, horizontal 2/3 phase: (a + 2*b)/3 with rounding. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t *d = dst + y * stride;
        int x;
        for (x = 0; x < width; x++)
            d[x] = (683 * (s[x] + 2 * s[x + 1] + 1)) >> 11;
    }
}
637
/* Thirdpel MC, vertical 1/3 phase: (2*a + c)/3 with rounding, where c is
 * the pixel one row below a. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t *d = dst + y * stride;
        int x;
        for (x = 0; x < width; x++)
            d[x] = (683 * (2 * s[x] + s[x + stride] + 1)) >> 11;
    }
}
648
/* Thirdpel MC, (1/3, 1/3) phase: 2D weights (4,3,3,2)/12 over the 2x2
 * neighbourhood (2731/32768 approximates 1/12). */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t *d = dst + y * stride;
        int x;
        for (x = 0; x < width; x++)
            d[x] = (2731 * (4 * s[x] + 3 * s[x + 1] +
                            3 * s[x + stride] + 2 * s[x + stride + 1] + 6)) >> 15;
    }
}
659
/* Thirdpel MC, (1/3, 2/3) phase: 2D weights (3,2,4,3)/12 over the 2x2
 * neighbourhood. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t *d = dst + y * stride;
        int x;
        for (x = 0; x < width; x++)
            d[x] = (2731 * (3 * s[x] + 2 * s[x + 1] +
                            4 * s[x + stride] + 3 * s[x + stride + 1] + 6)) >> 15;
    }
}
670
/* Thirdpel MC, vertical 2/3 phase: (a + 2*c)/3 with rounding, where c is
 * the pixel one row below a. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t *d = dst + y * stride;
        int x;
        for (x = 0; x < width; x++)
            d[x] = (683 * (s[x] + 2 * s[x + stride] + 1)) >> 11;
    }
}
681
/* Thirdpel MC, (2/3, 1/3) phase: 2D weights (3,4,2,3)/12 over the 2x2
 * neighbourhood. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t *d = dst + y * stride;
        int x;
        for (x = 0; x < width; x++)
            d[x] = (2731 * (3 * s[x] + 4 * s[x + 1] +
                            2 * s[x + stride] + 3 * s[x + stride + 1] + 6)) >> 15;
    }
}
692
/* Thirdpel MC, (2/3, 2/3) phase: 2D weights (2,3,3,4)/12 over the 2x2
 * neighbourhood. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t *d = dst + y * stride;
        int x;
        for (x = 0; x < width; x++)
            d[x] = (2731 * (2 * s[x] + 3 * s[x + 1] +
                            3 * s[x + stride] + 4 * s[x + stride + 1] + 6)) >> 15;
    }
}
703
/* Thirdpel MC, zero phase, averaging with dst: dispatch on block width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_8_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_8_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_8_c(dst, src, stride, height);
}
712
/* Thirdpel MC, horizontal 1/3 phase, rounded-up average with the existing
 * destination pixels. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t *d = dst + y * stride;
        int x;
        for (x = 0; x < width; x++)
            d[x] = (d[x] + ((683 * (2 * s[x] + s[x + 1] + 1)) >> 11) + 1) >> 1;
    }
}
723
/* Thirdpel MC, horizontal 2/3 phase, rounded-up average with the existing
 * destination pixels. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t *d = dst + y * stride;
        int x;
        for (x = 0; x < width; x++)
            d[x] = (d[x] + ((683 * (s[x] + 2 * s[x + 1] + 1)) >> 11) + 1) >> 1;
    }
}
734
/* Thirdpel MC, vertical 1/3 phase, rounded-up average with the existing
 * destination pixels. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t *d = dst + y * stride;
        int x;
        for (x = 0; x < width; x++)
            d[x] = (d[x] + ((683 * (2 * s[x] + s[x + stride] + 1)) >> 11) + 1) >> 1;
    }
}
745
/* Thirdpel MC, (1/3, 1/3) phase with weights (4,3,3,2)/12, rounded-up
 * average with the existing destination pixels. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t *d = dst + y * stride;
        int x;
        for (x = 0; x < width; x++)
            d[x] = (d[x] + ((2731 * (4 * s[x] + 3 * s[x + 1] +
                                     3 * s[x + stride] + 2 * s[x + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
756
/* Thirdpel MC, (1/3, 2/3) phase with weights (3,2,4,3)/12, rounded-up
 * average with the existing destination pixels. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t *d = dst + y * stride;
        int x;
        for (x = 0; x < width; x++)
            d[x] = (d[x] + ((2731 * (3 * s[x] + 2 * s[x + 1] +
                                     4 * s[x + stride] + 3 * s[x + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
767
/* Thirdpel MC, vertical 2/3 phase, rounded-up average with the existing
 * destination pixels. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t *d = dst + y * stride;
        int x;
        for (x = 0; x < width; x++)
            d[x] = (d[x] + ((683 * (s[x] + 2 * s[x + stride] + 1)) >> 11) + 1) >> 1;
    }
}
778
/* Thirdpel MC, (2/3, 1/3) phase with weights (3,4,2,3)/12, rounded-up
 * average with the existing destination pixels. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t *d = dst + y * stride;
        int x;
        for (x = 0; x < width; x++)
            d[x] = (d[x] + ((2731 * (3 * s[x] + 4 * s[x + 1] +
                                     2 * s[x + stride] + 3 * s[x + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
789
/* Thirdpel MC, (2/3, 2/3) phase with weights (2,3,3,4)/12, rounded-up
 * average with the existing destination pixels. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y;
    for (y = 0; y < height; y++) {
        const uint8_t *s = src + y * stride;
        uint8_t *d = dst + y * stride;
        int x;
        for (x = 0; x < width; x++)
            d[x] = (d[x] + ((2731 * (2 * s[x] + 3 * s[x + 1] +
                                     3 * s[x + stride] + 4 * s[x + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
800
801 #define QPEL_MC(r, OPNAME, RND, OP) \
802 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
803 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
804 int i;\
805 for(i=0; i<h; i++)\
806 {\
807 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
808 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
809 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
810 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
811 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
812 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
813 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
814 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
815 dst+=dstStride;\
816 src+=srcStride;\
817 }\
818 }\
819 \
820 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
821 const int w=8;\
822 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
823 int i;\
824 for(i=0; i<w; i++)\
825 {\
826 const int src0= src[0*srcStride];\
827 const int src1= src[1*srcStride];\
828 const int src2= src[2*srcStride];\
829 const int src3= src[3*srcStride];\
830 const int src4= src[4*srcStride];\
831 const int src5= src[5*srcStride];\
832 const int src6= src[6*srcStride];\
833 const int src7= src[7*srcStride];\
834 const int src8= src[8*srcStride];\
835 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
836 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
837 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
838 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
839 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
840 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
841 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
842 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
843 dst++;\
844 src++;\
845 }\
846 }\
847 \
848 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
849 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
850 int i;\
851 \
852 for(i=0; i<h; i++)\
853 {\
854 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
855 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
856 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
857 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
858 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
859 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
860 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
861 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
862 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
863 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
864 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
865 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
866 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
867 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
868 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
869 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
870 dst+=dstStride;\
871 src+=srcStride;\
872 }\
873 }\
874 \
875 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
876 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
877 int i;\
878 const int w=16;\
879 for(i=0; i<w; i++)\
880 {\
881 const int src0= src[0*srcStride];\
882 const int src1= src[1*srcStride];\
883 const int src2= src[2*srcStride];\
884 const int src3= src[3*srcStride];\
885 const int src4= src[4*srcStride];\
886 const int src5= src[5*srcStride];\
887 const int src6= src[6*srcStride];\
888 const int src7= src[7*srcStride];\
889 const int src8= src[8*srcStride];\
890 const int src9= src[9*srcStride];\
891 const int src10= src[10*srcStride];\
892 const int src11= src[11*srcStride];\
893 const int src12= src[12*srcStride];\
894 const int src13= src[13*srcStride];\
895 const int src14= src[14*srcStride];\
896 const int src15= src[15*srcStride];\
897 const int src16= src[16*srcStride];\
898 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
899 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
900 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
901 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
902 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
903 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
904 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
905 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
906 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
907 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
908 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
909 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
910 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
911 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
912 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
913 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
914 dst++;\
915 src++;\
916 }\
917 }\
918 \
919 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
920 uint8_t half[64];\
921 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
922 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
923 }\
924 \
925 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
926 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
927 }\
928 \
929 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
930 uint8_t half[64];\
931 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
932 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
933 }\
934 \
935 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
936 uint8_t full[16*9];\
937 uint8_t half[64];\
938 copy_block9(full, src, 16, stride, 9);\
939 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
940 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
941 }\
942 \
943 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
944 uint8_t full[16*9];\
945 copy_block9(full, src, 16, stride, 9);\
946 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
947 }\
948 \
949 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
950 uint8_t full[16*9];\
951 uint8_t half[64];\
952 copy_block9(full, src, 16, stride, 9);\
953 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
954 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
955 }\
956 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
957 uint8_t full[16*9];\
958 uint8_t halfH[72];\
959 uint8_t halfV[64];\
960 uint8_t halfHV[64];\
961 copy_block9(full, src, 16, stride, 9);\
962 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
963 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
964 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
965 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
966 }\
967 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
968 uint8_t full[16*9];\
969 uint8_t halfH[72];\
970 uint8_t halfHV[64];\
971 copy_block9(full, src, 16, stride, 9);\
972 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
973 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
974 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
975 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
976 }\
977 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
978 uint8_t full[16*9];\
979 uint8_t halfH[72];\
980 uint8_t halfV[64];\
981 uint8_t halfHV[64];\
982 copy_block9(full, src, 16, stride, 9);\
983 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
984 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
985 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
986 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
987 }\
988 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
989 uint8_t full[16*9];\
990 uint8_t halfH[72];\
991 uint8_t halfHV[64];\
992 copy_block9(full, src, 16, stride, 9);\
993 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
994 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
995 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
996 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
997 }\
998 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
999 uint8_t full[16*9];\
1000 uint8_t halfH[72];\
1001 uint8_t halfV[64];\
1002 uint8_t halfHV[64];\
1003 copy_block9(full, src, 16, stride, 9);\
1004 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1005 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1006 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1007 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1008 }\
1009 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1010 uint8_t full[16*9];\
1011 uint8_t halfH[72];\
1012 uint8_t halfHV[64];\
1013 copy_block9(full, src, 16, stride, 9);\
1014 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1015 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1016 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1017 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1018 }\
1019 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1020 uint8_t full[16*9];\
1021 uint8_t halfH[72];\
1022 uint8_t halfV[64];\
1023 uint8_t halfHV[64];\
1024 copy_block9(full, src, 16, stride, 9);\
1025 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1026 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1027 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1028 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1029 }\
1030 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1031 uint8_t full[16*9];\
1032 uint8_t halfH[72];\
1033 uint8_t halfHV[64];\
1034 copy_block9(full, src, 16, stride, 9);\
1035 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1036 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1037 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1038 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1039 }\
1040 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1041 uint8_t halfH[72];\
1042 uint8_t halfHV[64];\
1043 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1044 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1045 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1046 }\
1047 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1048 uint8_t halfH[72];\
1049 uint8_t halfHV[64];\
1050 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1051 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1052 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1053 }\
1054 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1055 uint8_t full[16*9];\
1056 uint8_t halfH[72];\
1057 uint8_t halfV[64];\
1058 uint8_t halfHV[64];\
1059 copy_block9(full, src, 16, stride, 9);\
1060 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1061 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1062 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1063 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1064 }\
1065 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1066 uint8_t full[16*9];\
1067 uint8_t halfH[72];\
1068 copy_block9(full, src, 16, stride, 9);\
1069 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1070 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1071 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1072 }\
1073 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1074 uint8_t full[16*9];\
1075 uint8_t halfH[72];\
1076 uint8_t halfV[64];\
1077 uint8_t halfHV[64];\
1078 copy_block9(full, src, 16, stride, 9);\
1079 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1080 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1081 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1082 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1083 }\
1084 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1085 uint8_t full[16*9];\
1086 uint8_t halfH[72];\
1087 copy_block9(full, src, 16, stride, 9);\
1088 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1089 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1090 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1091 }\
1092 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1093 uint8_t halfH[72];\
1094 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1095 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1096 }\
1097 \
1098 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1099 uint8_t half[256];\
1100 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1101 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1102 }\
1103 \
1104 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1105 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1106 }\
1107 \
1108 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1109 uint8_t half[256];\
1110 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1111 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1112 }\
1113 \
1114 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1115 uint8_t full[24*17];\
1116 uint8_t half[256];\
1117 copy_block17(full, src, 24, stride, 17);\
1118 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1119 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1120 }\
1121 \
1122 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1123 uint8_t full[24*17];\
1124 copy_block17(full, src, 24, stride, 17);\
1125 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1126 }\
1127 \
1128 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1129 uint8_t full[24*17];\
1130 uint8_t half[256];\
1131 copy_block17(full, src, 24, stride, 17);\
1132 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1133 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1134 }\
1135 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1136 uint8_t full[24*17];\
1137 uint8_t halfH[272];\
1138 uint8_t halfV[256];\
1139 uint8_t halfHV[256];\
1140 copy_block17(full, src, 24, stride, 17);\
1141 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1142 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1143 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1144 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1145 }\
1146 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1147 uint8_t full[24*17];\
1148 uint8_t halfH[272];\
1149 uint8_t halfHV[256];\
1150 copy_block17(full, src, 24, stride, 17);\
1151 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1152 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1153 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1154 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1155 }\
1156 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1157 uint8_t full[24*17];\
1158 uint8_t halfH[272];\
1159 uint8_t halfV[256];\
1160 uint8_t halfHV[256];\
1161 copy_block17(full, src, 24, stride, 17);\
1162 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1163 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1164 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1165 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1166 }\
1167 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1168 uint8_t full[24*17];\
1169 uint8_t halfH[272];\
1170 uint8_t halfHV[256];\
1171 copy_block17(full, src, 24, stride, 17);\
1172 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1173 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1174 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1175 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1176 }\
1177 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1178 uint8_t full[24*17];\
1179 uint8_t halfH[272];\
1180 uint8_t halfV[256];\
1181 uint8_t halfHV[256];\
1182 copy_block17(full, src, 24, stride, 17);\
1183 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1184 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1185 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1186 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1187 }\
1188 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1189 uint8_t full[24*17];\
1190 uint8_t halfH[272];\
1191 uint8_t halfHV[256];\
1192 copy_block17(full, src, 24, stride, 17);\
1193 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1194 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1195 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1196 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1197 }\
1198 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1199 uint8_t full[24*17];\
1200 uint8_t halfH[272];\
1201 uint8_t halfV[256];\
1202 uint8_t halfHV[256];\
1203 copy_block17(full, src, 24, stride, 17);\
1204 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1205 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1206 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1207 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1208 }\
1209 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1210 uint8_t full[24*17];\
1211 uint8_t halfH[272];\
1212 uint8_t halfHV[256];\
1213 copy_block17(full, src, 24, stride, 17);\
1214 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1215 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1216 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1217 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1218 }\
1219 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1220 uint8_t halfH[272];\
1221 uint8_t halfHV[256];\
1222 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1223 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1224 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1225 }\
1226 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1227 uint8_t halfH[272];\
1228 uint8_t halfHV[256];\
1229 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1230 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1231 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1232 }\
1233 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1234 uint8_t full[24*17];\
1235 uint8_t halfH[272];\
1236 uint8_t halfV[256];\
1237 uint8_t halfHV[256];\
1238 copy_block17(full, src, 24, stride, 17);\
1239 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1240 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1241 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1242 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1243 }\
1244 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1245 uint8_t full[24*17];\
1246 uint8_t halfH[272];\
1247 copy_block17(full, src, 24, stride, 17);\
1248 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1249 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1250 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1251 }\
1252 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1253 uint8_t full[24*17];\
1254 uint8_t halfH[272];\
1255 uint8_t halfV[256];\
1256 uint8_t halfHV[256];\
1257 copy_block17(full, src, 24, stride, 17);\
1258 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1259 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1260 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1261 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1262 }\
1263 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1264 uint8_t full[24*17];\
1265 uint8_t halfH[272];\
1266 copy_block17(full, src, 24, stride, 17);\
1267 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1268 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1269 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1270 }\
1271 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1272 uint8_t halfH[272];\
1273 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1274 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1275 }
1276
/* Per-pixel store operations plugged into the QPEL_MC template above.
 * b is a filter accumulator scaled by 32; +16 (round) or +15 (no-round)
 * then >>5 rescales it, and cm[] clips to 0..255. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the full set of quarter-pel MC functions for the
 * put, put_no_rnd and avg variants. */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1290
/* The mc00 (integer-pel) position is a plain block copy, so reuse the
 * generic copy/average helpers; rounding is irrelevant for a copy, hence
 * the no_rnd variants map onto the same plain functions.
 * Fix: the no_rnd 16x16 alias referenced ff_put_pixels16x16_8_c — the "_8"
 * bit-depth suffix belongs to the static template functions, not to the
 * exported ff_-prefixed wrappers used by the other five aliases. */
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1297
1298 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1299 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1300 int i;
1301
1302 for(i=0; i<h; i++){
1303 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1304 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1305 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1306 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1307 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1308 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1309 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1310 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1311 dst+=dstStride;
1312 src+=srcStride;
1313 }
1314 }
1315
#if CONFIG_RV40_DECODER
/* RV40's (3,3) quarter-pel position is computed as a plain half-pel
 * bilinear average, so these wrappers just forward to the xy2 copies. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1330
1331 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1332 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1333 int i;
1334
1335 for(i=0; i<w; i++){
1336 const int src_1= src[ -srcStride];
1337 const int src0 = src[0 ];
1338 const int src1 = src[ srcStride];
1339 const int src2 = src[2*srcStride];
1340 const int src3 = src[3*srcStride];
1341 const int src4 = src[4*srcStride];
1342 const int src5 = src[5*srcStride];
1343 const int src6 = src[6*srcStride];
1344 const int src7 = src[7*srcStride];
1345 const int src8 = src[8*srcStride];
1346 const int src9 = src[9*srcStride];
1347 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1348 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1349 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1350 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1351 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1352 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1353 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1354 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
1355 src++;
1356 dst++;
1357 }
1358 }
1359
/* MSPEL (1,0): quarter-pel left position — average of the source block
 * and its horizontally lowpass-filtered version. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[64];

    wmv2_mspel8_h_lowpass(halfH, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, halfH, stride, stride, 8, 8);
}
1365
/* MSPEL (2,0): horizontal half-pel — just the horizontal lowpass filter. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1369
/* MSPEL (3,0): quarter-pel right position — average of the source shifted
 * one pixel right and the horizontally lowpass-filtered block. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[64];

    wmv2_mspel8_h_lowpass(halfH, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, halfH, stride, stride, 8, 8);
}
1375
/* MSPEL (0,2): vertical half-pel — just the vertical lowpass filter. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1379
/* MSPEL (1,2): average of the vertically filtered block and the
 * horizontally-then-vertically filtered block.  The horizontal pass covers
 * 11 rows starting one line above the block so the vertical pass has the
 * context (rows -1..9) it needs. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hbuf[88];   /* 11 rows of 8: horizontal lowpass of src-stride.. */
    uint8_t vbuf[64];   /* vertical lowpass of src */
    uint8_t hvbuf[64];  /* vertical lowpass of hbuf */

    wmv2_mspel8_h_lowpass(hbuf, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vbuf, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvbuf, hbuf + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vbuf, hvbuf, stride, 8, 8, 8);
}
/* MSPEL (3,2): like mc12 but the pure vertical pass runs on the source
 * shifted one pixel to the right. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hbuf[88];   /* 11 rows of 8: horizontal lowpass of src-stride.. */
    uint8_t vbuf[64];   /* vertical lowpass of src+1 */
    uint8_t hvbuf[64];  /* vertical lowpass of hbuf */

    wmv2_mspel8_h_lowpass(hbuf, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vbuf, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvbuf, hbuf + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vbuf, hvbuf, stride, 8, 8, 8);
}
/* MSPEL (2,2): half-pel in both directions — horizontal lowpass over 11
 * rows, then vertical lowpass directly into dst (skipping the first of the
 * 11 rows, which only provided vertical context). */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t hbuf[88];

    wmv2_mspel8_h_lowpass(hbuf, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hbuf + 8, stride, 8, 8);
}
1403
/**
 * Deblocking filter across a horizontal block edge: for each of 8 columns,
 * filters the two pixels above (p0, p1) and below (p2, p3) the edge.
 * Strength is looked up from the quantizer; appears to follow the H.263
 * Annex J style deblocking (table defined elsewhere — verify there).
 * Operates in place on src, which points at the first row below the edge.
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        /* edge gradient: large |d| means a real image edge, small |d|
           means blocking artifact to be smoothed */
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear correction: ramps up to |d|==strength, ramps
           back down to zero at 2*strength, so true edges are untouched */
        if (d<-2*strength) d1= 0;
        else if(d<- strength) d1=-2*strength - d;
        else if(d< strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branchless-style clamp to 0..255: any value outside 8 bits has
           bit 8 set (or is negative); negative -> 0, >255 -> 255 */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        /* secondary, weaker correction of the outer pixels, limited to
           half the magnitude applied to the inner pair */
        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+ stride] = p3 + d2;
    }
    }
}
1440
/**
 * Deblocking filter across a vertical block edge: same algorithm as
 * h263_v_loop_filter_c but applied horizontally — for each of 8 rows it
 * filters the two pixels left (p0, p1) and right (p2, p3) of the edge.
 * Operates in place; src points at the first column right of the edge.
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        /* edge gradient; see h263_v_loop_filter_c for the rationale */
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear correction, zero beyond 2*strength */
        if (d<-2*strength) d1= 0;
        else if(d<- strength) d1=-2*strength - d;
        else if(d< strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else d1= 0;

        p1 += d1;
        p2 -= d1;
        /* clamp to 0..255 via the sign/overflow bit */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        /* weaker correction of the outer pixels */
        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}
1477
/**
 * H.261 in-loop filter: separable [1 2 1]/4 smoothing of an 8x8 block,
 * applied vertically into temp[] and then horizontally back into src.
 * Border rows/columns (whose filter taps would fall outside the block)
 * are passed through unchanged, only rescaled.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* vertical pass into temp[], kept scaled by 4; rows 0 and 7 are
       copied (x4) since they have no neighbour above/below */
    for(x=0; x<8; x++){
        temp[x ] = 4*src[x ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    /* horizontal pass back into src with rounding; columns 0 and 7 only
       undo the x4 scale from the vertical pass */
    for(y=0; y<8; y++){
        src[ y*stride] = (temp[ y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
1504
/* Sum of absolute differences (SAD) between two 16-pixel-wide blocks of
 * height h.  The first argument (context) is unused. */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - pix2[j]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1532
/* SAD of pix1 against the horizontally half-pel interpolated pix2
 * (rounded average of each pixel and its right neighbour). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - avg2(pix2[j], pix2[j+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1560
/* SAD of pix1 against the vertically half-pel interpolated pix2
 * (rounded average of each pixel and the one a line below). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, i, j;
    uint8_t *below = pix2 + line_size;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - avg2(pix2[j], below[j]));
        pix1 += line_size;
        pix2 += line_size;
        below += line_size;
    }
    return sum;
}
1590
/* SAD of pix1 against the half-pel interpolated pix2 in both directions
 * (rounded average of the 2x2 neighbourhood). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, i, j;
    uint8_t *below = pix2 + line_size;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - avg4(pix2[j], pix2[j+1], below[j], below[j+1]));
        pix1 += line_size;
        pix2 += line_size;
        below += line_size;
    }
    return sum;
}
1620
/* Sum of absolute differences (SAD) between two 8-pixel-wide blocks of
 * height h.  The first argument (context) is unused. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - pix2[j]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1640
/* 8-wide SAD against the horizontally half-pel interpolated reference. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - avg2(pix2[j], pix2[j+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
1660
/* 8-wide SAD against the vertically half-pel interpolated reference. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, i, j;
    uint8_t *below = pix2 + line_size;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - avg2(pix2[j], below[j]));
        pix1 += line_size;
        pix2 += line_size;
        below += line_size;
    }
    return sum;
}
1682
/* 8-wide SAD against the 2x2-averaged (half-pel both ways) reference. */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0, i, j;
    uint8_t *below = pix2 + line_size;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - avg4(pix2[j], pix2[j+1], below[j], below[j+1]));
        pix1 += line_size;
        pix2 += line_size;
        below += line_size;
    }
    return sum;
}
1704
1705 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1706 MpegEncContext *c = v;
1707 int score1=0;
1708 int score2=0;
1709 int x,y;
1710
1711 for(y=0; y<h; y++){
1712 for(x=0; x<16; x++){
1713 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1714 }
1715 if(y+1<h){
1716 for(x=0; x<15; x++){
1717 score2+= FFABS( s1[x ] - s1[x +stride]
1718 - s1[x+1] + s1[x+1+stride])
1719 -FFABS( s2[x ] - s2[x +stride]
1720 - s2[x+1] + s2[x+1+stride]);
1721 }
1722 }
1723 s1+= stride;
1724 s2+= stride;
1725 }
1726
1727 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1728 else return score1 + FFABS(score2)*8;
1729 }
1730
1731 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1732 MpegEncContext *c = v;
1733 int score1=0;
1734 int score2=0;
1735 int x,y;
1736
1737 for(y=0; y<h; y++){
1738 for(x=0; x<8; x++){
1739 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1740 }
1741 if(y+1<h){
1742 for(x=0; x<7; x++){
1743 score2+= FFABS( s1[x ] - s1[x +stride]
1744 - s1[x+1] + s1[x+1+stride])
1745 -FFABS( s2[x ] - s2[x +stride]
1746 - s2[x+1] + s2[x+1+stride]);
1747 }
1748 }
1749 s1+= stride;
1750 s2+= stride;
1751 }
1752
1753 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1754 else return score1 + FFABS(score2)*8;
1755 }
1756
/**
 * Estimate the weighted squared cost of adding scale*basis to the
 * residual rem (fixed-point: BASIS_SHIFT/RECON_SHIFT scaling from
 * dsputil.h).  Used by the encoder's trellis/RD coefficient search.
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        /* candidate residual: rem + round(basis*scale / 2^(BASIS-RECON)) */
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
1771
/**
 * Add scale*basis (rounded, with the same fixed-point scaling as
 * try_8x8basis_c) into the residual block rem, in place.
 */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
1779
1780 /**
1781 * Permute an 8x8 block.
1782 * @param block the block which will be permuted according to the given permutation vector
1783 * @param permutation the permutation vector
1784 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1785 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1786 * (inverse) permutated to scantable order!
1787 */
1788 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1789 {
1790 int i;
1791 DCTELEM temp[64];
1792
1793 if(last<=0) return;
1794 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1795
1796 for(i=0; i<=last; i++){
1797 const int j= scantable[i];
1798 temp[j]= block[j];
1799 block[j]=0;
1800 }
1801
1802 for(i=0; i<=last; i++){
1803 const int j= scantable[i];
1804 const int perm_j= permutation[j];
1805 block[perm_j]= temp[j];
1806 }
1807 }
1808
/* Comparison function for FF_CMP_ZERO: every candidate scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
1812
/**
 * Fill the cmp[0..5] function-pointer array (one entry per block size)
 * with the comparison functions selected by the FF_CMP_* id in the low
 * byte of type.  Unknown ids leave the entries NULL and log an error.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    /* NOTE(review): assumes me_cmp_func has the size of a data pointer */
    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_DWT
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
1872
1873 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1874 long i;
1875 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1876 long a = *(long*)(src+i);
1877 long b = *(long*)(dst+i);
1878 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1879 }
1880 for(; i<w; i++)
1881 dst[i+0] += src[i+0];
1882 }
1883
1884 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1885 long i;
1886 #if !HAVE_FAST_UNALIGNED
1887 if((long)src2 & (sizeof(long)-1)){
1888 for(i=0; i+7<w; i+=8){
1889 dst[i+0] = src1[i+0]-src2[i+0];
1890 dst[i+1] = src1[i+1]-src2[i+1];
1891 dst[i+2] = src1[i+2]-src2[i+2];
1892 dst[i+3] = src1[i+3]-src2[i+3];
1893 dst[i+4] = src1[i+4]-src2[i+4];
1894 dst[i+5] = src1[i+5]-src2[i+5];
1895 dst[i+6] = src1[i+6]-src2[i+6];
1896 dst[i+7] = src1[i+7]-src2[i+7];
1897 }
1898 }else
1899 #endif
1900 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1901 long a = *(long*)(src1+i);
1902 long b = *(long*)(src2+i);
1903 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1904 }
1905 for(; i<w; i++)
1906 dst[i+0] = src1[i+0]-src2[i+0];
1907 }
1908
/**
 * HuffYUV-style median prediction decode:
 * dst[i] = median(left, top, left + top - topleft) + diff[i],
 * where src1 supplies the row above (top) and left/left_top carry the
 * running predictor state across calls (updated on return).
 * mid_pred() returns the median of its three arguments.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        /* gradient term is reduced mod 256 before taking the median */
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];          /* current top becomes next top-left */
        dst[i]= l;
    }

    /* write back predictor state for the next call */
    *left= l;
    *left_top= lt;
}
1925
/**
 * HuffYUV-style median prediction encode (inverse of
 * add_hfyu_median_prediction_c): dst[i] = src2[i] - median prediction,
 * with src1 supplying the row above and left/left_top the running state.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];          /* current top becomes next top-left */
        l= src2[i];           /* actual pixel becomes next left */
        dst[i]= l - pred;     /* store residual */
    }

    /* write back predictor state for the next call */
    *left= l;
    *left_top= lt;
}
1943
/**
 * Left-prediction decode: running sum of src into dst (dst stores the low
 * 8 bits of the accumulator), starting from @p acc.
 * @return the accumulator after the last element (not masked to 8 bits,
 *         matching the historical behaviour).
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i = 0;

    /* two elements per iteration */
    while (i < w - 1) {
        acc += src[i];
        dst[i] = acc;
        i++;
        acc += src[i];
        dst[i] = acc;
        i++;
    }

    /* at most one leftover element when w is odd */
    while (i < w) {
        acc += src[i];
        dst[i] = acc;
        i++;
    }

    return acc;
}
1962
/* byte offsets of the B/G/R/A channels inside a 32-bit BGRA pixel */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Left-prediction decode for BGR32 pixels: each channel is a running sum
 * of the corresponding source channel, seeded from and written back to
 * *red/*green/*blue/*alpha.
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int r = *red, g = *green, b = *blue, a = *alpha;
    int i;

    for (i = 0; i < w; i++) {
        const uint8_t *s = src + 4*i;
        uint8_t       *d = dst + 4*i;

        b += s[B]; d[B] = b;
        g += s[G]; d[G] = g;
        r += s[R]; d[R] = r;
        a += s[A]; d[A] = a;
    }

    /* hand the running sums back to the caller */
    *red   = r;
    *green = g;
    *blue  = b;
    *alpha = a;
}
#undef B
#undef G
#undef R
#undef A
2003
/* 2-point butterfly writing to separate outputs */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* in-place 2-point butterfly */
#define BUTTERFLY1(x,y) \
    {\
        int a,b;\
        a= x;\
        b= y;\
        x= a+b;\
        y= a-b;\
    }

/* |x+y| + |x-y|: final butterfly folded into the absolute-value sum */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

/**
 * SATD of an 8x8 block pair: 8x8 Hadamard transform of (src - dst),
 * then sum of absolute transformed coefficients.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 8-point Hadamard on each row of differences */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: 8-point Hadamard on each column, last stage fused
     * into BUTTERFLYA which accumulates absolute values */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
2063
/**
 * Intra SATD of one 8x8 block: 8x8 Hadamard transform of the source
 * pixels themselves (no reference block), sum of absolute coefficients,
 * with the DC term removed at the end.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass, last stage fused into the absolute-value sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    /* subtract the DC coefficient so the block mean does not bias the score */
    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2111
/**
 * DCT-based cost of an 8x8 block pair: forward DCT of the pixel
 * difference, then sum of absolute coefficients.
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);  /* temp = src1 - src2 */
    s->dsp.fdct(temp);                             /* forward DCT in place */
    return s->dsp.sum_abs_dctelem(temp);
}
2122
#if CONFIG_GPL
/* 1-D 8-point integer transform (H.264 high-profile style butterflies).
 * Reads/writes through the SRC()/DST() macros, which are (re)defined at
 * each use site below to select row or column access. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Cost metric: 2-D 8x8 transform (DCT8_1D over rows, then columns) of the
 * pixel difference; the column pass accumulates absolute values directly
 * through the DST() macro instead of storing them.
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

/* row transform, in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

/* column transform; outputs folded into the absolute-value sum */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
2175
/**
 * Cost metric: largest absolute DCT coefficient of the 8x8 pixel
 * difference between src1 and src2.
 */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    /* track the peak |coefficient| */
    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
2191
/**
 * PSNR-style cost: squared error introduced by a quantize/dequantize/IDCT
 * round trip of the 8x8 pixel difference, compared against the original
 * difference saved in bak.
 * NOTE(review): s->fast_dct_quantize presumably performs the forward DCT
 * together with quantization — confirm in mpegvideo.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
    DCTELEM * const bak = temp+64;   /* second half of the buffer keeps the original diff */
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;                   /* force inter quantization path */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct_8(temp); //FIXME

    /* squared error between round-tripped and original difference */
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
2214
/**
 * Rate-distortion cost of coding the 8x8 difference between src1 and src2:
 * quantize the difference, count the VLC bits it would take, reconstruct
 * it, and return SSE distortion plus a bit-cost term scaled by qscale^2.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on local stride-8 copies so lsrc2 can be reconstructed into */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* select the VLC length tables; intra codes the DC separately */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        /* count bits for every coefficient before the last one */
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;   /* bias into table index range */
                /* levels in [-64,63] use the VLC table, others escape */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* the last coefficient must be non-zero */

        /* the last coefficient uses the "last" VLC table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct: dequantize and add back onto lsrc2 */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    /* distortion + bits * qscale^2 * (109/128): rate term weighted by
     * an approximate lambda */
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2290
/**
 * Rate-only cost: number of VLC bits needed to code the quantized 8x8
 * difference between src1 and src2 (same counting loop as rd8x8_c but
 * without reconstruction/distortion).
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* select the VLC length tables; intra codes the DC separately */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;   /* bias into table index range */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* the last coefficient must be non-zero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2349
/* Generate vsad_intra<size>_c: sum of absolute differences between each
 * row and the row above it within a single block (vertical activity
 * measure, no reference block; the dummy argument is unused). */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
    \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
    \
    return score; \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2367
/**
 * Vertical SAD between two 16-wide blocks: sum over rows y>=1 of
 * |(s1[y] - s2[y]) - (s1[y-1] - s2[y-1])| per column.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = s1[x] - s2[x] - s1[x + stride] + s2[x + stride];
            total += d >= 0 ? d : -d;   /* FFABS, spelled out */
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2382
/* square helper for the VSSE metrics */
#define SQ(a) ((a)*(a))
/* Generate vsse_intra<size>_c: sum of squared differences between each
 * row and the row above it within a single block (the dummy argument is
 * unused). */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
    \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
    \
    return score; \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2401
/**
 * Vertical SSE between two 16-wide blocks: sum over rows y>=1 of
 * ((s1[y] - s2[y]) - (s1[y-1] - s2[y-1]))^2 per column.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            const int d = s1[x] - s2[x] - s1[x + stride] + s2[x + stride];
            total += d * d;   /* SQ(), spelled out */
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
2416
/**
 * Sum of squared differences between an int8 vector and an int16 vector.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int sum = 0;
    int n;

    for (n = 0; n < size; n++) {
        const int d = pix1[n] - pix2[n];
        sum += d * d;
    }

    return sum;
}
2425
/* Instantiate 16x16 comparison functions from the 8x8 kernels above.
 * NOTE(review): WRAPPER8_16_SQ is defined elsewhere (dsputil.h) —
 * presumably it sums the scores of the four 8x8 sub-blocks; confirm there. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2436
/** Element-wise product: dst[n] = src0[n] * src1[n]. */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int n;

    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[n];
}
2442
/** Element-wise product with src1 read backwards:
 *  dst[n] = src0[n] * src1[len-1-n]. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int n;

    for (n = 0; n < len; n++)
        dst[n] = src0[n] * src1[len - 1 - n];
}
2449
/** Fused multiply-add over vectors: dst[n] = src0[n] * src1[n] + src2[n]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int n;

    for (n = 0; n < len; n++) {
        const float prod = src0[n] * src1[n];
        dst[n] = prod + src2[n];
    }
}
2455
/**
 * Windowed overlap-add used for MDCT windowing: combines src0 (read
 * forward) and src1 (read backward) through the symmetric window win,
 * writing 2*len outputs around the dst+len midpoint.
 * dst, win and src0 are advanced by len so that i indexes the first half
 * with negative offsets and j the second half with positive ones.
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    /* i walks -len..-1, j walks len-1..0; each iteration produces one
     * output in each half from a mirrored pair of window taps */
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi;
        dst[j] = s0*wi + s1*wj;
    }
}
2472
/** Scale a float vector: dst[n] = src[n] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int n = 0;

    while (n < len) {
        dst[n] = src[n] * mul;
        n++;
    }
}
2480
/** Multiply-accumulate a scaled vector: dst[n] += src[n] * mul. */
static void vector_fmac_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int n = 0;

    while (n < len) {
        dst[n] += src[n] * mul;
        n++;
    }
}
2488
/** In-place butterflies: (v1[n], v2[n]) <- (v1[n]+v2[n], v1[n]-v2[n]). */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int n;

    for (n = 0; n < len; n++) {
        const float a = v1[n];
        const float b = v2[n];
        v1[n] = a + b;
        v2[n] = a - b;
    }
}
2499
/** Butterfly with interleaved output:
 *  dst[2n] = src0[n]+src1[n], dst[2n+1] = src0[n]-src1[n]. */
static void butterflies_float_interleave_c(float *dst, const float *src0,
                                           const float *src1, int len)
{
    int n;

    for (n = 0; n < len; n++) {
        const float a = src0[n];
        const float b = src1[n];
        dst[2*n]     = a + b;
        dst[2*n + 1] = a - b;
    }
}
2511
/** Dot product of two float vectors (accumulated left to right). */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    int n;
    float acc = 0.0f;

    for (n = 0; n < len; n++)
        acc += v1[n] * v2[n];

    return acc;
}
2522
/* Clip one float, operating purely on its IEEE-754 bit pattern.
 * Only valid for the min < 0 < max case (see the caller below):
 * mini/maxi are the bit patterns of min/max and maxisign is maxi with the
 * sign bit flipped. As unsigned integers, every negative float's bits
 * exceed every positive float's bits, so a > mini selects negative values
 * below min; flipping a's sign bit then lets positive values above max be
 * caught by the second comparison. */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;                     /* negative and < min -> min */
    else if((a^(1U<<31)) > maxisign) return maxi; /* positive and > max -> max */
    else return a;                                /* already in range */
}
2531
/* Clip a float vector to [*min, *max] for the min < 0 < max case by
 * comparing raw IEEE-754 bit patterns (see clipf_c_one).
 * NOTE(review): floats are reinterpreted via pointer casts (type punning,
 * strict-aliasing caveat) and the loop is unrolled by 8 — len is assumed
 * to be a multiple of 8; confirm against callers. */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;            /* bit pattern of min (negative) */
    uint32_t maxi = *(uint32_t*)max;            /* bit pattern of max (positive) */
    uint32_t maxisign = maxi ^ (1U<<31);        /* max with the sign bit flipped */
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/* Clip every element of src to [min, max] into dst.
 * Dispatches to the bit-pattern fast path when min and max straddle zero,
 * otherwise clips with av_clipf. Unrolled by 8 — len is assumed to be a
 * multiple of 8 (NOTE(review): confirm against callers). */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
2567
/** Dot product of two int16 vectors; each product is shifted right by
 *  @p shift before being accumulated. */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int i, res = 0;

    for (i = 0; i < order; i++)
        res += (v1[i] * v2[i]) >> shift;

    return res;
}
2577
/** Dot product of v1 and v2, while simultaneously updating
 *  v1[i] += mul * v3[i] in place. The product uses the ORIGINAL v1[i]. */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int i, res = 0;

    for (i = 0; i < order; i++) {
        res   += v1[i] * v2[i];   /* accumulate before v1 is modified */
        v1[i] += mul * v3[i];
    }

    return res;
}
2587
/* Apply a symmetric int16 window with rounding:
 * output[i]       = round(input[i]       * window[i] / 2^15)
 * output[len-1-i] = round(input[len-1-i] * window[i] / 2^15)
 * Only the first len/2 window taps are read; the second half of the
 * output reuses them mirrored. NOTE(review): if len is odd the middle
 * sample is left untouched — confirm callers always pass even len. */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int i;
    int len2 = len >> 1;

    for (i = 0; i < len2; i++) {
        int16_t w = window[i];
        /* (1 << 14) is the rounding offset for the >> 15 */
        output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
    }
}
2600
/**
 * Clip every element of src to [min, max] into dst.
 * Processes 8 elements per outer iteration like the original unrolled
 * loop; len must be a positive multiple of 8 (at least one group of 8 is
 * always processed, matching the original do/while).
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int i;
        for (i = 0; i < 8; i++) {
            const int32_t v = *src++;
            /* clamp into [min, max] */
            *dst++ = v < min ? min : (v > max ? max : v);
        }
        len -= 8;
    } while (len > 0);
}
2616
/* fixed-point DCT basis weights: W<k> = 2048*sqrt(2)*cos(k*pi/16)
 * (W0/W4 are 2048 exactly) */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/* 1-D inverse DCT over one row of 8 coefficients, in place.
 * Outputs are rounded and scaled down by 2^8. */
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1: weighted butterfly pairs of the odd/even coefficients */
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2: 181/256 approximates 1/sqrt(2), with rounding */
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3: final butterflies, rounded down-shift by 8 */
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
/* 1-D inverse DCT over one column (stride 8), in place.
 * Same structure as wmv2_idct_row but keeps 3 extra bits of intermediate
 * precision (>>3 with rounding in step 1, >>14 at the end). */
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2: 181/256 approximates 1/sqrt(2), with rounding */
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3: final butterflies, rounded down-shift by 14 */
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
/* Full 2-D WMV2 inverse DCT: 1-D pass over all 8 rows, then all 8 columns. */
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
/* IDCT then clamp-and-store the result into dest (WMV2 variant). */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
/* IDCT then clamp-and-add the result onto dest (WMV2 variant). */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
/* IDCT then clamp-and-store, using the IJG reference integer IDCT. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
/* IDCT then clamp-and-add, using the IJG reference integer IDCT. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
2711
/* lowres=1 (4x4) IDCT put/add wrappers */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

/* lowres=2 (2x2) IDCT put/add wrappers */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
2733
/* lowres=3 (1x1) "IDCT": only the DC coefficient matters; scale it down
 * with rounding and clamp via the crop table. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
2746
/* Intentionally empty function; every parameter is ignored.
 * NOTE(review): presumably a placeholder/benchmarking stub — confirm callers. */
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2748
/* init static data */
/**
 * Initialize the global lookup tables shared by all DSP contexts:
 * ff_cropTbl (clamp-to-[0,255] with MAX_NEG_CROP guard bands on both
 * sides), ff_squareTbl ((i-256)^2 for i in [0,512)) and the inverse
 * zigzag table. Must run before the tables are used.
 */
av_cold void dsputil_static_init(void)
{
    int i;

    /* identity in the middle, clamped to 0/255 in the guard bands */
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    /* inverse zigzag scan, stored with a +1 bias */
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}
2766
/**
 * Verify that the compiler 16-byte-aligns stack variables as the SIMD
 * code requires.
 * @return 0 when alignment is correct, -1 otherwise (the warning is
 *         printed only once per process).
 */
int ff_check_alignment(void){
    static int did_fail=0;
    LOCAL_ALIGNED_16(int, aligned, [4]);

    if((intptr_t)aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to Libav developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
2786
2787 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2788 {
2789 int i;
2790
2791 ff_check_alignment();
2792
2793 #if CONFIG_ENCODERS
2794 if (avctx->bits_per_raw_sample == 10) {
2795 c->fdct = ff_jpeg_fdct_islow_10;
2796 c->fdct248 = ff_fdct248_islow_10;
2797 } else {
2798 if(avctx->dct_algo==FF_DCT_FASTINT) {
2799 c->fdct = fdct_ifast;
2800 c->fdct248 = fdct_ifast248;
2801 }
2802 else if(avctx->dct_algo==FF_DCT_FAAN) {
2803 c->fdct = ff_faandct;
2804 c->fdct248 = ff_faandct248;
2805 }
2806 else {
2807 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2808 c->fdct248 = ff_fdct248_islow_8;
2809 }
2810 }
2811 #endif //CONFIG_ENCODERS
2812
2813 if(avctx->lowres==1){
2814 c->idct_put= ff_jref_idct4_put;
2815 c->idct_add= ff_jref_idct4_add;
2816 c->idct = j_rev_dct4;
2817 c->idct_permutation_type= FF_NO_IDCT_PERM;
2818 }else if(avctx->lowres==2){
2819 c->idct_put= ff_jref_idct2_put;
2820 c->idct_add= ff_jref_idct2_add;
2821 c->idct = j_rev_dct2;
2822 c->idct_permutation_type= FF_NO_IDCT_PERM;
2823 }else if(avctx->lowres==3){
2824 c->idct_put= ff_jref_idct1_put;
2825 c->idct_add= ff_jref_idct1_add;
2826 c->idct = j_rev_dct1;
2827 c->idct_permutation_type= FF_NO_IDCT_PERM;
2828 }else{
2829 if (avctx->bits_per_raw_sample == 10) {
2830 c->idct_put = ff_simple_idct_put_10;
2831 c->idct_add = ff_simple_idct_add_10;
2832 c->idct = ff_simple_idct_10;
2833 c->idct_permutation_type = FF_NO_IDCT_PERM;
2834 } else {
2835 if(avctx->idct_algo==FF_IDCT_INT){
2836 c->idct_put= ff_jref_idct_put;
2837 c->idct_add= ff_jref_idct_add;
2838 c->idct = j_rev_dct;
2839 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2840 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2841 avctx->idct_algo==FF_IDCT_VP3){
2842 c->idct_put= ff_vp3_idct_put_c;
2843 c->idct_add= ff_vp3_idct_add_c;
2844 c->idct = ff_vp3_idct_c;
2845 c->idct_permutation_type= FF_NO_IDCT_PERM;
2846 }else if(avctx->idct_algo==FF_IDCT_WMV2){
2847 c->idct_put= ff_wmv2_idct_put_c;
2848 c->idct_add= ff_wmv2_idct_add_c;
2849 c->idct = ff_wmv2_idct_c;
2850 c->idct_permutation_type= FF_NO_IDCT_PERM;
2851 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2852 c->idct_put= ff_faanidct_put;
2853 c->idct_add= ff_faanidct_add;
2854 c->idct = ff_faanidct;
2855 c->idct_permutation_type= FF_NO_IDCT_PERM;
2856 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2857 c->idct_put= ff_ea_idct_put_c;
2858 c->idct_permutation_type= FF_NO_IDCT_PERM;
2859 }else{ //accurate/default
2860 c->idct_put = ff_simple_idct_put_8;
2861 c->idct_add = ff_simple_idct_add_8;
2862 c->idct = ff_simple_idct_8;
2863 c->idct_permutation_type= FF_NO_IDCT_PERM;
2864 }
2865 }
2866 }
2867
2868 c->diff_pixels = diff_pixels_c;
2869 c->put_pixels_clamped = ff_put_pixels_clamped_c;
2870 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2871 c->add_pixels_clamped = ff_add_pixels_clamped_c;
2872 c->sum_abs_dctelem = sum_abs_dctelem_c;
2873 c->gmc1 = gmc1_c;
2874 c->gmc = ff_gmc_c;
2875 c->pix_sum = pix_sum_c;
2876 c->pix_norm1 = pix_norm1_c;
2877
2878 c->fill_block_tab[0] = fill_block16_c;
2879 c->fill_block_tab[1] = fill_block8_c;
2880
2881 /* TODO [0] 16 [1] 8 */
2882 c->pix_abs[0][0] = pix_abs16_c;
2883 c->pix_abs[0][1] = pix_abs16_x2_c;
2884 c->pix_abs[0][2] = pix_abs16_y2_c;
2885 c->pix_abs[0][3] = pix_abs16_xy2_c;
2886 c->pix_abs[1][0] = pix_abs8_c;
2887 c->pix_abs[1][1] = pix_abs8_x2_c;
2888 c->pix_abs[1][2] = pix_abs8_y2_c;
2889 c->pix_abs[1][3] = pix_abs8_xy2_c;
2890
/* Third-pel (tpel) MC tables; index encodes the fractional offset as
 * dy*4 + dx with dx,dy in {0,1,2}, so slots 3, 7 and 11-15 stay unset. */
2891 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2892 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2893 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2894 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2895 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2896 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2897 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2898 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2899 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2900
/* Same layout as above, averaging ("avg") variants. */
2901 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2902 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2903 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2904 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2905 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2906 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2907 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2908 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2909 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2910
/* Fill one quarter-pel MC table (16 entries, index = dy*4 + dx) by
 * token-pasting PFX, block size NUM and the mc<dx><dy> suffix.
 * NOTE: comments must not be placed inside the macro body — each line
 * ends in a backslash continuation. */
2911 #define dspfunc(PFX, IDX, NUM) \
2912     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2913     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2914     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2915     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2916     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2917     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2918     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2919     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2920     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2921     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2922     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2923     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2924     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2925     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2926     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2927     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2928
/* Index 0 = 16x16 blocks, index 1 = 8x8 blocks.  The avg_no_rnd qpel
 * variants are intentionally left unset (no C implementation wired). */
2929     dspfunc(put_qpel, 0, 16);
2930     dspfunc(put_no_rnd_qpel, 0, 16);
2931
2932     dspfunc(avg_qpel, 0, 16);
2933     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2934
2935     dspfunc(put_qpel, 1, 8);
2936     dspfunc(put_no_rnd_qpel, 1, 8);
2937
2938     dspfunc(avg_qpel, 1, 8);
2939     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2940
2941 #undef dspfunc
2942
/* Codec-specific sub-inits, compiled in only when the decoders are enabled. */
2943 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2944     ff_mlp_init(c, avctx);
2945 #endif
2946 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2947     ff_intrax8dsp_init(c,avctx);
2948 #endif
2949
/* mspel (presumably WMV2's special half-pel filter set — only the
 * mc<x>0 / mc<x>2 positions are needed; slot 0 is a plain 8x8 copy). */
2950     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2951     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2952     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2953     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2954     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2955     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2956     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2957     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2958
/* Comparison (motion-estimation metric) tables: slot 0 = 16x16,
 * slot 1 = 8x8; slots 4/5 hold the intra variants where they exist. */
2959 #define SET_CMP_FUNC(name) \
2960     c->name[0]= name ## 16_c;\
2961     c->name[1]= name ## 8x8_c;
2962
2963     SET_CMP_FUNC(hadamard8_diff)
2964     c->hadamard8_diff[4]= hadamard8_intra16_c;
2965     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2966     SET_CMP_FUNC(dct_sad)
2967     SET_CMP_FUNC(dct_max)
2968 #if CONFIG_GPL
2969     SET_CMP_FUNC(dct264_sad)
2970 #endif
2971     c->sad[0]= pix_abs16_c;
2972     c->sad[1]= pix_abs8_c;
2973     c->sse[0]= sse16_c;
2974     c->sse[1]= sse8_c;
2975     c->sse[2]= sse4_c;
2976     SET_CMP_FUNC(quant_psnr)
2977     SET_CMP_FUNC(rd)
2978     SET_CMP_FUNC(bit)
2979     c->vsad[0]= vsad16_c;
2980     c->vsad[4]= vsad_intra16_c;
2981     c->vsad[5]= vsad_intra8_c;
2982     c->vsse[0]= vsse16_c;
2983     c->vsse[4]= vsse_intra16_c;
2984     c->vsse[5]= vsse_intra8_c;
2985     c->nsse[0]= nsse16_c;
2986     c->nsse[1]= nsse8_c;
2987 #if CONFIG_DWT
2988     ff_dsputil_init_dwt(c);
2989 #endif
2990
2991     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2992
/* Lossless/HuffYUV-style byte prediction helpers and byte swappers. */
2993     c->add_bytes= add_bytes_c;
2994     c->diff_bytes= diff_bytes_c;
2995     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2996     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2997     c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2998     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2999     c->bswap_buf= bswap_buf;
3000     c->bswap16_buf = bswap16_buf;
3001
3002     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3003         c->h263_h_loop_filter= h263_h_loop_filter_c;
3004         c->h263_v_loop_filter= h263_v_loop_filter_c;
3005     }
3006
3007     if (CONFIG_VP3_DECODER) {
3008         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3009         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3010         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3011     }
3012
3013     c->h261_loop_filter= h261_loop_filter_c;
3014
3015     c->try_8x8basis= try_8x8basis_c;
3016     c->add_8x8basis= add_8x8basis_c;
3017
3018 #if CONFIG_VORBIS_DECODER
3019     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3020 #endif
3021 #if CONFIG_AC3_DECODER
3022     c->ac3_downmix = ff_ac3_downmix_c;
3023 #endif
/* Generic float/int vector kernels used by the audio codecs. */
3024     c->vector_fmul = vector_fmul_c;
3025     c->vector_fmul_reverse = vector_fmul_reverse_c;
3026     c->vector_fmul_add = vector_fmul_add_c;
3027     c->vector_fmul_window = vector_fmul_window_c;
3028     c->vector_clipf = vector_clipf_c;
3029     c->scalarproduct_int16 = scalarproduct_int16_c;
3030     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3031     c->apply_window_int16 = apply_window_int16_c;
3032     c->vector_clip_int32 = vector_clip_int32_c;
3033     c->scalarproduct_float = scalarproduct_float_c;
3034     c->butterflies_float = butterflies_float_c;
3035     c->butterflies_float_interleave = butterflies_float_interleave_c;
3036     c->vector_fmul_scalar = vector_fmul_scalar_c;
3037     c->vector_fmac_scalar = vector_fmac_scalar_c;
3038
/* Plane shrinkers: shrink[k] halves each dimension k times
 * (shrink[0] is a plain copy). */
3039     c->shrink[0]= av_image_copy_plane;
3040     c->shrink[1]= ff_shrink22;
3041     c->shrink[2]= ff_shrink44;
3042     c->shrink[3]= ff_shrink88;
3043
/* Prefetch is a no-op in C; arch-specific inits below may override it. */
3044     c->prefetch= just_return;
3045
/* Cleared here so the fallback loop after the arch inits can detect
 * which 2-tap qpel slots an arch implementation filled in. */
3046     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3047     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3048
/* Bit-depth name mangling: FUNC -> name_<depth>, FUNCC -> name_<depth>_c,
 * matching the symbols emitted by the dsputil_template.c inclusions. */
3049 #undef FUNC
3050 #undef FUNCC
3051 #define FUNC(f, depth) f ## _ ## depth
3052 #define FUNCC(f, depth) f ## _ ## depth ## _c
3053
/* Half-pel MC table (4 entries: copy, x2, y2, xy2) for one block size. */
3054 #define dspfunc1(PFX, IDX, NUM, depth)\
3055     c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
3056     c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3057     c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3058     c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3059
/* Depth-parameterized version of the qpel table filler above. */
3060 #define dspfunc2(PFX, IDX, NUM, depth)\
3061     c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3062     c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3063     c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3064     c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3065     c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3066     c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3067     c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3068     c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3069     c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3070     c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3071     c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3072     c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3073     c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3074     c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3075     c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3076     c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3077
3078
/* Wires every depth-dependent function pointer to the <depth>-suffixed
 * implementation; `dct` selects the _16 or _32 DCT-coefficient variant
 * of the get/clear/add block helpers. */
3079 #define BIT_DEPTH_FUNCS(depth, dct)\
3080     c->get_pixels                    = FUNCC(get_pixels   ## dct       , depth);\
3081     c->draw_edges                    = FUNCC(draw_edges               , depth);\
3082     c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc      , depth);\
3083     c->clear_block                   = FUNCC(clear_block  ## dct      , depth);\
3084     c->clear_blocks                  = FUNCC(clear_blocks ## dct      , depth);\
3085     c->add_pixels8                   = FUNCC(add_pixels8  ## dct      , depth);\
3086     c->add_pixels4                   = FUNCC(add_pixels4  ## dct      , depth);\
3087     c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
3088     c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3089 \
3090     c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
3091     c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
3092     c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
3093     c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
3094     c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
3095     c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
3096 \
3097     dspfunc1(put       , 0, 16, depth);\
3098     dspfunc1(put       , 1,  8, depth);\
3099     dspfunc1(put       , 2,  4, depth);\
3100     dspfunc1(put       , 3,  2, depth);\
3101     dspfunc1(put_no_rnd, 0, 16, depth);\
3102     dspfunc1(put_no_rnd, 1,  8, depth);\
3103     dspfunc1(avg       , 0, 16, depth);\
3104     dspfunc1(avg       , 1,  8, depth);\
3105     dspfunc1(avg       , 2,  4, depth);\
3106     dspfunc1(avg       , 3,  2, depth);\
3107     dspfunc1(avg_no_rnd, 0, 16, depth);\
3108     dspfunc1(avg_no_rnd, 1,  8, depth);\
3109 \
3110     dspfunc2(put_h264_qpel, 0, 16, depth);\
3111     dspfunc2(put_h264_qpel, 1,  8, depth);\
3112     dspfunc2(put_h264_qpel, 2,  4, depth);\
3113     dspfunc2(put_h264_qpel, 3,  2, depth);\
3114     dspfunc2(avg_h264_qpel, 0, 16, depth);\
3115     dspfunc2(avg_h264_qpel, 1,  8, depth);\
3116     dspfunc2(avg_h264_qpel, 2,  4, depth);
3117
/* Pick the implementation set for the stream's bit depth.  Unsupported
 * depths log a debug message and then deliberately fall through to the
 * 8-bit default. */
3118     switch (avctx->bits_per_raw_sample) {
3119     case 9:
3120         if (c->dct_bits == 32) {
3121             BIT_DEPTH_FUNCS(9, _32);
3122         } else {
3123             BIT_DEPTH_FUNCS(9, _16);
3124         }
3125         break;
3126     case 10:
3127         if (c->dct_bits == 32) {
3128             BIT_DEPTH_FUNCS(10, _32);
3129         } else {
3130             BIT_DEPTH_FUNCS(10, _16);
3131         }
3132         break;
3133     default:
3134         av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
        /* fall through: use the 8-bit functions as the safe default */
3135     case 8:
3136         BIT_DEPTH_FUNCS(8, _16);
3137         break;
3138     }
3139
3140
/* Architecture-specific initializers run last so they can override any of
 * the C defaults installed above with optimized versions. */
3141     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
3142     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
3143     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
3144     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
3145     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
3146     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
3147     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
3148     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
3149     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
3150
/* Any 2-tap qpel slot no arch init filled falls back to the H.264 qpel
 * function for the same position (tables were zeroed before the inits). */
3151     for(i=0; i<64; i++){
3152         if(!c->put_2tap_qpel_pixels_tab[0][i])
3153             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3154         if(!c->avg_2tap_qpel_pixels_tab[0][i])
3155             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3156     }
3157
/* Build the IDCT coefficient scan permutation for the selected IDCT. */
3158     ff_init_scantable_permutation(c->idct_permutation,
3159                                   c->idct_permutation_type);
3160 }