/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "mpegvideo.h"
#include "config.h"
#include "lpc.h"
#include "ac3dec.h"
#include "vorbis.h"
#include "png.h"

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
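/* ~0UL/255 evaluates to 0x0101...01 (0x01 in every byte of the native word),
 * so e.g. pb_7f is 0x7f replicated into each byte: 0x7f7f7f7f on a 32-bit
 * target, 0x7f7f7f7f7f7f7f7f on a 64-bit one. */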

const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* inverse of zigzag_direct (without IDCT permutation), plus 1, for the MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];

const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256;
 * for a>16909558 the result overestimates a/b by less than 1 part in 1<<24 */
const uint32_t ff_inverse[257]={
          0, 4294967295U, 2147483648U, 1431655766, 1073741824,  858993460,  715827883,  613566757,
  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
   16777216
};
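
/* Worked example: a/b == a*ff_inverse[b] >> 32; with b == 3 and a == 1000,
 * 1000 * 1431655766 == 1431655766000, and >> 32 gives 333, i.e. 1000/3. */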

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};

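/* Fill st->permutated with the scan reordered through the IDCT input
 * permutation; st->raster_end[i] records the highest permuted position
 * among the first i+1 scan entries. */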
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

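/* Sum of squared samples of a 16x16 block. ff_squareTbl is biased by 256
 * so that the same table also serves the sse*_c functions below, where the
 * index pix1[i] - pix2[i] can be negative. */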
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}

/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                         int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
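
/* Hypothetical usage sketch (values are illustrative, not from a caller in
 * this file): fetching a 16x16 block whose top-left corner lies at (-4, -4)
 * in a 352x288 plane; the out-of-picture area is filled by replicating the
 * first row/column:
 *     ff_emulated_edge_mc(tmp, src - 4 - 4*linesize, linesize,
 *                         16, 16, -4, -4, 352, 288);
 */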

static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* copy the block values to the pixels, clamped to 0..255 */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* copy the block values to the pixels, clamped to 0..255 */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* copy the block values to the pixels, clamped to 0..255 */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int i;

    /* copy the block values to the pixels without clamping */
    for(i=0;i<8;i++) {
        pixels[0] = block[0];
        pixels[1] = block[1];
        pixels[2] = block[2];
        pixels[3] = block[3];
        pixels[4] = block[4];
        pixels[5] = block[5];
        pixels[6] = block[6];
        pixels[7] = block[7];

        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* add the block to the pixels, clamping the result */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* add the block to the pixels, clamping the result */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* add the block to the pixels, clamping the result */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}

static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}

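/* 2x nearest-neighbour block upscaler: src[i] * 0x0101 duplicates each byte
 * into both halves of a uint16_t, and every source row is written to two
 * destination rows (dst1/dst2). */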
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;
    uint16_t *dst1 = (uint16_t *) dst;
    uint16_t *dst2 = (uint16_t *)(dst + linesize);

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            dst1[i] = dst2[i] = src[i] * 0x0101;
        }
        src  += 8;
        dst1 += linesize;
        dst2 += linesize;
    }
}

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= AV_RN64(pixels  );\
    const uint64_t b= AV_RN64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= AV_RN64(pixels  );\
        uint64_t b= AV_RN64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN64(pixels  );\
        b= AV_RN64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= AV_RN64(pixels  );\
    const uint64_t b= AV_RN64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= AV_RN64(pixels  );\
        uint64_t b= AV_RN64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN64(pixels  );\
        b= AV_RN64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                                         int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                                                int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                                          int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                                                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i, a0, b0, a1, b1;\
    a0= pixels[0];\
    b0= pixels[1] + 2;\
    a0 += b0;\
    b0 += pixels[2];\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        a1= pixels[0];\
        b1= pixels[1];\
        a1 += b1;\
        b1 += pixels[2];\
\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
\
        pixels+=line_size;\
        block +=line_size;\
\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint32_t a= AV_RN32(pixels  );\
    const uint32_t b= AV_RN32(pixels+1);\
    uint32_t l0=  (a&0x03030303UL)\
                + (b&0x03030303UL)\
                + 0x02020202UL;\
    uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
    uint32_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint32_t a= AV_RN32(pixels  );\
        uint32_t b= AV_RN32(pixels+1);\
        l1=  (a&0x03030303UL)\
           + (b&0x03030303UL);\
        h1= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN32(pixels  );\
        b= AV_RN32(pixels+1);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c    , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
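/* rnd_avg32() is the 32-bit analogue of the 64-bit trick above:
 * (a|b) - (((a^b)&0xFEFEFEFE)>>1) computes a per-byte (a+b+1)>>1 without
 * carries crossing byte boundaries. */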
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

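    /* A+B+C+D == 256 for any (x16, y16), so the >>8 below renormalizes the
     * bilinear blend; e.g. x16 == y16 == 8 gives four equal weights of 64. */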
    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}

void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

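    /* ox/oy (and vx/vy) carry 16 extra fractional bits on top of the
     * 1<<shift sub-pel resolution: the >>16 below drops the extra precision
     * and the remaining low 'shift' bits select the bilinear sub-pel
     * fraction. */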
    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= (  (  src[index  ]*(s-frac_x)
                                           + src[index+1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index       ]*(s-frac_y)
                                           + src[index+stride]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

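/* Thirdpel interpolation (SVQ3). The magic constants approximate division:
 * 683 ~= (1<<11)/3 and 2731 ~= (1<<15)/12, so e.g.
 * (683*(2*a + b + 1)) >> 11 computes (2*a + b)/3, rounded, for 8-bit
 * samples. */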
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
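
/* The H.264 chroma bilinear weights always satisfy A+B+C+D == 64, matched
 * by the (+32)>>6 rounding in op_put/op_avg below; when x or y is 0
 * (D == 0) the 2-D blend degenerates into the 1-D E/step fast path. */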

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put

static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

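    /* VC-1 "no rounding" mode: the rounding bias is 32 - 4 == 28 instead of
     * the usual 32. */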
    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}

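/* MPEG-4 quarter-pel half-sample filter: an 8-tap kernel with taps
 * (-1, 3, -6, 20, 20, -6, 3, -1) (sum 32, normalized by the rounding in
 * OP); samples beyond the block edge are mirrored, hence the repeated
 * indices in the first and last rows/columns. */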
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1754 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1755 int i;\
1756 const int w=16;\
1757 for(i=0; i<w; i++)\
1758 {\
1759 const int src0= src[0*srcStride];\
1760 const int src1= src[1*srcStride];\
1761 const int src2= src[2*srcStride];\
1762 const int src3= src[3*srcStride];\
1763 const int src4= src[4*srcStride];\
1764 const int src5= src[5*srcStride];\
1765 const int src6= src[6*srcStride];\
1766 const int src7= src[7*srcStride];\
1767 const int src8= src[8*srcStride];\
1768 const int src9= src[9*srcStride];\
1769 const int src10= src[10*srcStride];\
1770 const int src11= src[11*srcStride];\
1771 const int src12= src[12*srcStride];\
1772 const int src13= src[13*srcStride];\
1773 const int src14= src[14*srcStride];\
1774 const int src15= src[15*srcStride];\
1775 const int src16= src[16*srcStride];\
1776 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1777 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1778 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1779 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1780 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1781 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1782 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1783 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1784 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1785 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1786 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1787 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1788 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1789 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1790 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1791 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1792 dst++;\
1793 src++;\
1794 }\
1795 }\
1796 \
1797 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1798 OPNAME ## pixels8_c(dst, src, stride, 8);\
1799 }\
1800 \
1801 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1802 uint8_t half[64];\
1803 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1804 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1805 }\
1806 \
1807 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1808 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1809 }\
1810 \
1811 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1812 uint8_t half[64];\
1813 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1814 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1815 }\
1816 \
1817 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1818 uint8_t full[16*9];\
1819 uint8_t half[64];\
1820 copy_block9(full, src, 16, stride, 9);\
1821 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1822 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1823 }\
1824 \
1825 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1826 uint8_t full[16*9];\
1827 copy_block9(full, src, 16, stride, 9);\
1828 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1829 }\
1830 \
1831 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1832 uint8_t full[16*9];\
1833 uint8_t half[64];\
1834 copy_block9(full, src, 16, stride, 9);\
1835 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1836 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1837 }\
1838 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1839 uint8_t full[16*9];\
1840 uint8_t halfH[72];\
1841 uint8_t halfV[64];\
1842 uint8_t halfHV[64];\
1843 copy_block9(full, src, 16, stride, 9);\
1844 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1845 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1846 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1847 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1848 }\
1849 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1850 uint8_t full[16*9];\
1851 uint8_t halfH[72];\
1852 uint8_t halfHV[64];\
1853 copy_block9(full, src, 16, stride, 9);\
1854 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1855 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1856 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1857 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1858 }\
1859 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1860 uint8_t full[16*9];\
1861 uint8_t halfH[72];\
1862 uint8_t halfV[64];\
1863 uint8_t halfHV[64];\
1864 copy_block9(full, src, 16, stride, 9);\
1865 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1866 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1867 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1868 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1869 }\
1870 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1871 uint8_t full[16*9];\
1872 uint8_t halfH[72];\
1873 uint8_t halfHV[64];\
1874 copy_block9(full, src, 16, stride, 9);\
1875 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1876 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1877 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1878 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1879 }\
1880 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1881 uint8_t full[16*9];\
1882 uint8_t halfH[72];\
1883 uint8_t halfV[64];\
1884 uint8_t halfHV[64];\
1885 copy_block9(full, src, 16, stride, 9);\
1886 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1887 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1888 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1889 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1890 }\
1891 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1892 uint8_t full[16*9];\
1893 uint8_t halfH[72];\
1894 uint8_t halfHV[64];\
1895 copy_block9(full, src, 16, stride, 9);\
1896 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1897 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1898 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1899 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1900 }\
1901 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1902 uint8_t full[16*9];\
1903 uint8_t halfH[72];\
1904 uint8_t halfV[64];\
1905 uint8_t halfHV[64];\
1906 copy_block9(full, src, 16, stride, 9);\
1907 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1908 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1909 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1910 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1911 }\
1912 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1913 uint8_t full[16*9];\
1914 uint8_t halfH[72];\
1915 uint8_t halfHV[64];\
1916 copy_block9(full, src, 16, stride, 9);\
1917 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1918 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1919 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1920 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1921 }\
1922 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1923 uint8_t halfH[72];\
1924 uint8_t halfHV[64];\
1925 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1926 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1927 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1928 }\
1929 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1930 uint8_t halfH[72];\
1931 uint8_t halfHV[64];\
1932 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1933 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1934 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1935 }\
1936 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1937 uint8_t full[16*9];\
1938 uint8_t halfH[72];\
1939 uint8_t halfV[64];\
1940 uint8_t halfHV[64];\
1941 copy_block9(full, src, 16, stride, 9);\
1942 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1943 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1944 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1945 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1946 }\
1947 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1948 uint8_t full[16*9];\
1949 uint8_t halfH[72];\
1950 copy_block9(full, src, 16, stride, 9);\
1951 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1952 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1953 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1954 }\
1955 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1956 uint8_t full[16*9];\
1957 uint8_t halfH[72];\
1958 uint8_t halfV[64];\
1959 uint8_t halfHV[64];\
1960 copy_block9(full, src, 16, stride, 9);\
1961 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1962 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1963 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1964 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1965 }\
1966 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1967 uint8_t full[16*9];\
1968 uint8_t halfH[72];\
1969 copy_block9(full, src, 16, stride, 9);\
1970 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1971 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1972 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1973 }\
1974 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1975 uint8_t halfH[72];\
1976 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1977 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1978 }\
1979 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1980 OPNAME ## pixels16_c(dst, src, stride, 16);\
1981 }\
1982 \
1983 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1984 uint8_t half[256];\
1985 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1986 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1987 }\
1988 \
1989 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1990 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1991 }\
1992 \
1993 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1994 uint8_t half[256];\
1995 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1996 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1997 }\
1998 \
1999 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2000 uint8_t full[24*17];\
2001 uint8_t half[256];\
2002 copy_block17(full, src, 24, stride, 17);\
2003 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2004 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2005 }\
2006 \
2007 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2008 uint8_t full[24*17];\
2009 copy_block17(full, src, 24, stride, 17);\
2010 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2011 }\
2012 \
2013 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2014 uint8_t full[24*17];\
2015 uint8_t half[256];\
2016 copy_block17(full, src, 24, stride, 17);\
2017 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2018 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2019 }\
2020 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2021 uint8_t full[24*17];\
2022 uint8_t halfH[272];\
2023 uint8_t halfV[256];\
2024 uint8_t halfHV[256];\
2025 copy_block17(full, src, 24, stride, 17);\
2026 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2027 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2028 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2029 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2030 }\
2031 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2032 uint8_t full[24*17];\
2033 uint8_t halfH[272];\
2034 uint8_t halfHV[256];\
2035 copy_block17(full, src, 24, stride, 17);\
2036 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2037 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2038 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2039 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2040 }\
2041 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2042 uint8_t full[24*17];\
2043 uint8_t halfH[272];\
2044 uint8_t halfV[256];\
2045 uint8_t halfHV[256];\
2046 copy_block17(full, src, 24, stride, 17);\
2047 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2048 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2049 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2050 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2051 }\
2052 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2053 uint8_t full[24*17];\
2054 uint8_t halfH[272];\
2055 uint8_t halfHV[256];\
2056 copy_block17(full, src, 24, stride, 17);\
2057 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2058 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2059 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2060 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2061 }\
2062 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2063 uint8_t full[24*17];\
2064 uint8_t halfH[272];\
2065 uint8_t halfV[256];\
2066 uint8_t halfHV[256];\
2067 copy_block17(full, src, 24, stride, 17);\
2068 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2069 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2070 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2071 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2072 }\
2073 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2074 uint8_t full[24*17];\
2075 uint8_t halfH[272];\
2076 uint8_t halfHV[256];\
2077 copy_block17(full, src, 24, stride, 17);\
2078 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2079 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2080 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2081 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2082 }\
2083 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2084 uint8_t full[24*17];\
2085 uint8_t halfH[272];\
2086 uint8_t halfV[256];\
2087 uint8_t halfHV[256];\
2088 copy_block17(full, src, 24, stride, 17);\
2089 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2090 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2091 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2092 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2093 }\
2094 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2095 uint8_t full[24*17];\
2096 uint8_t halfH[272];\
2097 uint8_t halfHV[256];\
2098 copy_block17(full, src, 24, stride, 17);\
2099 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2100 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2101 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2102 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2103 }\
2104 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2105 uint8_t halfH[272];\
2106 uint8_t halfHV[256];\
2107 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2108 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2109 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2110 }\
2111 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2112 uint8_t halfH[272];\
2113 uint8_t halfHV[256];\
2114 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2115 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2116 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2117 }\
2118 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2119 uint8_t full[24*17];\
2120 uint8_t halfH[272];\
2121 uint8_t halfV[256];\
2122 uint8_t halfHV[256];\
2123 copy_block17(full, src, 24, stride, 17);\
2124 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2125 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2126 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2127 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2128 }\
2129 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2130 uint8_t full[24*17];\
2131 uint8_t halfH[272];\
2132 copy_block17(full, src, 24, stride, 17);\
2133 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2134 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2135 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2136 }\
2137 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2138 uint8_t full[24*17];\
2139 uint8_t halfH[272];\
2140 uint8_t halfV[256];\
2141 uint8_t halfHV[256];\
2142 copy_block17(full, src, 24, stride, 17);\
2143 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2144 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2145 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2146 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2147 }\
2148 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2149 uint8_t full[24*17];\
2150 uint8_t halfH[272];\
2151 copy_block17(full, src, 24, stride, 17);\
2152 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2153 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2154 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2155 }\
2156 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2157 uint8_t halfH[272];\
2158 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2159 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2160 }
2161
2162 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2163 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2164 #define op_put(a, b) a = cm[((b) + 16)>>5]
2165 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2166
2167 QPEL_MC(0, put_ , _ , op_put)
2168 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2169 QPEL_MC(0, avg_ , _ , op_avg)
2170 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2171 #undef op_avg
2172 #undef op_avg_no_rnd
2173 #undef op_put
2174 #undef op_put_no_rnd
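/* QPEL_MC expands the four op_* variants above into the complete set of
 * MPEG-4 quarter-pel functions. The (20,-6,3,-1) filter has a DC gain of
 * 2*(20-6+3-1) = 32, so op_put rounds with (b+16)>>5 while the no_rnd ops
 * add only 15, implementing the rounding-control bit. The mcXY suffix
 * encodes the quarter-pel position (X,Y in 0..3); intermediate positions
 * are built by blending h/v lowpass planes with pixels8/16_l2(), and the
 * ff_*_old_c l4 versions appear to be kept only for reference. */
#if 0
/* Illustration only (never compiled): selecting the (x=1,y=3) position
 * from DSPContext, assuming the usual dxy = (y<<2)|x indexing and
 * tab[1] = 8x8. */
static void qpel_example(DSPContext *c, uint8_t *dst, uint8_t *src, int stride)
{
    c->put_qpel_pixels_tab[1][(3<<2)|1](dst, src, stride);
}
#endif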
2175
2176 #if 1
2177 #define H264_LOWPASS(OPNAME, OP, OP2) \
2178 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2179 const int h=2;\
2180 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2181 int i;\
2182 for(i=0; i<h; i++)\
2183 {\
2184 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2185 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2186 dst+=dstStride;\
2187 src+=srcStride;\
2188 }\
2189 }\
2190 \
2191 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2192 const int w=2;\
2193 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2194 int i;\
2195 for(i=0; i<w; i++)\
2196 {\
2197 const int srcB= src[-2*srcStride];\
2198 const int srcA= src[-1*srcStride];\
2199 const int src0= src[0 *srcStride];\
2200 const int src1= src[1 *srcStride];\
2201 const int src2= src[2 *srcStride];\
2202 const int src3= src[3 *srcStride];\
2203 const int src4= src[4 *srcStride];\
2204 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2205 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2206 dst++;\
2207 src++;\
2208 }\
2209 }\
2210 \
2211 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2212 const int h=2;\
2213 const int w=2;\
2214 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2215 int i;\
2216 src -= 2*srcStride;\
2217 for(i=0; i<h+5; i++)\
2218 {\
2219 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2220 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2221 tmp+=tmpStride;\
2222 src+=srcStride;\
2223 }\
2224 tmp -= tmpStride*(h+5-2);\
2225 for(i=0; i<w; i++)\
2226 {\
2227 const int tmpB= tmp[-2*tmpStride];\
2228 const int tmpA= tmp[-1*tmpStride];\
2229 const int tmp0= tmp[0 *tmpStride];\
2230 const int tmp1= tmp[1 *tmpStride];\
2231 const int tmp2= tmp[2 *tmpStride];\
2232 const int tmp3= tmp[3 *tmpStride];\
2233 const int tmp4= tmp[4 *tmpStride];\
2234 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2235 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2236 dst++;\
2237 tmp++;\
2238 }\
2239 }\
2240 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2241 const int h=4;\
2242 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2243 int i;\
2244 for(i=0; i<h; i++)\
2245 {\
2246 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2247 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2248 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2249 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2250 dst+=dstStride;\
2251 src+=srcStride;\
2252 }\
2253 }\
2254 \
2255 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2256 const int w=4;\
2257 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2258 int i;\
2259 for(i=0; i<w; i++)\
2260 {\
2261 const int srcB= src[-2*srcStride];\
2262 const int srcA= src[-1*srcStride];\
2263 const int src0= src[0 *srcStride];\
2264 const int src1= src[1 *srcStride];\
2265 const int src2= src[2 *srcStride];\
2266 const int src3= src[3 *srcStride];\
2267 const int src4= src[4 *srcStride];\
2268 const int src5= src[5 *srcStride];\
2269 const int src6= src[6 *srcStride];\
2270 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2271 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2272 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2273 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2274 dst++;\
2275 src++;\
2276 }\
2277 }\
2278 \
2279 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2280 const int h=4;\
2281 const int w=4;\
2282 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2283 int i;\
2284 src -= 2*srcStride;\
2285 for(i=0; i<h+5; i++)\
2286 {\
2287 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2288 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2289 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2290 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2291 tmp+=tmpStride;\
2292 src+=srcStride;\
2293 }\
2294 tmp -= tmpStride*(h+5-2);\
2295 for(i=0; i<w; i++)\
2296 {\
2297 const int tmpB= tmp[-2*tmpStride];\
2298 const int tmpA= tmp[-1*tmpStride];\
2299 const int tmp0= tmp[0 *tmpStride];\
2300 const int tmp1= tmp[1 *tmpStride];\
2301 const int tmp2= tmp[2 *tmpStride];\
2302 const int tmp3= tmp[3 *tmpStride];\
2303 const int tmp4= tmp[4 *tmpStride];\
2304 const int tmp5= tmp[5 *tmpStride];\
2305 const int tmp6= tmp[6 *tmpStride];\
2306 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2307 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2308 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2309 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2310 dst++;\
2311 tmp++;\
2312 }\
2313 }\
2314 \
2315 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2316 const int h=8;\
2317 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2318 int i;\
2319 for(i=0; i<h; i++)\
2320 {\
2321 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2322 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2323 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2324 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2325 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2326 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2327 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2328 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2329 dst+=dstStride;\
2330 src+=srcStride;\
2331 }\
2332 }\
2333 \
2334 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2335 const int w=8;\
2336 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2337 int i;\
2338 for(i=0; i<w; i++)\
2339 {\
2340 const int srcB= src[-2*srcStride];\
2341 const int srcA= src[-1*srcStride];\
2342 const int src0= src[0 *srcStride];\
2343 const int src1= src[1 *srcStride];\
2344 const int src2= src[2 *srcStride];\
2345 const int src3= src[3 *srcStride];\
2346 const int src4= src[4 *srcStride];\
2347 const int src5= src[5 *srcStride];\
2348 const int src6= src[6 *srcStride];\
2349 const int src7= src[7 *srcStride];\
2350 const int src8= src[8 *srcStride];\
2351 const int src9= src[9 *srcStride];\
2352 const int src10=src[10*srcStride];\
2353 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2354 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2355 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2356 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2357 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2358 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2359 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2360 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2361 dst++;\
2362 src++;\
2363 }\
2364 }\
2365 \
2366 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2367 const int h=8;\
2368 const int w=8;\
2369 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2370 int i;\
2371 src -= 2*srcStride;\
2372 for(i=0; i<h+5; i++)\
2373 {\
2374 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2375 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2376 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2377 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2378 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2379 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2380 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2381 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2382 tmp+=tmpStride;\
2383 src+=srcStride;\
2384 }\
2385 tmp -= tmpStride*(h+5-2);\
2386 for(i=0; i<w; i++)\
2387 {\
2388 const int tmpB= tmp[-2*tmpStride];\
2389 const int tmpA= tmp[-1*tmpStride];\
2390 const int tmp0= tmp[0 *tmpStride];\
2391 const int tmp1= tmp[1 *tmpStride];\
2392 const int tmp2= tmp[2 *tmpStride];\
2393 const int tmp3= tmp[3 *tmpStride];\
2394 const int tmp4= tmp[4 *tmpStride];\
2395 const int tmp5= tmp[5 *tmpStride];\
2396 const int tmp6= tmp[6 *tmpStride];\
2397 const int tmp7= tmp[7 *tmpStride];\
2398 const int tmp8= tmp[8 *tmpStride];\
2399 const int tmp9= tmp[9 *tmpStride];\
2400 const int tmp10=tmp[10*tmpStride];\
2401 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2402 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2403 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2404 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2405 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2406 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2407 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2408 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2409 dst++;\
2410 tmp++;\
2411 }\
2412 }\
2413 \
2414 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2415 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2416 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2417 src += 8*srcStride;\
2418 dst += 8*dstStride;\
2419 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2420 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2421 }\
2422 \
2423 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2424 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2425 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2426 src += 8*srcStride;\
2427 dst += 8*dstStride;\
2428 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2429 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2430 }\
2431 \
2432 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2433 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2434 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2435 src += 8*srcStride;\
2436 dst += 8*dstStride;\
2437 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2438 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2439 }\
2440
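/* H264_LOWPASS generates the H.264 six-tap (1,-5,20,20,-5,1) half-pel
 * filters for 2/4/8-wide blocks (the 16-wide versions above delegate to
 * four 8x8 calls). The _hv_ variants keep the horizontal pass in a 16-bit
 * tmp buffer and filter it vertically with OP2, whose rounding accounts
 * for the combined two-pass gain of 32*32 = 1024. */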
2441 #define H264_MC(OPNAME, SIZE) \
2442 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2443 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2444 }\
2445 \
2446 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2447 uint8_t half[SIZE*SIZE];\
2448 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2449 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2450 }\
2451 \
2452 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2453 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2454 }\
2455 \
2456 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2457 uint8_t half[SIZE*SIZE];\
2458 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2459 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2460 }\
2461 \
2462 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2463 uint8_t full[SIZE*(SIZE+5)];\
2464 uint8_t * const full_mid= full + SIZE*2;\
2465 uint8_t half[SIZE*SIZE];\
2466 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2467 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2468 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2469 }\
2470 \
2471 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2472 uint8_t full[SIZE*(SIZE+5)];\
2473 uint8_t * const full_mid= full + SIZE*2;\
2474 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2475 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2476 }\
2477 \
2478 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2479 uint8_t full[SIZE*(SIZE+5)];\
2480 uint8_t * const full_mid= full + SIZE*2;\
2481 uint8_t half[SIZE*SIZE];\
2482 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2483 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2484 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2485 }\
2486 \
2487 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2488 uint8_t full[SIZE*(SIZE+5)];\
2489 uint8_t * const full_mid= full + SIZE*2;\
2490 uint8_t halfH[SIZE*SIZE];\
2491 uint8_t halfV[SIZE*SIZE];\
2492 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2493 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2494 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2495 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2496 }\
2497 \
2498 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2499 uint8_t full[SIZE*(SIZE+5)];\
2500 uint8_t * const full_mid= full + SIZE*2;\
2501 uint8_t halfH[SIZE*SIZE];\
2502 uint8_t halfV[SIZE*SIZE];\
2503 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2504 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2505 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2506 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2507 }\
2508 \
2509 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2510 uint8_t full[SIZE*(SIZE+5)];\
2511 uint8_t * const full_mid= full + SIZE*2;\
2512 uint8_t halfH[SIZE*SIZE];\
2513 uint8_t halfV[SIZE*SIZE];\
2514 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2515 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2516 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2517 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2518 }\
2519 \
2520 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2521 uint8_t full[SIZE*(SIZE+5)];\
2522 uint8_t * const full_mid= full + SIZE*2;\
2523 uint8_t halfH[SIZE*SIZE];\
2524 uint8_t halfV[SIZE*SIZE];\
2525 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2526 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2527 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2528 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2529 }\
2530 \
2531 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2532 int16_t tmp[SIZE*(SIZE+5)];\
2533 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2534 }\
2535 \
2536 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2537 int16_t tmp[SIZE*(SIZE+5)];\
2538 uint8_t halfH[SIZE*SIZE];\
2539 uint8_t halfHV[SIZE*SIZE];\
2540 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2541 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2542 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2543 }\
2544 \
2545 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2546 int16_t tmp[SIZE*(SIZE+5)];\
2547 uint8_t halfH[SIZE*SIZE];\
2548 uint8_t halfHV[SIZE*SIZE];\
2549 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2550 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2551 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2552 }\
2553 \
2554 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2555 uint8_t full[SIZE*(SIZE+5)];\
2556 uint8_t * const full_mid= full + SIZE*2;\
2557 int16_t tmp[SIZE*(SIZE+5)];\
2558 uint8_t halfV[SIZE*SIZE];\
2559 uint8_t halfHV[SIZE*SIZE];\
2560 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2561 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2562 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2563 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2564 }\
2565 \
2566 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2567 uint8_t full[SIZE*(SIZE+5)];\
2568 uint8_t * const full_mid= full + SIZE*2;\
2569 int16_t tmp[SIZE*(SIZE+5)];\
2570 uint8_t halfV[SIZE*SIZE];\
2571 uint8_t halfHV[SIZE*SIZE];\
2572 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2573 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2574 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2575 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2576 }\
2577
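/* H264_MC derives all 16 quarter-pel positions per block size from those
 * primitives: mc00 is a plain copy, mc10/mc30 and mc01/mc03 average a
 * half-pel plane with the nearest full-pel samples, mc22 is the pure 2-D
 * filter, and the remaining positions average two half-pel planes,
 * following H.264's fractional-sample interpolation rules. */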
2578 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2579 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2580 #define op_put(a, b) a = cm[((b) + 16)>>5]
2581 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2582 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2583
2584 H264_LOWPASS(put_ , op_put, op2_put)
2585 H264_LOWPASS(avg_ , op_avg, op2_avg)
2586 H264_MC(put_, 2)
2587 H264_MC(put_, 4)
2588 H264_MC(put_, 8)
2589 H264_MC(put_, 16)
2590 H264_MC(avg_, 4)
2591 H264_MC(avg_, 8)
2592 H264_MC(avg_, 16)
2593
2594 #undef op_avg
2595 #undef op_put
2596 #undef op2_avg
2597 #undef op2_put
2598 #endif
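/* Rounding of the ops used above: op_put/op_avg handle the single-pass
 * result with (b+16)>>5 (gain 32), op2_put/op2_avg the two-pass hv result
 * with (b+512)>>10 (gain 1024). Sanity check on a flat area of value v:
 * one pass yields 32*v and both passes 1024*v, so each op returns v. */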
2599
2600 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2601 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2602 #define H264_WEIGHT(W,H) \
2603 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2604 int y; \
2605 offset <<= log2_denom; \
2606 if(log2_denom) offset += 1<<(log2_denom-1); \
2607 for(y=0; y<H; y++, block += stride){ \
2608 op_scale1(0); \
2609 op_scale1(1); \
2610 if(W==2) continue; \
2611 op_scale1(2); \
2612 op_scale1(3); \
2613 if(W==4) continue; \
2614 op_scale1(4); \
2615 op_scale1(5); \
2616 op_scale1(6); \
2617 op_scale1(7); \
2618 if(W==8) continue; \
2619 op_scale1(8); \
2620 op_scale1(9); \
2621 op_scale1(10); \
2622 op_scale1(11); \
2623 op_scale1(12); \
2624 op_scale1(13); \
2625 op_scale1(14); \
2626 op_scale1(15); \
2627 } \
2628 } \
2629 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2630 int y; \
2631 offset = ((offset + 1) | 1) << log2_denom; \
2632 for(y=0; y<H; y++, dst += stride, src += stride){ \
2633 op_scale2(0); \
2634 op_scale2(1); \
2635 if(W==2) continue; \
2636 op_scale2(2); \
2637 op_scale2(3); \
2638 if(W==4) continue; \
2639 op_scale2(4); \
2640 op_scale2(5); \
2641 op_scale2(6); \
2642 op_scale2(7); \
2643 if(W==8) continue; \
2644 op_scale2(8); \
2645 op_scale2(9); \
2646 op_scale2(10); \
2647 op_scale2(11); \
2648 op_scale2(12); \
2649 op_scale2(13); \
2650 op_scale2(14); \
2651 op_scale2(15); \
2652 } \
2653 }
2654
2655 H264_WEIGHT(16,16)
2656 H264_WEIGHT(16,8)
2657 H264_WEIGHT(8,16)
2658 H264_WEIGHT(8,8)
2659 H264_WEIGHT(8,4)
2660 H264_WEIGHT(4,8)
2661 H264_WEIGHT(4,4)
2662 H264_WEIGHT(4,2)
2663 H264_WEIGHT(2,4)
2664 H264_WEIGHT(2,2)
2665
2666 #undef op_scale1
2667 #undef op_scale2
2668 #undef H264_WEIGHT
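/* H.264 explicit weighted prediction. op_scale1 computes
 * clip(((pix*weight) >> log2_denom) + offset) with the offset and the
 * 1<<(log2_denom-1) rounding term pre-folded into a single shift; with
 * weight = 1<<log2_denom and offset = 0 it is an exact no-op. The
 * biweight variant blends two sources as
 * clip((src*weights + dst*weightd + offset') >> (log2_denom+1)), where
 * offset' = ((offset+1)|1) << log2_denom folds in both the rounding term
 * and the averaged offsets. */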
2669
2670 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2671 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2672 int i;
2673
2674 for(i=0; i<h; i++){
2675 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2676 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2677 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2678 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2679 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2680 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2681 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2682 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2683 dst+=dstStride;
2684 src+=srcStride;
2685 }
2686 }
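/* WMV2 half-pel interpolation uses a shorter four-tap (-1,9,9,-1)/16
 * filter; the +8 bias rounds to nearest before the >>4. */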
2687
2688 #if CONFIG_CAVS_DECODER
2689 /* AVS specific */
2690 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2691 put_pixels8_c(dst, src, stride, 8);
2692 }
2693 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2694 avg_pixels8_c(dst, src, stride, 8);
2695 }
2696 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2697 put_pixels16_c(dst, src, stride, 16);
2698 }
2699 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2700 avg_pixels16_c(dst, src, stride, 16);
2701 }
2702 #endif /* CONFIG_CAVS_DECODER */
2703
2704 #if CONFIG_VC1_DECODER
2705 /* VC-1 specific */
2706 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2707 put_pixels8_c(dst, src, stride, 8);
2708 }
2709 void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2710 avg_pixels8_c(dst, src, stride, 8);
2711 }
2712 #endif /* CONFIG_VC1_DECODER */
2713
2714 /* H264 specific */
2715 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2716
2717 #if CONFIG_RV40_DECODER
2718 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2719 put_pixels16_xy2_c(dst, src, stride, 16);
2720 }
2721 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2722 avg_pixels16_xy2_c(dst, src, stride, 16);
2723 }
2724 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2725 put_pixels8_xy2_c(dst, src, stride, 8);
2726 }
2727 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2728 avg_pixels8_xy2_c(dst, src, stride, 8);
2729 }
2730 #endif /* CONFIG_RV40_DECODER */
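/* The CAVS and VC-1 mc00 entry points above are full-pel positions and so
 * degenerate to plain copies/averages; RV40's (3,3) position is likewise
 * served by the generic 2x2-averaging *_pixels*_xy2_c helpers instead of
 * RV40's own filters. */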
2731
2732 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2733 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2734 int i;
2735
2736 for(i=0; i<w; i++){
2737 const int src_1= src[ -srcStride];
2738 const int src0 = src[0 ];
2739 const int src1 = src[ srcStride];
2740 const int src2 = src[2*srcStride];
2741 const int src3 = src[3*srcStride];
2742 const int src4 = src[4*srcStride];
2743 const int src5 = src[5*srcStride];
2744 const int src6 = src[6*srcStride];
2745 const int src7 = src[7*srcStride];
2746 const int src8 = src[8*srcStride];
2747 const int src9 = src[9*srcStride];
2748 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2749 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2750 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2751 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2752 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2753 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2754 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2755 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2756 src++;
2757 dst++;
2758 }
2759 }
2760
2761 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2762 put_pixels8_c(dst, src, stride, 8);
2763 }
2764
2765 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2766 uint8_t half[64];
2767 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2768 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2769 }
2770
2771 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2772 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2773 }
2774
2775 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2776 uint8_t half[64];
2777 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2778 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2779 }
2780
2781 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2782 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2783 }
2784
2785 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2786 uint8_t halfH[88];
2787 uint8_t halfV[64];
2788 uint8_t halfHV[64];
2789 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2790 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2791 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2792 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2793 }
2794 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2795 uint8_t halfH[88];
2796 uint8_t halfV[64];
2797 uint8_t halfHV[64];
2798 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2799 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2800 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2801 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2802 }
2803 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2804 uint8_t halfH[88];
2805 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2806 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2807 }
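/* The put_mspel8_mcXY wrappers follow the qpel naming scheme. For the
 * vertical cases the horizontal pass is run first into halfH, sized
 * 8 columns x 11 rows (hence halfH[88]) starting one row above the block,
 * so the vertical filter launched from halfH+8 (the block's first row)
 * can reach the row above and the two rows below. */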
2808
2809 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2810 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2811 int x;
2812 const int strength= ff_h263_loop_filter_strength[qscale];
2813
2814 for(x=0; x<8; x++){
2815 int d1, d2, ad1;
2816 int p0= src[x-2*stride];
2817 int p1= src[x-1*stride];
2818 int p2= src[x+0*stride];
2819 int p3= src[x+1*stride];
2820 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2821
2822 if (d<-2*strength) d1= 0;
2823 else if(d<- strength) d1=-2*strength - d;
2824 else if(d< strength) d1= d;
2825 else if(d< 2*strength) d1= 2*strength - d;
2826 else d1= 0;
2827
2828 p1 += d1;
2829 p2 -= d1;
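            /* branchless clip to 0..255: after adding d1, bit 8 is set iff
               the 9-bit result over- or underflowed; ~(p>>31) is 0 for
               negative p and ~0 (255 as uint8_t) for positive overflow */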
2830 if(p1&256) p1= ~(p1>>31);
2831 if(p2&256) p2= ~(p2>>31);
2832
2833 src[x-1*stride] = p1;
2834 src[x+0*stride] = p2;
2835
2836 ad1= FFABS(d1)>>1;
2837
2838 d2= av_clip((p0-p3)/4, -ad1, ad1);
2839
2840 src[x-2*stride] = p0 - d2;
2841 src[x+ stride] = p3 + d2;
2842 }
2843 }
2844 }
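/* H.263 Annex J deblocking: d measures the step across the block edge and
 * d1 follows the spec's ramp, growing up to `strength` and then decaying
 * to zero at 2*strength so that strong real edges are left untouched,
 * while d2 applies a smaller, clipped correction to the outer pixels
 * p0/p3. */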
2845
2846 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2847 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2848 int y;
2849 const int strength= ff_h263_loop_filter_strength[qscale];
2850
2851 for(y=0; y<8; y++){
2852 int d1, d2, ad1;
2853 int p0= src[y*stride-2];
2854 int p1= src[y*stride-1];
2855 int p2= src[y*stride+0];
2856 int p3= src[y*stride+1];
2857 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2858
2859 if (d<-2*strength) d1= 0;
2860 else if(d<- strength) d1=-2*strength - d;
2861 else if(d< strength) d1= d;
2862 else if(d< 2*strength) d1= 2*strength - d;
2863 else d1= 0;
2864
2865 p1 += d1;
2866 p2 -= d1;
2867 if(p1&256) p1= ~(p1>>31);
2868 if(p2&256) p2= ~(p2>>31);
2869
2870 src[y*stride-1] = p1;
2871 src[y*stride+0] = p2;
2872
2873 ad1= FFABS(d1)>>1;
2874
2875 d2= av_clip((p0-p3)/4, -ad1, ad1);
2876
2877 src[y*stride-2] = p0 - d2;
2878 src[y*stride+1] = p3 + d2;
2879 }
2880 }
2881 }
2882
2883 static void h261_loop_filter_c(uint8_t *src, int stride){
2884 int x,y,xy,yz;
2885 int temp[64];
2886
2887 for(x=0; x<8; x++){
2888 temp[x ] = 4*src[x ];
2889 temp[x + 7*8] = 4*src[x + 7*stride];
2890 }
2891 for(y=1; y<7; y++){
2892 for(x=0; x<8; x++){
2893 xy = y * stride + x;
2894 yz = y * 8 + x;
2895 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2896 }
2897 }
2898
2899 for(y=0; y<8; y++){
2900 src[ y*stride] = (temp[ y*8] + 2)>>2;
2901 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2902 for(x=1; x<7; x++){
2903 xy = y * stride + x;
2904 yz = y * 8 + x;
2905 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
2906 }
2907 }
2908 }
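/* H.261 in-loop filter: a separable [1 2 1] smoother over the 8x8 block.
 * Border rows/columns pass through unchanged (stored as 4*src, restored
 * with (t+2)>>2); interior samples get the full 2-D kernel with gain 16,
 * rounded by the +8 before the >>4. */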
2909
2910 static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2911 {
2912 int i, d;
2913 for( i = 0; i < 4; i++ ) {
2914 if( tc0[i] < 0 ) {
2915 pix += 4*ystride;
2916 continue;
2917 }
2918 for( d = 0; d < 4; d++ ) {
2919 const int p0 = pix[-1*xstride];
2920 const int p1 = pix[-2*xstride];
2921 const int p2 = pix[-3*xstride];
2922 const int q0 = pix[0];
2923 const int q1 = pix[1*xstride];
2924 const int q2 = pix[2*xstride];
2925
2926 if( FFABS( p0 - q0 ) < alpha &&
2927 FFABS( p1 - p0 ) < beta &&
2928 FFABS( q1 - q0 ) < beta ) {
2929
2930 int tc = tc0[i];
2931 int i_delta;
2932
2933 if( FFABS( p2 - p0 ) < beta ) {
2934 if(tc0[i])
2935 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2936 tc++;
2937 }
2938 if( FFABS( q2 - q0 ) < beta ) {
2939 if(tc0[i])
2940 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2941 tc++;
2942 }
2943
2944 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2945 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2946 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
2947 }
2948 pix += ystride;
2949 }
2950 }
2951 }
2952 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2953 {
2954 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2955 }
2956 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2957 {
2958 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
2959 }
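/* Normal (bS < 4) H.264 luma deblocking: per 4-pixel group, tc starts at
 * tc0[i] and is incremented once for each side whose second pixel (p1 or
 * q1) is also filtered, then the edge correction is clipped to [-tc, tc].
 * The v/h wrappers merely swap xstride and ystride so one implementation
 * serves both edge directions. */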
2960
2961 static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2962 {
2963 int d;
2964 for( d = 0; d < 16; d++ ) {
2965 const int p2 = pix[-3*xstride];
2966 const int p1 = pix[-2*xstride];
2967 const int p0 = pix[-1*xstride];
2968
2969 const int q0 = pix[ 0*xstride];
2970 const int q1 = pix[ 1*xstride];
2971 const int q2 = pix[ 2*xstride];
2972
2973 if( FFABS( p0 - q0 ) < alpha &&
2974 FFABS( p1 - p0 ) < beta &&
2975 FFABS( q1 - q0 ) < beta ) {
2976
2977 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
2978 if( FFABS( p2 - p0 ) < beta)
2979 {
2980 const int p3 = pix[-4*xstride];
2981 /* p0', p1', p2' */
2982 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
2983 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
2984 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
2985 } else {
2986 /* p0' */
2987 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
2988 }
2989 if( FFABS( q2 - q0 ) < beta)
2990 {
2991 const int q3 = pix[3*xstride];
2992 /* q0', q1', q2' */
2993 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
2994 pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
2995 pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
2996 } else {
2997 /* q0' */
2998 pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
2999 }
3000 }else{
3001 /* p0', q0' */
3002 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3003 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3004 }
3005 }
3006 pix += ystride;
3007 }
3008 }
3009 static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3010 {
3011 h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
3012 }
3013 static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3014 {
3015 h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
3016 }
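/* The _intra variants implement the strong (bS = 4) filter: when the edge
 * step is small enough (|p0-q0| < (alpha>>2)+2) and a side is smooth, up
 * to three pixels on that side are rewritten with the spec's short
 * weighted averages; otherwise only p0/q0 receive the weak two-tap fix. */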
3017
3018 static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3019 {
3020 int i, d;
3021 for( i = 0; i < 4; i++ ) {
3022 const int tc = tc0[i];
3023 if( tc <= 0 ) {
3024 pix += 2*ystride;
3025 continue;
3026 }
3027 for( d = 0; d < 2; d++ ) {
3028 const int p0 = pix[-1*xstride];
3029 const int p1 = pix[-2*xstride];
3030 const int q0 = pix[0];
3031 const int q1 = pix[1*xstride];
3032
3033 if( FFABS( p0 - q0 ) < alpha &&
3034 FFABS( p1 - p0 ) < beta &&
3035 FFABS( q1 - q0 ) < beta ) {
3036
3037 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3038
3039 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
3040 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
3041 }
3042 pix += ystride;
3043 }
3044 }
3045 }
3046 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3047 {
3048 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
3049 }
3050 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3051 {
3052 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
3053 }
3054
3055 static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3056 {
3057 int d;
3058 for( d = 0; d < 8; d++ ) {
3059 const int p0 = pix[-1*xstride];
3060 const int p1 = pix[-2*xstride];
3061 const int q0 = pix[0];
3062 const int q1 = pix[1*xstride];
3063
3064 if( FFABS( p0 - q0 ) < alpha &&
3065 FFABS( p1 - p0 ) < beta &&
3066 FFABS( q1 - q0 ) < beta ) {
3067
3068 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
3069 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
3070 }
3071 pix += ystride;
3072 }
3073 }
3074 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3075 {
3076 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3077 }
3078 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3079 {
3080 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
3081 }
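/* The chroma filters reuse the same structure with shorter kernels (two
 * pixels per side, only p0/q0 modified). The pix_abs* functions that
 * follow are the C reference SAD (sum of absolute differences) motion
 * estimation comparators; pix_abs16_c accumulates |pix1[i]-pix2[i]| over
 * a 16-pixel-wide block for h rows. */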
3082
3083 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3084 {
3085 int s, i;
3086
3087 s = 0;
3088 for(i=0;i<h;i++) {
3089 s += abs(pix1[0] - pix2[0]);
3090 s += abs(pix1[1] - pix2[1]);
3091 s += abs(pix1[2] - pix2[2]);
3092 s += abs(pix1[3] - pix2[3]);
3093 s += abs(pix1[4] - pix2[4]);
3094 s += abs(pix1[5] - pix2[5]);
3095 s += abs(pix1[6] - pix2[6]);
3096 s += abs(pix1[7] - pix2[7]);
3097 s += abs(pix1[8] - pix2[8]);
3098 s += abs(pix1[9] - pix2[9]);
3099 s += abs(pix1[10] - pix2[10]);
3100 s += abs(pix1[11] - pix2[11]);
3101 s += abs(pix1[12] - pix2[12]);