CAVS decoder by (Stefan Gehrer stefan.gehrer gmx.de)
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 */
115329f1 22
983e3246
MN
/**
 * @file dsputil.c
 * DSP utils
 */
115329f1 27
de6d9b64
FB
28#include "avcodec.h"
29#include "dsputil.h"
1457ab52 30#include "mpegvideo.h"
b0368839 31#include "simple_idct.h"
65e4c8c9 32#include "faandct.h"
059715a4 33#include "snow.h"
5596c60c 34
88730be6
MR
35/* snow.c */
36void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
37
8b69867f
MN
38uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
39uint32_t squareTbl[512] = {0, };
de6d9b64 40
/* Zigzag scan order for an 8x8 block: entry k is the raster index (row*8 + column) of the k-th scanned position. */
const uint8_t ff_zigzag_direct[64] = {
    /*  0 */  0,  1,  8, 16,  9,  2,  3, 10,
    /*  8 */ 17, 24, 32, 25, 18, 11,  4,  5,
    /* 16 */ 12, 19, 26, 33, 40, 48, 41, 34,
    /* 24 */ 27, 20, 13,  6,  7, 14, 21, 28,
    /* 32 */ 35, 42, 49, 56, 57, 50, 43, 36,
    /* 40 */ 29, 22, 15, 23, 30, 37, 44, 51,
    /* 48 */ 58, 59, 52, 45, 38, 31, 39, 46,
    /* 56 */ 53, 60, 61, 54, 47, 55, 62, 63
};
51
10acc479
RS
/*
 * Zigzag scan used with the 248 IDCT.
 * NOTE: unlike the specification, the two fields are interleaved here.
 */
const uint8_t ff_zigzag248_direct[64] = {
    /*  0 */  0,  8,  1,  9, 16, 24,  2, 10,
    /*  8 */ 17, 25, 32, 40, 48, 56, 33, 41,
    /* 16 */ 18, 26,  3, 11,  4, 12, 19, 27,
    /* 24 */ 34, 42, 49, 57, 50, 58, 35, 43,
    /* 32 */ 20, 28,  5, 13,  6, 14, 21, 29,
    /* 40 */ 36, 44, 51, 59, 52, 60, 37, 45,
    /* 48 */ 22, 30,  7, 15, 23, 31, 38, 46,
    /* 56 */ 53, 61, 54, 62, 39, 47, 55, 63,
};
64
2f349de2 65/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
68b51e58 66DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
2f349de2 67
/* Alternate (horizontal) scan order for an 8x8 block; entries are raster indices. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    /*  0 */  0,  1,  2,  3,  8,  9, 16, 17,
    /*  8 */ 10, 11,  4,  5,  6,  7, 15, 14,
    /* 16 */ 13, 12, 19, 18, 24, 25, 32, 33,
    /* 24 */ 26, 27, 20, 21, 22, 23, 28, 29,
    /* 32 */ 30, 31, 34, 35, 40, 41, 48, 49,
    /* 40 */ 42, 43, 36, 37, 38, 39, 44, 45,
    /* 48 */ 46, 47, 50, 51, 56, 57, 58, 59,
    /* 56 */ 52, 53, 54, 55, 60, 61, 62, 63,
};
78
/* Alternate (vertical) scan order for an 8x8 block; entries are raster indices. */
const uint8_t ff_alternate_vertical_scan[64] = {
    /*  0 */  0,  8, 16, 24,  1,  9,  2, 10,
    /*  8 */ 17, 25, 32, 40, 48, 56, 57, 49,
    /* 16 */ 41, 33, 26, 18,  3, 11,  4, 12,
    /* 24 */ 19, 27, 34, 42, 50, 58, 35, 43,
    /* 32 */ 51, 59, 20, 28,  5, 13,  6, 14,
    /* 40 */ 21, 29, 36, 44, 52, 60, 37, 45,
    /* 48 */ 53, 61, 22, 30,  7, 15, 23, 31,
    /* 56 */ 38, 46, 54, 62, 39, 47, 55, 63,
};
89
/*
 * Fixed-point reciprocals for division by multiplication:
 * (a * inverse[b]) >> 32 == a / b   for all 0 <= a <= 65536 and 2 <= b <= 255
 * (the product has to be formed in 64 bits).
 */
const uint32_t inverse[256] = {
    /*   0 */         0, 4294967295U, 2147483648U, 1431655766, 1073741824,  858993460,  715827883,  613566757,
    /*   8 */ 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
    /*  16 */ 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
    /*  24 */ 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
    /*  32 */ 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
    /*  40 */ 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
    /*  48 */  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
    /*  56 */  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
    /*  64 */  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
    /*  72 */  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
    /*  80 */  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
    /*  88 */  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
    /*  96 */  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
    /* 104 */  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
    /* 112 */  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
    /* 120 */  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
    /* 128 */  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
    /* 136 */  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
    /* 144 */  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
    /* 152 */  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
    /* 160 */  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
    /* 168 */  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
    /* 176 */  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
    /* 184 */  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
    /* 192 */  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
    /* 200 */  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
    /* 208 */  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
    /* 216 */  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
    /* 224 */  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
    /* 232 */  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
    /* 240 */  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
    /* 248 */  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
125
b0368839
MN
/* Coefficient order expected on input by simple_idct_mmx (a permutation of 0..63). */
static const uint8_t simple_mmx_permutation[64] = {
    /*  0 */ 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    /*  8 */ 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    /* 16 */ 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    /* 24 */ 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    /* 32 */ 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    /* 40 */ 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    /* 48 */ 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    /* 56 */ 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
137
/**
 * Sum the pixel values of a 16x16 block.
 *
 * @param pix       top-left pixel of the block
 * @param line_size offset in bytes between the starts of consecutive rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        const uint8_t *line = pix + row * line_size;

        for (col = 0; col < 16; col++)
            total += line[col];
    }
    return total;
}
159
0c1a9eda 160static int pix_norm1_c(uint8_t * pix, int line_size)
3aa102be
MN
161{
162 int s, i, j;
0c1a9eda 163 uint32_t *sq = squareTbl + 256;
3aa102be
MN
164
165 s = 0;
166 for (i = 0; i < 16; i++) {
bb270c08 167 for (j = 0; j < 16; j += 8) {
2a006cd3 168#if 0
bb270c08
DB
169 s += sq[pix[0]];
170 s += sq[pix[1]];
171 s += sq[pix[2]];
172 s += sq[pix[3]];
173 s += sq[pix[4]];
174 s += sq[pix[5]];
175 s += sq[pix[6]];
176 s += sq[pix[7]];
2a006cd3
FL
177#else
178#if LONG_MAX > 2147483647
bb270c08
DB
179 register uint64_t x=*(uint64_t*)pix;
180 s += sq[x&0xff];
181 s += sq[(x>>8)&0xff];
182 s += sq[(x>>16)&0xff];
183 s += sq[(x>>24)&0xff];
2a006cd3
FL
184 s += sq[(x>>32)&0xff];
185 s += sq[(x>>40)&0xff];
186 s += sq[(x>>48)&0xff];
187 s += sq[(x>>56)&0xff];
188#else
bb270c08
DB
189 register uint32_t x=*(uint32_t*)pix;
190 s += sq[x&0xff];
191 s += sq[(x>>8)&0xff];
192 s += sq[(x>>16)&0xff];
193 s += sq[(x>>24)&0xff];
2a006cd3
FL
194 x=*(uint32_t*)(pix+4);
195 s += sq[x&0xff];
196 s += sq[(x>>8)&0xff];
197 s += sq[(x>>16)&0xff];
198 s += sq[(x>>24)&0xff];
199#endif
200#endif
bb270c08
DB
201 pix += 8;
202 }
203 pix += line_size - 16;
3aa102be
MN
204 }
205 return s;
206}
207
3d2e8cce
MN
/**
 * Byte-swap w 32 bit words from src into dst, in increasing index order.
 *
 * @param dst destination array of at least w words
 * @param src source array of at least w words
 * @param w   number of words to convert; nothing is done for w <= 0
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n;

    for (n = 0; n < w; n++)
        dst[n] = bswap_32(src[n]);
}
3aa102be 225
26efc54e
MN
226static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
227{
228 int s, i;
229 uint32_t *sq = squareTbl + 256;
230
231 s = 0;
232 for (i = 0; i < h; i++) {
233 s += sq[pix1[0] - pix2[0]];
234 s += sq[pix1[1] - pix2[1]];
235 s += sq[pix1[2] - pix2[2]];
236 s += sq[pix1[3] - pix2[3]];
237 pix1 += line_size;
238 pix2 += line_size;
239 }
240 return s;
241}
242
bb198e19 243static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
1457ab52
MN
244{
245 int s, i;
0c1a9eda 246 uint32_t *sq = squareTbl + 256;
1457ab52
MN
247
248 s = 0;
bb198e19 249 for (i = 0; i < h; i++) {
1457ab52
MN
250 s += sq[pix1[0] - pix2[0]];
251 s += sq[pix1[1] - pix2[1]];
252 s += sq[pix1[2] - pix2[2]];
253 s += sq[pix1[3] - pix2[3]];
254 s += sq[pix1[4] - pix2[4]];
255 s += sq[pix1[5] - pix2[5]];
256 s += sq[pix1[6] - pix2[6]];
257 s += sq[pix1[7] - pix2[7]];
258 pix1 += line_size;
259 pix2 += line_size;
260 }
261 return s;
262}
263
bb198e19 264static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
9c76bd48 265{
6b026927
FH
266 int s, i;
267 uint32_t *sq = squareTbl + 256;
9c76bd48
BF
268
269 s = 0;
bb198e19 270 for (i = 0; i < h; i++) {
6b026927
FH
271 s += sq[pix1[ 0] - pix2[ 0]];
272 s += sq[pix1[ 1] - pix2[ 1]];
273 s += sq[pix1[ 2] - pix2[ 2]];
274 s += sq[pix1[ 3] - pix2[ 3]];
275 s += sq[pix1[ 4] - pix2[ 4]];
276 s += sq[pix1[ 5] - pix2[ 5]];
277 s += sq[pix1[ 6] - pix2[ 6]];
278 s += sq[pix1[ 7] - pix2[ 7]];
279 s += sq[pix1[ 8] - pix2[ 8]];
280 s += sq[pix1[ 9] - pix2[ 9]];
281 s += sq[pix1[10] - pix2[10]];
282 s += sq[pix1[11] - pix2[11]];
283 s += sq[pix1[12] - pix2[12]];
284 s += sq[pix1[13] - pix2[13]];
285 s += sq[pix1[14] - pix2[14]];
286 s += sq[pix1[15] - pix2[15]];
2a006cd3 287
6b026927
FH
288 pix1 += line_size;
289 pix2 += line_size;
9c76bd48
BF
290 }
291 return s;
292}
293
26efc54e 294
871371a7 295#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/**
 * Wavelet-domain comparison of two square blocks.
 *
 * The pixel difference, scaled by 16, is written into a scratch buffer with a
 * row stride of 32 ints, transformed in place by ff_spatial_dwt(), and the
 * absolute coefficients are summed after weighting each subband with a
 * factor from the scale table.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size row offset in bytes, shared by both blocks
 * @param w         block width, a multiple of 4 and at most 32 (scratch row
 *                  length); 8 selects 3 decomposition levels, anything else 4
 * @param h         block height; must equal w
 * @param type      wavelet selector forwarded to ff_spatial_dwt() and used as
 *                  first index of the scale table (0: 9/7, 1: 5/3)
 * @return weighted sum of absolute coefficients, shifted right by 9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    /* checked before tmp[] is written, since h also bounds the rows stored there */
    assert(w==h);

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            /* "* 16" instead of "<< 4": the difference may be negative and
             * left-shifting a negative value is undefined */
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0]) * 16;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1]) * 16;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2]) * 16;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3]) * 16;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    for(level=0; level<dec_count; level++){
        /* orientation 0 is only visited at level 0 */
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    /* named coef so it no longer shadows the parameter v */
                    int coef= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += ABS(coef);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}
364
/** 8 pixel wide wavelet comparison, 5/3 wavelet: w_c() with w=8, type=1. */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int width = 8;
    const int wavelet = 1;

    return w_c(v, pix1, pix2, line_size, width, h, wavelet);
}
368
/** 8 pixel wide wavelet comparison, 9/7 wavelet: w_c() with w=8, type=0. */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int width = 8;
    const int wavelet = 0;

    return w_c(v, pix1, pix2, line_size, width, h, wavelet);
}
372
/** 16 pixel wide wavelet comparison, 5/3 wavelet: w_c() with w=16, type=1. */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int width = 16;
    const int wavelet = 1;

    return w_c(v, pix1, pix2, line_size, width, h, wavelet);
}
376
/** 16 pixel wide wavelet comparison, 9/7 wavelet: w_c() with w=16, type=0. */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int width = 16;
    const int wavelet = 0;

    return w_c(v, pix1, pix2, line_size, width, h, wavelet);
}
380
871371a7
LM
/** 32 pixel wide wavelet comparison, 5/3 wavelet: w_c() with w=32, type=1. Not static: visible outside this file. */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int width = 32;
    const int wavelet = 1;

    return w_c(v, pix1, pix2, line_size, width, h, wavelet);
}
384
/** 32 pixel wide wavelet comparison, 9/7 wavelet: w_c() with w=32, type=0. Not static: visible outside this file. */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int width = 32;
    const int wavelet = 0;

    return w_c(v, pix1, pix2, line_size, width, h, wavelet);
}
3a6fc8fa 388#endif
871371a7 389
0c1a9eda 390static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
de6d9b64 391{
de6d9b64
FB
392 int i;
393
394 /* read the pixels */
de6d9b64 395 for(i=0;i<8;i++) {
c13e1abd
FH
396 block[0] = pixels[0];
397 block[1] = pixels[1];
398 block[2] = pixels[2];
399 block[3] = pixels[3];
400 block[4] = pixels[4];
401 block[5] = pixels[5];
402 block[6] = pixels[6];
403 block[7] = pixels[7];
404 pixels += line_size;
405 block += 8;
de6d9b64
FB
406 }
407}
408
0c1a9eda 409static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
bb270c08 410 const uint8_t *s2, int stride){
9dbcbd92
MN
411 int i;
412
413 /* read the pixels */
9dbcbd92 414 for(i=0;i<8;i++) {
c13e1abd
FH
415 block[0] = s1[0] - s2[0];
416 block[1] = s1[1] - s2[1];
417 block[2] = s1[2] - s2[2];
418 block[3] = s1[3] - s2[3];
419 block[4] = s1[4] - s2[4];
420 block[5] = s1[5] - s2[5];
421 block[6] = s1[6] - s2[6];
422 block[7] = s1[7] - s2[7];
9dbcbd92
MN
423 s1 += stride;
424 s2 += stride;
c13e1abd 425 block += 8;
9dbcbd92
MN
426 }
427}
428
429
0c1a9eda 430static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 431 int line_size)
de6d9b64 432{
de6d9b64 433 int i;
0c1a9eda 434 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 435
de6d9b64 436 /* read the pixels */
de6d9b64 437 for(i=0;i<8;i++) {
c13e1abd
FH
438 pixels[0] = cm[block[0]];
439 pixels[1] = cm[block[1]];
440 pixels[2] = cm[block[2]];
441 pixels[3] = cm[block[3]];
442 pixels[4] = cm[block[4]];
443 pixels[5] = cm[block[5]];
444 pixels[6] = cm[block[6]];
445 pixels[7] = cm[block[7]];
446
447 pixels += line_size;
448 block += 8;
de6d9b64
FB
449 }
450}
451
178fcca8 452static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 453 int line_size)
178fcca8
MN
454{
455 int i;
456 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 457
178fcca8
MN
458 /* read the pixels */
459 for(i=0;i<4;i++) {
460 pixels[0] = cm[block[0]];
461 pixels[1] = cm[block[1]];
462 pixels[2] = cm[block[2]];
463 pixels[3] = cm[block[3]];
464
465 pixels += line_size;
466 block += 8;
467 }
468}
469
9ca358b9 470static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 471 int line_size)
9ca358b9
MN
472{
473 int i;
474 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 475
9ca358b9
MN
476 /* read the pixels */
477 for(i=0;i<2;i++) {
478 pixels[0] = cm[block[0]];
479 pixels[1] = cm[block[1]];
480
481 pixels += line_size;
482 block += 8;
483 }
484}
485
115329f1 486static void put_signed_pixels_clamped_c(const DCTELEM *block,
f9ed9d85
MM
487 uint8_t *restrict pixels,
488 int line_size)
489{
490 int i, j;
491
492 for (i = 0; i < 8; i++) {
493 for (j = 0; j < 8; j++) {
494 if (*block < -128)
495 *pixels = 0;
496 else if (*block > 127)
497 *pixels = 255;
498 else
499 *pixels = (uint8_t)(*block + 128);
500 block++;
501 pixels++;
502 }
503 pixels += (line_size - 8);
504 }
505}
506
0c1a9eda 507static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
c13e1abd 508 int line_size)
de6d9b64 509{
de6d9b64 510 int i;
0c1a9eda 511 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 512
de6d9b64 513 /* read the pixels */
de6d9b64 514 for(i=0;i<8;i++) {
c13e1abd
FH
515 pixels[0] = cm[pixels[0] + block[0]];
516 pixels[1] = cm[pixels[1] + block[1]];
517 pixels[2] = cm[pixels[2] + block[2]];
518 pixels[3] = cm[pixels[3] + block[3]];
519 pixels[4] = cm[pixels[4] + block[4]];
520 pixels[5] = cm[pixels[5] + block[5]];
521 pixels[6] = cm[pixels[6] + block[6]];
522 pixels[7] = cm[pixels[7] + block[7]];
523 pixels += line_size;
524 block += 8;
de6d9b64
FB
525 }
526}
178fcca8
MN
527
528static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
529 int line_size)
530{
531 int i;
532 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 533
178fcca8
MN
534 /* read the pixels */
535 for(i=0;i<4;i++) {
536 pixels[0] = cm[pixels[0] + block[0]];
537 pixels[1] = cm[pixels[1] + block[1]];
538 pixels[2] = cm[pixels[2] + block[2]];
539 pixels[3] = cm[pixels[3] + block[3]];
540 pixels += line_size;
541 block += 8;
542 }
543}
9ca358b9
MN
544
545static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
546 int line_size)
547{
548 int i;
549 uint8_t *cm = cropTbl + MAX_NEG_CROP;
115329f1 550
9ca358b9
MN
551 /* read the pixels */
552 for(i=0;i<2;i++) {
553 pixels[0] = cm[pixels[0] + block[0]];
554 pixels[1] = cm[pixels[1] + block[1]];
555 pixels += line_size;
556 block += 8;
557 }
558}
36940eca
LM
559
560static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
561{
562 int i;
563 for(i=0;i<8;i++) {
564 pixels[0] += block[0];
565 pixels[1] += block[1];
566 pixels[2] += block[2];
567 pixels[3] += block[3];
568 pixels[4] += block[4];
569 pixels[5] += block[5];
570 pixels[6] += block[6];
571 pixels[7] += block[7];
572 pixels += line_size;
573 block += 8;
574 }
575}
576
577static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
578{
579 int i;
580 for(i=0;i<4;i++) {
581 pixels[0] += block[0];
582 pixels[1] += block[1];
583 pixels[2] += block[2];
584 pixels[3] += block[3];
585 pixels += line_size;
586 block += 4;
587 }
588}
589
59fe111e
MN
590#if 0
591
592#define PIXOP2(OPNAME, OP) \
b3184779 593static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
594{\
595 int i;\
596 for(i=0; i<h; i++){\
597 OP(*((uint64_t*)block), LD64(pixels));\
598 pixels+=line_size;\
599 block +=line_size;\
600 }\
601}\
602\
45553457 603static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
604{\
605 int i;\
606 for(i=0; i<h; i++){\
607 const uint64_t a= LD64(pixels );\
608 const uint64_t b= LD64(pixels+1);\
609 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
610 pixels+=line_size;\
611 block +=line_size;\
612 }\
613}\
614\
45553457 615static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
616{\
617 int i;\
618 for(i=0; i<h; i++){\
619 const uint64_t a= LD64(pixels );\
620 const uint64_t b= LD64(pixels+1);\
621 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
622 pixels+=line_size;\
623 block +=line_size;\
624 }\
625}\
626\
45553457 627static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
628{\
629 int i;\
630 for(i=0; i<h; i++){\
631 const uint64_t a= LD64(pixels );\
632 const uint64_t b= LD64(pixels+line_size);\
633 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
634 pixels+=line_size;\
635 block +=line_size;\
636 }\
637}\
638\
45553457 639static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
640{\
641 int i;\
642 for(i=0; i<h; i++){\
643 const uint64_t a= LD64(pixels );\
644 const uint64_t b= LD64(pixels+line_size);\
645 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
646 pixels+=line_size;\
647 block +=line_size;\
648 }\
649}\
650\
45553457 651static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
652{\
653 int i;\
654 const uint64_t a= LD64(pixels );\
655 const uint64_t b= LD64(pixels+1);\
656 uint64_t l0= (a&0x0303030303030303ULL)\
657 + (b&0x0303030303030303ULL)\
658 + 0x0202020202020202ULL;\
659 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
660 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
661 uint64_t l1,h1;\
662\
663 pixels+=line_size;\
664 for(i=0; i<h; i+=2){\
665 uint64_t a= LD64(pixels );\
666 uint64_t b= LD64(pixels+1);\
667 l1= (a&0x0303030303030303ULL)\
668 + (b&0x0303030303030303ULL);\
669 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
670 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
671 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
672 pixels+=line_size;\
673 block +=line_size;\
674 a= LD64(pixels );\
675 b= LD64(pixels+1);\
676 l0= (a&0x0303030303030303ULL)\
677 + (b&0x0303030303030303ULL)\
678 + 0x0202020202020202ULL;\
679 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
680 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
681 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
682 pixels+=line_size;\
683 block +=line_size;\
684 }\
685}\
686\
45553457 687static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
688{\
689 int i;\
690 const uint64_t a= LD64(pixels );\
691 const uint64_t b= LD64(pixels+1);\
692 uint64_t l0= (a&0x0303030303030303ULL)\
693 + (b&0x0303030303030303ULL)\
694 + 0x0101010101010101ULL;\
695 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
696 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
697 uint64_t l1,h1;\
698\
699 pixels+=line_size;\
700 for(i=0; i<h; i+=2){\
701 uint64_t a= LD64(pixels );\
702 uint64_t b= LD64(pixels+1);\
703 l1= (a&0x0303030303030303ULL)\
704 + (b&0x0303030303030303ULL);\
705 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
706 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
707 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
708 pixels+=line_size;\
709 block +=line_size;\
710 a= LD64(pixels );\
711 b= LD64(pixels+1);\
712 l0= (a&0x0303030303030303ULL)\
713 + (b&0x0303030303030303ULL)\
714 + 0x0101010101010101ULL;\
715 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
716 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
717 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
718 pixels+=line_size;\
719 block +=line_size;\
720 }\
721}\
722\
45553457
ZK
723CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
724CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
725CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
726CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
727CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
728CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
729CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
59fe111e
MN
730
731#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
732#else // 64 bit variant
733
734#define PIXOP2(OPNAME, OP) \
669ac79c
MN
735static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
736 int i;\
737 for(i=0; i<h; i++){\
738 OP(*((uint16_t*)(block )), LD16(pixels ));\
739 pixels+=line_size;\
740 block +=line_size;\
741 }\
742}\
0da71265
MN
743static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
744 int i;\
745 for(i=0; i<h; i++){\
746 OP(*((uint32_t*)(block )), LD32(pixels ));\
747 pixels+=line_size;\
748 block +=line_size;\
749 }\
750}\
45553457 751static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
59fe111e
MN
752 int i;\
753 for(i=0; i<h; i++){\
754 OP(*((uint32_t*)(block )), LD32(pixels ));\
755 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
756 pixels+=line_size;\
757 block +=line_size;\
758 }\
759}\
45553457
ZK
760static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
761 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
b3184779 762}\
59fe111e 763\
b3184779
MN
764static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
765 int src_stride1, int src_stride2, int h){\
59fe111e
MN
766 int i;\
767 for(i=0; i<h; i++){\
b3184779
MN
768 uint32_t a,b;\
769 a= LD32(&src1[i*src_stride1 ]);\
770 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 771 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
b3184779
MN
772 a= LD32(&src1[i*src_stride1+4]);\
773 b= LD32(&src2[i*src_stride2+4]);\
d8085ea7 774 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
59fe111e
MN
775 }\
776}\
777\
b3184779
MN
778static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
779 int src_stride1, int src_stride2, int h){\
59fe111e
MN
780 int i;\
781 for(i=0; i<h; i++){\
b3184779
MN
782 uint32_t a,b;\
783 a= LD32(&src1[i*src_stride1 ]);\
784 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 785 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
b3184779
MN
786 a= LD32(&src1[i*src_stride1+4]);\
787 b= LD32(&src2[i*src_stride2+4]);\
d8085ea7 788 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
59fe111e
MN
789 }\
790}\
791\
0da71265
MN
792static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
793 int src_stride1, int src_stride2, int h){\
794 int i;\
795 for(i=0; i<h; i++){\
796 uint32_t a,b;\
797 a= LD32(&src1[i*src_stride1 ]);\
798 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 799 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
0da71265
MN
800 }\
801}\
802\
669ac79c
MN
803static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
804 int src_stride1, int src_stride2, int h){\
805 int i;\
806 for(i=0; i<h; i++){\
807 uint32_t a,b;\
808 a= LD16(&src1[i*src_stride1 ]);\
809 b= LD16(&src2[i*src_stride2 ]);\
810 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
811 }\
812}\
813\
b3184779
MN
814static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
815 int src_stride1, int src_stride2, int h){\
816 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
817 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
818}\
819\
820static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
821 int src_stride1, int src_stride2, int h){\
822 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
823 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
824}\
825\
45553457 826static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
827 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
828}\
829\
45553457 830static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
831 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
832}\
833\
45553457 834static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
835 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
836}\
837\
45553457 838static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
839 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
840}\
841\
842static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
843 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
844 int i;\
845 for(i=0; i<h; i++){\
b3184779
MN
846 uint32_t a, b, c, d, l0, l1, h0, h1;\
847 a= LD32(&src1[i*src_stride1]);\
848 b= LD32(&src2[i*src_stride2]);\
849 c= LD32(&src3[i*src_stride3]);\
850 d= LD32(&src4[i*src_stride4]);\
851 l0= (a&0x03030303UL)\
852 + (b&0x03030303UL)\
853 + 0x02020202UL;\
854 h0= ((a&0xFCFCFCFCUL)>>2)\
855 + ((b&0xFCFCFCFCUL)>>2);\
856 l1= (c&0x03030303UL)\
857 + (d&0x03030303UL);\
858 h1= ((c&0xFCFCFCFCUL)>>2)\
859 + ((d&0xFCFCFCFCUL)>>2);\
860 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
861 a= LD32(&src1[i*src_stride1+4]);\
862 b= LD32(&src2[i*src_stride2+4]);\
863 c= LD32(&src3[i*src_stride3+4]);\
864 d= LD32(&src4[i*src_stride4+4]);\
865 l0= (a&0x03030303UL)\
866 + (b&0x03030303UL)\
867 + 0x02020202UL;\
868 h0= ((a&0xFCFCFCFCUL)>>2)\
869 + ((b&0xFCFCFCFCUL)>>2);\
870 l1= (c&0x03030303UL)\
871 + (d&0x03030303UL);\
872 h1= ((c&0xFCFCFCFCUL)>>2)\
873 + ((d&0xFCFCFCFCUL)>>2);\
874 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
875 }\
876}\
669ac79c
MN
877\
878static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
879 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
880}\
881\
882static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
883 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
884}\
885\
886static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
887 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
888}\
889\
890static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
891 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
892}\
893\
b3184779
MN
894static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
895 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
896 int i;\
897 for(i=0; i<h; i++){\
b3184779
MN
898 uint32_t a, b, c, d, l0, l1, h0, h1;\
899 a= LD32(&src1[i*src_stride1]);\
900 b= LD32(&src2[i*src_stride2]);\
901 c= LD32(&src3[i*src_stride3]);\
902 d= LD32(&src4[i*src_stride4]);\
903 l0= (a&0x03030303UL)\
904 + (b&0x03030303UL)\
905 + 0x01010101UL;\
906 h0= ((a&0xFCFCFCFCUL)>>2)\
907 + ((b&0xFCFCFCFCUL)>>2);\
908 l1= (c&0x03030303UL)\
909 + (d&0x03030303UL);\
910 h1= ((c&0xFCFCFCFCUL)>>2)\
911 + ((d&0xFCFCFCFCUL)>>2);\
912 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
913 a= LD32(&src1[i*src_stride1+4]);\
914 b= LD32(&src2[i*src_stride2+4]);\
915 c= LD32(&src3[i*src_stride3+4]);\
916 d= LD32(&src4[i*src_stride4+4]);\
917 l0= (a&0x03030303UL)\
918 + (b&0x03030303UL)\
919 + 0x01010101UL;\
920 h0= ((a&0xFCFCFCFCUL)>>2)\
921 + ((b&0xFCFCFCFCUL)>>2);\
922 l1= (c&0x03030303UL)\
923 + (d&0x03030303UL);\
924 h1= ((c&0xFCFCFCFCUL)>>2)\
925 + ((d&0xFCFCFCFCUL)>>2);\
926 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
927 }\
928}\
b3184779
MN
929static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
930 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
931 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
932 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
933}\
934static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
935 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
936 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
937 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
938}\
59fe111e 939\
669ac79c
MN
940static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
941{\
942 int i, a0, b0, a1, b1;\
943 a0= pixels[0];\
944 b0= pixels[1] + 2;\
945 a0 += b0;\
946 b0 += pixels[2];\
947\
948 pixels+=line_size;\
949 for(i=0; i<h; i+=2){\
950 a1= pixels[0];\
951 b1= pixels[1];\
952 a1 += b1;\
953 b1 += pixels[2];\
954\
955 block[0]= (a1+a0)>>2; /* FIXME non put */\
956 block[1]= (b1+b0)>>2;\
957\
958 pixels+=line_size;\
959 block +=line_size;\
960\
961 a0= pixels[0];\
962 b0= pixels[1] + 2;\
963 a0 += b0;\
964 b0 += pixels[2];\
965\
966 block[0]= (a1+a0)>>2;\
967 block[1]= (b1+b0)>>2;\
968 pixels+=line_size;\
969 block +=line_size;\
970 }\
971}\
972\
973static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
974{\
975 int i;\
976 const uint32_t a= LD32(pixels );\
977 const uint32_t b= LD32(pixels+1);\
978 uint32_t l0= (a&0x03030303UL)\
979 + (b&0x03030303UL)\
980 + 0x02020202UL;\
981 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
982 + ((b&0xFCFCFCFCUL)>>2);\
983 uint32_t l1,h1;\
984\
985 pixels+=line_size;\
986 for(i=0; i<h; i+=2){\
987 uint32_t a= LD32(pixels );\
988 uint32_t b= LD32(pixels+1);\
989 l1= (a&0x03030303UL)\
990 + (b&0x03030303UL);\
991 h1= ((a&0xFCFCFCFCUL)>>2)\
992 + ((b&0xFCFCFCFCUL)>>2);\
993 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
994 pixels+=line_size;\
995 block +=line_size;\
996 a= LD32(pixels );\
997 b= LD32(pixels+1);\
998 l0= (a&0x03030303UL)\
999 + (b&0x03030303UL)\
1000 + 0x02020202UL;\
1001 h0= ((a&0xFCFCFCFCUL)>>2)\
1002 + ((b&0xFCFCFCFCUL)>>2);\
1003 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1004 pixels+=line_size;\
1005 block +=line_size;\
1006 }\
1007}\
1008\
45553457 1009static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
1010{\
1011 int j;\
1012 for(j=0; j<2; j++){\
1013 int i;\
1014 const uint32_t a= LD32(pixels );\
1015 const uint32_t b= LD32(pixels+1);\
1016 uint32_t l0= (a&0x03030303UL)\
1017 + (b&0x03030303UL)\
1018 + 0x02020202UL;\
1019 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1020 + ((b&0xFCFCFCFCUL)>>2);\
1021 uint32_t l1,h1;\
1022\
1023 pixels+=line_size;\
1024 for(i=0; i<h; i+=2){\
1025 uint32_t a= LD32(pixels );\
1026 uint32_t b= LD32(pixels+1);\
1027 l1= (a&0x03030303UL)\
1028 + (b&0x03030303UL);\
1029 h1= ((a&0xFCFCFCFCUL)>>2)\
1030 + ((b&0xFCFCFCFCUL)>>2);\
1031 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1032 pixels+=line_size;\
1033 block +=line_size;\
1034 a= LD32(pixels );\
1035 b= LD32(pixels+1);\
1036 l0= (a&0x03030303UL)\
1037 + (b&0x03030303UL)\
1038 + 0x02020202UL;\
1039 h0= ((a&0xFCFCFCFCUL)>>2)\
1040 + ((b&0xFCFCFCFCUL)>>2);\
1041 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1042 pixels+=line_size;\
1043 block +=line_size;\
1044 }\
1045 pixels+=4-line_size*(h+1);\
1046 block +=4-line_size*h;\
1047 }\
1048}\
1049\
45553457 1050static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
1051{\
1052 int j;\
1053 for(j=0; j<2; j++){\
1054 int i;\
1055 const uint32_t a= LD32(pixels );\
1056 const uint32_t b= LD32(pixels+1);\
1057 uint32_t l0= (a&0x03030303UL)\
1058 + (b&0x03030303UL)\
1059 + 0x01010101UL;\
1060 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1061 + ((b&0xFCFCFCFCUL)>>2);\
1062 uint32_t l1,h1;\
1063\
1064 pixels+=line_size;\
1065 for(i=0; i<h; i+=2){\
1066 uint32_t a= LD32(pixels );\
1067 uint32_t b= LD32(pixels+1);\
1068 l1= (a&0x03030303UL)\
1069 + (b&0x03030303UL);\
1070 h1= ((a&0xFCFCFCFCUL)>>2)\
1071 + ((b&0xFCFCFCFCUL)>>2);\
1072 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1073 pixels+=line_size;\
1074 block +=line_size;\
1075 a= LD32(pixels );\
1076 b= LD32(pixels+1);\
1077 l0= (a&0x03030303UL)\
1078 + (b&0x03030303UL)\
1079 + 0x01010101UL;\
1080 h0= ((a&0xFCFCFCFCUL)>>2)\
1081 + ((b&0xFCFCFCFCUL)>>2);\
1082 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1083 pixels+=line_size;\
1084 block +=line_size;\
1085 }\
1086 pixels+=4-line_size*(h+1);\
1087 block +=4-line_size*h;\
1088 }\
1089}\
1090\
45553457
ZK
1091CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1092CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1093CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1094CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1095CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1096CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1097CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1098CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
b3184779 1099
d8085ea7 1100#define op_avg(a, b) a = rnd_avg32(a, b)
59fe111e 1101#endif
59fe111e
MN
1102#define op_put(a, b) a = b
1103
1104PIXOP2(avg, op_avg)
1105PIXOP2(put, op_put)
1106#undef op_avg
1107#undef op_put
1108
de6d9b64
FB
/* Rounded integer mean of two / four values. Every argument is parenthesized
 * so that operands containing low-precedence operators (|, ?:, ...) are
 * evaluated as a unit. Arguments are still evaluated exactly once each. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1111
c0a0170c
MN
/**
 * Single-stride entry point for put_no_rnd_pixels16_l2(): the same stride is
 * used for the destination and for both sources a and b.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1115
/**
 * Single-stride entry point for put_no_rnd_pixels8_l2(): the same stride is
 * used for the destination and for both sources a and b.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
073b013d 1119
/**
 * Bilinear interpolation of an 8-pixel-wide block with a constant
 * 1/16-pel offset.
 *
 * @param dst     destination, 8 bytes written per row
 * @param src     source; columns 0..8 of rows 0..h are read
 * @param stride  line size shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbour, in 1/16 units
 * @param y16     vertical weight of the lower neighbour, in 1/16 units
 * @param rounder value added before the final >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* The four weights always sum to 16*16 = 256, hence the >>8 below. */
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
1142
/**
 * Fill an 8-pixel-wide, h-row block by sampling src along an affine grid.
 *
 * The sample position for output pixel (x, y) is
 *   vx = ox + x*dxx + y*dxy,  vy = oy + x*dyx + y*dyy.
 * After dropping the low 16 bits, the next `shift` bits are the sub-pixel
 * fraction and the remainder is the integer source coordinate.
 *
 * @param dst     destination block
 * @param src     source plane
 * @param stride  line size shared by dst and src
 * @param h       number of rows to produce
 * @param shift   number of fractional bits kept for interpolation
 * @param r       rounding term added before the final >>(2*shift)
 * @param width   source width; positions at or beyond width-1 are clamped
 * @param height  source height; positions at or beyond height-1 are clamped
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* From here on width/height are the largest valid coordinates; a sample
     * sitting exactly on them takes a clamped path so that index+1 or
     * index+stride is never read past the edge. */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* the unsigned compare rejects negative coordinates as well */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* row clamped: horizontal interpolation only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* column clamped: vertical interpolation only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both clamped: nearest edge pixel, no interpolation */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
669ac79c
MN
1200
/**
 * Zero-offset case: dispatch to the fixed-width put_pixelsN_c() copy that
 * matches width. Widths other than 2, 4, 8 and 16 are ignored.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    default: break;
    }
}
1209
/**
 * Horizontal blend with weights 2:1 (current : right neighbour).
 * 683/2048 approximates 1/3, so each output is about (2*a + b + 1) / 3.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1220
/**
 * Horizontal blend with weights 1:2 (current : right neighbour).
 * 683/2048 approximates 1/3, so each output is about (a + 2*b + 1) / 3.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
115329f1 1231
669ac79c
MN
/**
 * Vertical blend with weights 2:1 (current : pixel one line below).
 * 683/2048 approximates 1/3, so each output is about (2*a + c + 1) / 3.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
115329f1 1242
669ac79c
MN
/**
 * 2-D blend of a pixel with its right, lower and lower-right neighbours,
 * weights 4:3:3:2. 2731/32768 approximates 1/12 (the weight sum); +6 rounds.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1253
/**
 * 2-D blend of a pixel with its right, lower and lower-right neighbours,
 * weights 3:2:4:3. 2731/32768 approximates 1/12 (the weight sum); +6 rounds.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1264
/**
 * Vertical blend with weights 1:2 (current : pixel one line below).
 * 683/2048 approximates 1/3, so each output is about (a + 2*c + 1) / 3.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1275
/**
 * 2-D blend of a pixel with its right, lower and lower-right neighbours,
 * weights 3:4:2:3. 2731/32768 approximates 1/12 (the weight sum); +6 rounds.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1286
/**
 * 2-D blend of a pixel with its right, lower and lower-right neighbours,
 * weights 2:3:3:4. 2731/32768 approximates 1/12 (the weight sum); +6 rounds.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
da3b9756
MM
1297
/**
 * Zero-offset case: dispatch to the fixed-width avg_pixelsN_c() routine that
 * matches width. Widths other than 2, 4, 8 and 16 are ignored.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    default: break;
    }
}
1306
/**
 * Horizontal 2:1 blend (current : right neighbour, 683/2048 ~ 1/3), then a
 * rounded average of that value with the byte already in dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1317
/**
 * Horizontal 1:2 blend (current : right neighbour, 683/2048 ~ 1/3), then a
 * rounded average of that value with the byte already in dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
115329f1 1328
da3b9756
MM
/**
 * Vertical 2:1 blend (current : pixel one line below, 683/2048 ~ 1/3), then
 * a rounded average of that value with the byte already in dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
115329f1 1339
da3b9756
MM
/**
 * 2-D blend with weights 4:3:3:2 over the pixel and its right, lower and
 * lower-right neighbours (2731/32768 ~ 1/12), then a rounded average of that
 * value with the byte already in dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1350
/**
 * 2-D blend with weights 3:2:4:3 over the pixel and its right, lower and
 * lower-right neighbours (2731/32768 ~ 1/12), then a rounded average of that
 * value with the byte already in dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1361
/**
 * Vertical 1:2 blend (current : pixel one line below, 683/2048 ~ 1/3), then
 * a rounded average of that value with the byte already in dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1372
/**
 * 2-D blend with weights 3:4:2:3 over the pixel and its right, lower and
 * lower-right neighbours (2731/32768 ~ 1/12), then a rounded average of that
 * value with the byte already in dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1383
/**
 * 2-D blend with weights 2:3:3:4 over the pixel and its right, lower and
 * lower-right neighbours (2731/32768 ~ 1/12), then a rounded average of that
 * value with the byte already in dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
669ac79c
MN
#if 0
/* Disabled generator for fixed-width wrappers around the generic
 * put_tpel_pixels_mcXY_c() routines. The stray "void" that preceded each
 * call (turning it into an invalid declaration) has been dropped so the
 * macro is usable if this section is ever enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1415
0da71265
MN
/*
 * Generates OPNAME##h264_chroma_mc{2,4,8}_c: bilinear interpolation of a
 * 2-, 4- or 8-pixel-wide block with a 1/8-pel offset (x, y), both in 0..7.
 * The weights A..D sum to 64; the unscaled weighted sum is handed to OP,
 * which performs the rounding shift and the store (see op_put / op_avg).
 * Each row reads one extra source column and the line below it.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}

/* b is the weighted sum scaled by 64: (b+32)>>6 rounds it back to a pixel.
 * op_avg additionally takes the rounded mean with the existing destination. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1486
80e44bc3
MN
/**
 * Copy h rows of 2 bytes from src to dst, one LD16/ST16 pair per row
 * (presumably the project's 16-bit load/store helpers, defined elsewhere).
 * dst and src advance by their own strides.
 */
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST16(dst   , LD16(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1497
0da71265
MN
/**
 * Copy h rows of 4 bytes from src to dst, one LD32/ST32 pair per row
 * (presumably the project's 32-bit load/store helpers, defined elsewhere).
 * dst and src advance by their own strides.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1508
/**
 * Copy h rows of 8 bytes from src to dst as two LD32/ST32 transfers per row
 * (presumably the project's 32-bit load/store helpers, defined elsewhere).
 * dst and src advance by their own strides.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}
1520
/**
 * Copy h rows of 16 bytes from src to dst as four LD32/ST32 transfers per
 * row (presumably the project's 32-bit load/store helpers, defined elsewhere).
 * dst and src advance by their own strides.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}
073b013d 1534
/**
 * Copy h rows of 17 bytes from src to dst: four LD32/ST32 transfers
 * (presumably the project's 32-bit load/store helpers) plus one trailing
 * byte per row. dst and src advance by their own strides.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}
1549
/**
 * Copy h rows of 9 bytes from src to dst: two LD32/ST32 transfers
 * (presumably the project's 32-bit load/store helpers) plus one trailing
 * byte per row. dst and src advance by their own strides.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}
1562
826f429a 1563
b3184779 1564#define QPEL_MC(r, OPNAME, RND, OP) \
0c1a9eda
ZK
1565static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1566 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779
MN
1567 int i;\
1568 for(i=0; i<h; i++)\
1569 {\
1570 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1571 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1572 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1573 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1574 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1575 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1576 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1577 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1578 dst+=dstStride;\
1579 src+=srcStride;\
1580 }\
44eb4951
MN
1581}\
1582\
0c1a9eda 1583static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
db794953 1584 const int w=8;\
0c1a9eda 1585 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779
MN
1586 int i;\
1587 for(i=0; i<w; i++)\
1588 {\
1589 const int src0= src[0*srcStride];\
1590 const int src1= src[1*srcStride];\
1591 const int src2= src[2*srcStride];\
1592 const int src3= src[3*srcStride];\
1593 const int src4= src[4*srcStride];\
1594 const int src5= src[5*srcStride];\
1595 const int src6= src[6*srcStride];\
1596 const int src7= src[7*srcStride];\
1597 const int src8= src[8*srcStride];\
1598 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1599 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1600 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1601 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1602 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1603 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1604 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1605 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1606 dst++;\
1607 src++;\
1608 }\
1609}\
1610\
0c1a9eda
ZK
1611static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1612 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779 1613 int i;\
826f429a 1614 \
b3184779
MN
1615 for(i=0; i<h; i++)\
1616 {\
1617 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1618 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1619 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1620 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1621 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1622 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1623 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1624 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1625 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1626 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1627 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1628 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1629 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1630 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1631 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1632 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1633 dst+=dstStride;\
1634 src+=srcStride;\
1635 }\
1636}\
1637\
0c1a9eda
ZK
1638static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1639 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
b3184779 1640 int i;\
826f429a 1641 const int w=16;\
b3184779
MN
1642 for(i=0; i<w; i++)\
1643 {\
1644 const int src0= src[0*srcStride];\
1645 const int src1= src[1*srcStride];\
1646 const int src2= src[2*srcStride];\
1647 const int src3= src[3*srcStride];\
1648 const int src4= src[4*srcStride];\
1649 const int src5= src[5*srcStride];\
1650 const int src6= src[6*srcStride];\
1651 const int src7= src[7*srcStride];\
1652 const int src8= src[8*srcStride];\
1653 const int src9= src[9*srcStride];\
1654 const int src10= src[10*srcStride];\
1655 const int src11= src[11*srcStride];\
1656 const int src12= src[12*srcStride];\
1657 const int src13= src[13*srcStride];\
1658 const int src14= src[14*srcStride];\
1659 const int src15= src[15*srcStride];\
1660 const int src16= src[16*srcStride];\
1661 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1662 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1663 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1664 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1665 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1666 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1667 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1668 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1669 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1670 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1671 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1672 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1673 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1674 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1675 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1676 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1677 dst++;\
1678 src++;\
1679 }\
1680}\
1681\
0c1a9eda 1682static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1683 OPNAME ## pixels8_c(dst, src, stride, 8);\
b3184779
MN
1684}\
1685\
0c1a9eda
ZK
1686static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1687 uint8_t half[64];\
b3184779
MN
1688 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1689 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
44eb4951
MN
1690}\
1691\
0c1a9eda 1692static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1693 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
44eb4951
MN
1694}\
1695\
0c1a9eda
ZK
1696static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1697 uint8_t half[64];\
b3184779
MN
1698 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1699 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
44eb4951
MN
1700}\
1701\
0c1a9eda
ZK
1702static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1703 uint8_t full[16*9];\
1704 uint8_t half[64];\
b3184779 1705 copy_block9(full, src, 16, stride, 9);\
db794953 1706 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1707 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
44eb4951
MN
1708}\
1709\
0c1a9eda
ZK
1710static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1711 uint8_t full[16*9];\
b3184779 1712 copy_block9(full, src, 16, stride, 9);\
db794953 1713 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
44eb4951
MN
1714}\
1715\
0c1a9eda
ZK
1716static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1717 uint8_t full[16*9];\
1718 uint8_t half[64];\
b3184779 1719 copy_block9(full, src, 16, stride, 9);\
db794953 1720 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1721 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
44eb4951 1722}\
0c1a9eda
ZK
1723void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1724 uint8_t full[16*9];\
1725 uint8_t halfH[72];\
1726 uint8_t halfV[64];\
1727 uint8_t halfHV[64];\
b3184779
MN
1728 copy_block9(full, src, 16, stride, 9);\
1729 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1730 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1731 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1732 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1733}\
0c1a9eda
ZK
1734static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1735 uint8_t full[16*9];\
1736 uint8_t halfH[72];\
1737 uint8_t halfHV[64];\
db794953
MN
1738 copy_block9(full, src, 16, stride, 9);\
1739 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1740 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1741 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1742 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1743}\
0c1a9eda
ZK
1744void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1745 uint8_t full[16*9];\
1746 uint8_t halfH[72];\
1747 uint8_t halfV[64];\
1748 uint8_t halfHV[64];\
b3184779
MN
1749 copy_block9(full, src, 16, stride, 9);\
1750 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1751 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1752 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1753 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1754}\
0c1a9eda
ZK
1755static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1756 uint8_t full[16*9];\
1757 uint8_t halfH[72];\
1758 uint8_t halfHV[64];\
db794953
MN
1759 copy_block9(full, src, 16, stride, 9);\
1760 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1761 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1762 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1763 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1764}\
0c1a9eda
ZK
1765void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1766 uint8_t full[16*9];\
1767 uint8_t halfH[72];\
1768 uint8_t halfV[64];\
1769 uint8_t halfHV[64];\
b3184779
MN
1770 copy_block9(full, src, 16, stride, 9);\
1771 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1772 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1773 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1774 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1775}\
0c1a9eda
ZK
1776static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1777 uint8_t full[16*9];\
1778 uint8_t halfH[72];\
1779 uint8_t halfHV[64];\
db794953
MN
1780 copy_block9(full, src, 16, stride, 9);\
1781 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1782 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1783 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1784 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1785}\
0c1a9eda
ZK
1786void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1787 uint8_t full[16*9];\
1788 uint8_t halfH[72];\
1789 uint8_t halfV[64];\
1790 uint8_t halfHV[64];\
b3184779
MN
1791 copy_block9(full, src, 16, stride, 9);\
1792 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
db794953
MN
1793 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1794 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1795 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1796}\
0c1a9eda
ZK
1797static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1798 uint8_t full[16*9];\
1799 uint8_t halfH[72];\
1800 uint8_t halfHV[64];\
db794953
MN
1801 copy_block9(full, src, 16, stride, 9);\
1802 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1803 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1804 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1805 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1806}\
0c1a9eda
ZK
1807static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1808 uint8_t halfH[72];\
1809 uint8_t halfHV[64];\
b3184779 1810 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1811 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1812 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
44eb4951 1813}\
0c1a9eda
ZK
1814static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1815 uint8_t halfH[72];\
1816 uint8_t halfHV[64];\
b3184779 1817 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1818 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1819 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
44eb4951 1820}\
0c1a9eda
ZK
1821void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1822 uint8_t full[16*9];\
1823 uint8_t halfH[72];\
1824 uint8_t halfV[64];\
1825 uint8_t halfHV[64];\
b3184779
MN
1826 copy_block9(full, src, 16, stride, 9);\
1827 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1828 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1829 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1830 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1831}\
0c1a9eda
ZK
1832static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1833 uint8_t full[16*9];\
1834 uint8_t halfH[72];\
db794953
MN
1835 copy_block9(full, src, 16, stride, 9);\
1836 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1837 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1838 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1839}\
0c1a9eda
ZK
1840void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1841 uint8_t full[16*9];\
1842 uint8_t halfH[72];\
1843 uint8_t halfV[64];\
1844 uint8_t halfHV[64];\
b3184779
MN
1845 copy_block9(full, src, 16, stride, 9);\
1846 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1847 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1848 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1849 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1850}\
0c1a9eda
ZK
1851static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1852 uint8_t full[16*9];\
1853 uint8_t halfH[72];\
db794953
MN
1854 copy_block9(full, src, 16, stride, 9);\
1855 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1856 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1857 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1858}\
0c1a9eda
ZK
1859static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1860 uint8_t halfH[72];\
b3184779 1861 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1862 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
b3184779 1863}\
0c1a9eda 1864static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1865 OPNAME ## pixels16_c(dst, src, stride, 16);\
b3184779
MN
1866}\
1867\
0c1a9eda
ZK
1868static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1869 uint8_t half[256];\
b3184779
MN
1870 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1871 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1872}\
1873\
0c1a9eda 1874static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1875 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
44eb4951 1876}\
b3184779 1877\
0c1a9eda
ZK
1878static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1879 uint8_t half[256];\
b3184779
MN
1880 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1881 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1882}\
1883\
0c1a9eda
ZK
1884static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1885 uint8_t full[24*17];\
1886 uint8_t half[256];\
b3184779 1887 copy_block17(full, src, 24, stride, 17);\
826f429a 1888 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1889 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1890}\
1891\
0c1a9eda
ZK
1892static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1893 uint8_t full[24*17];\
b3184779 1894 copy_block17(full, src, 24, stride, 17);\
826f429a 1895 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
b3184779
MN
1896}\
1897\
0c1a9eda
ZK
1898static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1899 uint8_t full[24*17];\
1900 uint8_t half[256];\
b3184779 1901 copy_block17(full, src, 24, stride, 17);\
826f429a 1902 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1903 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1904}\
0c1a9eda
ZK
1905void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1906 uint8_t full[24*17];\
1907 uint8_t halfH[272];\
1908 uint8_t halfV[256];\
1909 uint8_t halfHV[256];\
b3184779
MN
1910 copy_block17(full, src, 24, stride, 17);\
1911 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1912 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1913 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1914 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1915}\
0c1a9eda
ZK
1916static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1917 uint8_t full[24*17];\
1918 uint8_t halfH[272];\
1919 uint8_t halfHV[256];\
db794953
MN
1920 copy_block17(full, src, 24, stride, 17);\
1921 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1922 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1923 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1924 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1925}\
0c1a9eda
ZK
1926void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1927 uint8_t full[24*17];\
1928 uint8_t halfH[272];\
1929 uint8_t halfV[256];\
1930 uint8_t halfHV[256];\
b3184779
MN
1931 copy_block17(full, src, 24, stride, 17);\
1932 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1933 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1934 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1935 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1936}\
0c1a9eda
ZK
1937static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1938 uint8_t full[24*17];\
1939 uint8_t halfH[272];\
1940 uint8_t halfHV[256];\
db794953
MN
1941 copy_block17(full, src, 24, stride, 17);\
1942 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1943 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1944 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1945 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1946}\
0c1a9eda
ZK
1947void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1948 uint8_t full[24*17];\
1949 uint8_t halfH[272];\
1950 uint8_t halfV[256];\
1951 uint8_t halfHV[256];\
b3184779
MN
1952 copy_block17(full, src, 24, stride, 17);\
1953 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1954 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1955 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1956 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1957}\
0c1a9eda
ZK
1958static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1959 uint8_t full[24*17];\
1960 uint8_t halfH[272];\
1961 uint8_t halfHV[256];\
db794953
MN
1962 copy_block17(full, src, 24, stride, 17);\
1963 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1964 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1965 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1966 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1967}\
0c1a9eda
ZK
1968void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1969 uint8_t full[24*17];\
1970 uint8_t halfH[272];\
1971 uint8_t halfV[256];\
1972 uint8_t halfHV[256];\
b3184779
MN
1973 copy_block17(full, src, 24, stride, 17);\
1974 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
826f429a
MN
1975 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1976 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1977 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1978}\
0c1a9eda
ZK
1979static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1980 uint8_t full[24*17];\
1981 uint8_t halfH[272];\
1982 uint8_t halfHV[256];\
db794953
MN
1983 copy_block17(full, src, 24, stride, 17);\
1984 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1985 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1986 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1987 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1988}\
0c1a9eda
ZK
1989static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1990 uint8_t halfH[272];\
1991 uint8_t halfHV[256];\
b3184779 1992 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1993 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1994 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1995}\
0c1a9eda
ZK
1996static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1997 uint8_t halfH[272];\
1998 uint8_t halfHV[256];\
b3184779 1999 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 2000 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
2001 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2002}\
0c1a9eda
ZK
2003void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2004 uint8_t full[24*17];\
2005 uint8_t halfH[272];\
2006 uint8_t halfV[256];\
2007 uint8_t halfHV[256];\
b3184779
MN
2008 copy_block17(full, src, 24, stride, 17);\
2009 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
2010 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2011 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
2012 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2013}\
0c1a9eda
ZK
2014static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2015 uint8_t full[24*17];\
2016 uint8_t halfH[272];\
db794953
MN
2017 copy_block17(full, src, 24, stride, 17);\
2018 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2019 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2020 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2021}\
0c1a9eda
ZK
2022void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2023 uint8_t full[24*17];\
2024 uint8_t halfH[272];\
2025 uint8_t halfV[256];\
2026 uint8_t halfHV[256];\
b3184779
MN
2027 copy_block17(full, src, 24, stride, 17);\
2028 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
2029 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2030 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
2031 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2032}\
0c1a9eda
ZK
2033static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2034 uint8_t full[24*17];\
2035 uint8_t halfH[272];\
db794953
MN
2036 copy_block17(full, src, 24, stride, 17);\
2037 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2038 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2039 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2040}\
0c1a9eda
ZK
2041static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2042 uint8_t halfH[272];\
b3184779 2043 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 2044 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
45553457 2045}
44eb4951 2046
b3184779
MN
/*
 * Store operators passed to QPEL_MC as its last argument.
 * Each takes a destination lvalue `a` and a filter sum `b`. The sum is biased,
 * shifted right by 5 (a division by 32) and used as an index into `cm`.
 * `cm` is not defined by these macros: it has to be in scope wherever the
 * operator is expanded (presumably a clipping table - confirm at the
 * expansion site).
 *   op_put / op_avg       : bias 16, i.e. round to nearest.
 *   *_no_rnd variants     : bias 15, so a sum of exactly one half rounds down.
 *   op_avg / op_avg_no_rnd: average the looked-up value with the current `a`;
 *                           op_avg adds 1 before halving, op_avg_no_rnd does not.
 * The arguments `a` is assigned to directly, so it must be a plain lvalue.
 */
2047#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2048#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2049#define op_put(a, b) a = cm[((b) + 16)>>5]
2050#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2051
/*
 * Instantiate the QPEL_MC function family three times: put_, put_no_rnd_ and
 * avg_. In the macro body the name prefix is pasted in as OPNAME and the
 * infix of the `put ## RND ## ...` helper names as RND; these presumably
 * correspond to the second and third arguments here (the macro's parameter
 * list is not visible from this part of the file - confirm).
 */
2052QPEL_MC(0, put_ , _ , op_put)
2053QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2054QPEL_MC(0, avg_ , _ , op_avg)
/*
 * NOTE(review): the avg_no_rnd instantiation below is disabled. As written it
 * would pass op_avg rather than op_avg_no_rnd, so op_avg_no_rnd is not used
 * by any of the instantiations above.
 */
2055//QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* Drop the operator macros again so the names do not stay defined past this point. */
2056#undef op_avg
2057#undef op_avg_no_rnd
2058#undef op_put
2059#undef op_put_no_rnd
44eb4951 2060
0da71265
MN
2061#if 1
/**
 * H264_LOWPASS(OPNAME, OP, OP2)
 *
 * Generates the static 6-tap low-pass filter functions
 *     OPNAME h264_qpel{2,4,8,16}_{h,v,hv}_lowpass
 * where OPNAME is pasted in front of each function name.
 *
 * Every output sample is the sum
 *     20*(s[0]+s[1]) - 5*(s[-1]+s[2]) + (s[-2]+s[3])
 * taken along the row (h), along the column (v), or first along the rows into
 * the int16_t scratch buffer `tmp` and then along the columns of `tmp` (hv).
 * The sums are not scaled in this macro. They are handed, together with the
 * destination lvalue, to OP (h and v functions) or OP2 (hv functions), so
 * scaling, clipping and the actual store are whatever those caller-supplied
 * operators do.
 *
 * Each 2/4/8 function defines a local `cm = cropTbl + MAX_NEG_CROP` that is
 * not referenced directly here; it is presumably there for OP/OP2 to use.
 *
 * Memory touched for an N-sized block (N = 2, 4, 8):
 *   h : N rows, reading src[-2] .. src[N+2] on each row.
 *   v : N columns, reading rows -2 .. N+2 of each column; all samples of a
 *       column are loaded before any dst sample of that column is written.
 *   hv: reads rows -2 .. N+2 and columns -2 .. N+2 of src and writes N+5 rows
 *       of N values (row pitch tmpStride) into tmp.
 *
 * The 16-sized functions are composed of four 8-sized calls (left and right
 * half of the top 8 rows, then the same for the bottom 8 rows). The 16-sized
 * hv function passes the same tmp and tmp+8 to the top and the bottom calls,
 * so the scratch area is reused rather than advanced.
 */
2062#define H264_LOWPASS(OPNAME, OP, OP2) \
80e44bc3
MN
2063static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2064 const int h=2;\
2065 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2066 int i;\
2067 for(i=0; i<h; i++)\
2068 {\
2069 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2070 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2071 dst+=dstStride;\
2072 src+=srcStride;\
2073 }\
2074}\
2075\
/* vertical filter, one column per iteration, 2 output rows */\
2076static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2077 const int w=2;\
2078 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2079 int i;\
2080 for(i=0; i<w; i++)\
2081 {\
2082 const int srcB= src[-2*srcStride];\
2083 const int srcA= src[-1*srcStride];\
2084 const int src0= src[0 *srcStride];\
2085 const int src1= src[1 *srcStride];\
2086 const int src2= src[2 *srcStride];\
2087 const int src3= src[3 *srcStride];\
2088 const int src4= src[4 *srcStride];\
2089 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2090 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2091 dst++;\
2092 src++;\
2093 }\
2094}\
2095\
/* two-pass filter: rows of src -> tmp (unscaled), then columns of tmp -> dst via OP2 */\
2096static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2097 const int h=2;\
2098 const int w=2;\
2099 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2100 int i;\
/* start two rows above the block: the column pass below needs rows -2 .. h+2 */\
2101 src -= 2*srcStride;\
2102 for(i=0; i<h+5; i++)\
2103 {\
2104 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2105 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2106 tmp+=tmpStride;\
2107 src+=srcStride;\
2108 }\
/* rewind tmp to its third filtered row, so tmp[-2*tmpStride] is the first one */\
2109 tmp -= tmpStride*(h+5-2);\
2110 for(i=0; i<w; i++)\
2111 {\
2112 const int tmpB= tmp[-2*tmpStride];\
2113 const int tmpA= tmp[-1*tmpStride];\
2114 const int tmp0= tmp[0 *tmpStride];\
2115 const int tmp1= tmp[1 *tmpStride];\
2116 const int tmp2= tmp[2 *tmpStride];\
2117 const int tmp3= tmp[3 *tmpStride];\
2118 const int tmp4= tmp[4 *tmpStride];\
2119 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2120 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2121 dst++;\
2122 tmp++;\
2123 }\
2124}\
0da71265
MN
2125static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2126 const int h=4;\
2127 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2128 int i;\
2129 for(i=0; i<h; i++)\
2130 {\
2131 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2132 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2133 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2134 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2135 dst+=dstStride;\
2136 src+=srcStride;\
2137 }\
2138}\
2139\
/* vertical filter, one column per iteration, 4 output rows */\
2140static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2141 const int w=4;\
2142 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2143 int i;\
2144 for(i=0; i<w; i++)\
2145 {\
2146 const int srcB= src[-2*srcStride];\
2147 const int srcA= src[-1*srcStride];\
2148 const int src0= src[0 *srcStride];\
2149 const int src1= src[1 *srcStride];\
2150 const int src2= src[2 *srcStride];\
2151 const int src3= src[3 *srcStride];\
2152 const int src4= src[4 *srcStride];\
2153 const int src5= src[5 *srcStride];\
2154 const int src6= src[6 *srcStride];\
2155 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2156 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2157 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2158 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2159 dst++;\
2160 src++;\
2161 }\
2162}\
2163\
/* two-pass filter: rows of src -> tmp (unscaled), then columns of tmp -> dst via OP2 */\
2164static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2165 const int h=4;\
2166 const int w=4;\
2167 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2168 int i;\
/* start two rows above the block: the column pass below needs rows -2 .. h+2 */\
2169 src -= 2*srcStride;\
2170 for(i=0; i<h+5; i++)\
2171 {\
2172 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2173 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2174 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2175 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2176 tmp+=tmpStride;\
2177 src+=srcStride;\
2178 }\
/* rewind tmp to its third filtered row, so tmp[-2*tmpStride] is the first one */\
2179 tmp -= tmpStride*(h+5-2);\
2180 for(i=0; i<w; i++)\
2181 {\
2182 const int tmpB= tmp[-2*tmpStride];\
2183 const int tmpA= tmp[-1*tmpStride];\
2184 const int tmp0= tmp[0 *tmpStride];\
2185 const int tmp1= tmp[1 *tmpStride];\
2186 const int tmp2= tmp[2 *tmpStride];\
2187 const int tmp3= tmp[3 *tmpStride];\
2188 const int tmp4= tmp[4 *tmpStride];\
2189 const int tmp5= tmp[5 *tmpStride];\
2190 const int tmp6= tmp[6 *tmpStride];\
2191 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2192 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2193 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2194 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2195 dst++;\
2196 tmp++;\
2197 }\
2198}\
2199\
/* horizontal filter, one row per iteration, 8 output columns */\
2200static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2201 const int h=8;\
2202 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2203 int i;\
2204 for(i=0; i<h; i++)\
2205 {\
2206 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2207 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2208 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2209 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2210 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2211 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2212 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2213 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2214 dst+=dstStride;\
2215 src+=srcStride;\
2216 }\
2217}\
2218\
/* vertical filter, one column per iteration, 8 output rows */\
2219static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2220 const int w=8;\
2221 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2222 int i;\
2223 for(i=0; i<w; i++)\
2224 {\
2225 const int srcB= src[-2*srcStride];\
2226 const int srcA= src[-1*srcStride];\
2227 const int src0= src[0 *srcStride];\
2228 const int src1= src[1 *srcStride];\
2229 const int src2= src[2 *srcStride];\
2230 const int src3= src[3 *srcStride];\
2231 const int src4= src[4 *srcStride];\
2232 const int src5= src[5 *srcStride];\
2233 const int src6= src[6 *srcStride];\
2234 const int src7= src[7 *srcStride];\
2235 const int src8= src[8 *srcStride];\
2236 const int src9= src[9 *srcStride];\
2237 const int src10=src[10*srcStride];\
2238 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2239 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2240 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2241 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2242 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2243 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2244 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2245 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2246 dst++;\
2247 src++;\
2248 }\
2249}\
2250\
/* two-pass filter: rows of src -> tmp (unscaled), then columns of tmp -> dst via OP2 */\
2251static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2252 const int h=8;\
2253 const int w=8;\
2254 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2255 int i;\
/* start two rows above the block: the column pass below needs rows -2 .. h+2 */\
2256 src -= 2*srcStride;\
2257 for(i=0; i<h+5; i++)\
2258 {\
2259 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2260 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2261 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2262 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2263 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2264 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2265 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2266 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2267 tmp+=tmpStride;\
2268 src+=srcStride;\
2269 }\
/* rewind tmp to its third filtered row, so tmp[-2*tmpStride] is the first one */\
2270 tmp -= tmpStride*(h+5-2);\
2271 for(i=0; i<w; i++)\
2272 {\
2273 const int tmpB= tmp[-2*tmpStride];\
2274 const int tmpA= tmp[-1*tmpStride];\
2275 const int tmp0= tmp[0 *tmpStride];\
2276 const int tmp1= tmp[1 *tmpStride];\
2277 const int tmp2= tmp[2 *tmpStride];\
2278 const int tmp3= tmp[3 *tmpStride];\
2279 const int tmp4= tmp[4 *tmpStride];\
2280 const int tmp5= tmp[5 *tmpStride];\
2281 const int tmp6= tmp[6 *tmpStride];\
2282 const int tmp7= tmp[7 *tmpStride];\
2283 const int tmp8= tmp[8 *tmpStride];\
2284 const int tmp9= tmp[9 *tmpStride];\
2285 const int tmp10=tmp[10*tmpStride];\
2286 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2287 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2288 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2289 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2290 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2291 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2292 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2293 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2294 dst++;\
2295 tmp++;\
2296 }\
2297}\
2298\
/* 16x16 variants: top-left and top-right 8x8 quadrants, then step down 8 rows and repeat */\
2299static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2300 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2301 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2302 src += 8*srcStride;\
2303 dst += 8*dstStride;\
2304 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2305 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2306}\
2307\
2308static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2309 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2310 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2311 src += 8*srcStride;\
2312 dst += 8*dstStride;\
2313 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2314 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2315}\
2316\
/* tmp is deliberately not advanced for the bottom half: each 8x8 call refills its own scratch rows */\
2317static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2318 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2319 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2320 src += 8*srcStride;\
0da71265
MN
2321 dst += 8*dstStride;\
2322 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2323 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2324}\
2325
2326#define H264_MC(OPNAME, SIZE) \
2327static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2328 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2329}\
2330\
2331static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2332 uint8_t half[SIZE*SIZE];\
2333 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2334 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2335}\
2336\
2337static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2338 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2339}\
2340\
2341static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2342 uint8_t half[SIZE*SIZE];\
2343 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2344 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2345}\
2346\
2347static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2348 uint8_t full[SIZE*(SIZE+5)];\
2349 uint8_t * const full_mid= full + SIZE*2;\
2350 uint8_t half[SIZE*SIZE];\
2351 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2352 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2353 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2354}\
2355\
2356static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2357 uint8_t full[SIZE*(SIZE+5)];\
2358 uint8_t * const full_mid= full + SIZE*2;\
2359 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2360 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2361}\
2362\
2363static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2364 uint8_t full[SIZE*(SIZE+5)];\
2365 uint8_t * const full_mid= full + SIZE*2;\
2366 uint8_t half[SIZE*SIZE];\
2367 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2368 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2369 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2370}\
2371\
2372static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2373 uint8_t full[SIZE*(SIZE+5)];\
2374 uint8_t * const full_mid= full + SIZE*2;\
2375 uint8_t halfH[SIZE*SIZE];\
2376 uint8_t halfV[SIZE*SIZE];\
2377 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2378 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2379 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2380 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2381}\
2382\
2383static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2384 uint8_t full[SIZE*(SIZE+5)];\
2385 uint8_t * const full_mid= full + SIZE*2;\
2386 uint8_t halfH[SIZE*SIZE];\
2387 uint8_t halfV[SIZE*SIZE];\
2388 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2389 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2390 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2391 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2392}\
2393\
2394static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2395 uint8_t full[SIZE*(SIZE+5)];\
2396 uint8_t * const full_mid= full + SIZE*2;\
2397 uint8_t halfH[SIZE*SIZE];\
2398 uint8_t halfV[SIZE*SIZE];\
2399 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2400 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2401 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2402 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2403}\
2404\
2405static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2406 uint8_t full[SIZE*(SIZE+5)];\
2407 uint8_t * const full_mid= full + SIZE*2;\
2408 uint8_t halfH[SIZE*SIZE];\
2409 uint8_t halfV[SIZE*SIZE];\
2410 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2411 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2412 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2413 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2414}\
2415\
2416static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2417 int16_t tmp[SIZE*(SIZE+5)];\
2418 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2419}\
2420\
2421static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2422 int16_t tmp[SIZE*(SIZE+5)];\
2423 uint8_t halfH[SIZE*SIZE];\
2424 uint8_t halfHV[SIZE*SIZE];\
2425 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2426 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2427 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2428}\
2429\
2430static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2431 int16_t tmp[SIZE*(SIZE+5)];\
2432 uint8_t halfH[SIZE*SIZE];\
2433 uint8_t halfHV[SIZE*SIZE];\
2434 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2435 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2436 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2437}\
2438\
2439static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2440 uint8_t full[SIZE*(SIZE+5)];\
2441 uint8_t * const full_mid= full + SIZE*2;\
2442 int16_t tmp[SIZE*(SIZE+5)];\
2443 uint8_t halfV[SIZE*SIZE];\
2444 uint8_t halfHV[SIZE*SIZE];\
2445 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2446 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2447 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2448 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2449}\
2450\
2451static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2452 uint8_t full[SIZE*(SIZE+5)];\
2453 uint8_t * const full_mid= full + SIZE*2;\
2454 int16_t tmp[SIZE*(SIZE+5)];\
2455 uint8_t halfV[SIZE*SIZE];\
2456 uint8_t halfHV[SIZE*SIZE];\
2457 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2458 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2459 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2460 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2461}\
2462
2463#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2464//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2465#define op_put(a, b) a = cm[((b) + 16)>>5]
2466#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2467#define op2_put(a, b) a = cm[((b) + 512)>>10]
2468
2469H264_LOWPASS(put_ , op_put, op2_put)
2470H264_LOWPASS(avg_ , op_avg, op2_avg)
80e44bc3 2471H264_MC(put_, 2)
0da71265
MN
2472H264_MC(put_, 4)
2473H264_MC(put_, 8)
2474H264_MC(put_, 16)
2475H264_MC(avg_, 4)
2476H264_MC(avg_, 8)
2477H264_MC(avg_, 16)
2478
2479#undef op_avg
2480#undef op_put
2481#undef op2_avg
2482#undef op2_put
2483#endif
2484
91c56db6
MN
/*
 * H.264 weighted prediction templates.
 *
 * H264_WEIGHT(W,H) defines two functions for a W x H pixel block:
 *   weight_h264_pixelsWxH_c()   scales `block` in place:
 *       block[x] = clip_uint8((block[x]*weight + offset') >> log2_denom)
 *       where offset' is offset scaled by 2^log2_denom plus the rounding term
 *       2^(log2_denom-1) (no rounding term when log2_denom is 0).
 *   biweight_h264_pixelsWxH_c() blends `src` into `dst`:
 *       dst[x] = clip_uint8((src[x]*weights + dst[x]*weightd + offset')
 *                           >> (log2_denom+1))
 *       where offset' = ((offset + 1) | 1) scaled by 2^log2_denom.
 *
 * The offset is scaled by multiplication rather than by `<<`, because left
 * shifting a negative int is undefined behaviour in C.
 */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int x, y; \
    offset *= 1 << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        for(x=0; x<W; x++) \
            op_scale1(x); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int x, y; \
    offset = ((offset + 1) | 1) * (1 << log2_denom); \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        for(x=0; x<W; x++) \
            op_scale2(x); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2554
1457ab52
MN
2555static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2556 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2557 int i;
2558
2559 for(i=0; i<h; i++){
2560 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2561 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2562 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2563 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2564 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2565 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2566 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2567 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2568 dst+=dstStride;
115329f1 2569 src+=srcStride;
1457ab52
MN
2570 }
2571}
2572
b482e2d1
MN
2573/* AVS specific */
2574void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2575
/** CAVS 8x8 full-pel position: delegates to put_pixels8_c() with a height of 8. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
/** CAVS 8x8 full-pel position: delegates to avg_pixels8_c() with a height of 8. */
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
/** CAVS 16x16 full-pel position: delegates to put_pixels16_c() with a height of 16. */
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
/** CAVS 16x16 full-pel position: delegates to avg_pixels16_c() with a height of 16. */
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
2588
1457ab52
MN
2589static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2590 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2591 int i;
2592
2593 for(i=0; i<w; i++){
2594 const int src_1= src[ -srcStride];
2595 const int src0 = src[0 ];
2596 const int src1 = src[ srcStride];
2597 const int src2 = src[2*srcStride];
2598 const int src3 = src[3*srcStride];
2599 const int src4 = src[4*srcStride];
2600 const int src5 = src[5*srcStride];
2601 const int src6 = src[6*srcStride];
2602 const int src7 = src[7*srcStride];
2603 const int src8 = src[8*srcStride];
2604 const int src9 = src[9*srcStride];
2605 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2606 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2607 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2608 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2609 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2610 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2611 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2612 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2613 src++;
2614 dst++;
2615 }
2616}
2617
/** mspel position (0,0): no interpolation, delegates to put_pixels8_c() for 8 rows. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2621
/**
 * mspel position (1,0): combines the source block with its horizontally
 * filtered version through put_pixels8_l2().
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64]; /* 8x8 horizontally filtered block, stride 8 */

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2627
/** mspel position (2,0): horizontally filtered block written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2631
/**
 * mspel position (3,0): combines the source block shifted one pixel to the
 * right with the horizontally filtered block through put_pixels8_l2().
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64]; /* 8x8 horizontally filtered block, stride 8 */

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2637
/** mspel position (0,2): vertically filtered block written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2641
/**
 * mspel position (1,2): combines the vertically filtered block with the
 * horizontally-then-vertically filtered block through put_pixels8_l2().
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    /* 11 rows of 8 bytes: the vertical filter needs one row above and two
     * rows below the 8 output rows */
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    /* halfH+8 skips the extra top row so it becomes row -1 of the filter */
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/**
 * mspel position (3,2): like position (1,2), but the vertically filtered
 * block is taken one pixel to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    /* 11 rows of 8 bytes: the vertical filter needs one row above and two
     * rows below the 8 output rows */
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    /* halfH+8 skips the extra top row so it becomes row -1 of the filter */
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/**
 * mspel position (2,2): horizontal filter into a temporary, then vertical
 * filter of that temporary straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88]; /* 11 rows of 8 bytes, starting one row above src */

    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2665
332f9ac4
MN
2666static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2667 int x;
2668 const int strength= ff_h263_loop_filter_strength[qscale];
115329f1 2669
332f9ac4
MN
2670 for(x=0; x<8; x++){
2671 int d1, d2, ad1;
2672 int p0= src[x-2*stride];
2673 int p1= src[x-1*stride];
2674 int p2= src[x+0*stride];
2675 int p3= src[x+1*stride];
2676 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2677
2678 if (d<-2*strength) d1= 0;
2679 else if(d<- strength) d1=-2*strength - d;
2680 else if(d< strength) d1= d;
2681 else if(d< 2*strength) d1= 2*strength - d;
2682 else d1= 0;
115329f1 2683
332f9ac4
MN
2684 p1 += d1;
2685 p2 -= d1;
2686 if(p1&256) p1= ~(p1>>31);
2687 if(p2&256) p2= ~(p2>>31);
115329f1 2688
332f9ac4
MN
2689 src[x-1*stride] = p1;
2690 src[x+0*stride] = p2;
2691
5b5404e3 2692 ad1= ABS(d1)>>1;
115329f1 2693
332f9ac4 2694 d2= clip((p0-p3)/4, -ad1, ad1);
115329f1 2695
332f9ac4
MN
2696 src[x-2*stride] = p0 - d2;
2697 src[x+ stride] = p3 + d2;
2698 }
2699}
2700
2701static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2702 int y;
2703 const int strength= ff_h263_loop_filter_strength[qscale];
115329f1 2704
332f9ac4
MN
2705 for(y=0; y<8; y++){
2706 int d1, d2, ad1;
2707 int p0= src[y*stride-2];
2708 int p1= src[y*stride-1];
2709 int p2= src[y*stride+0];
2710 int p3= src[y*stride+1];
2711 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2712
2713 if (d<-2*strength) d1= 0;
2714 else if(d<- strength) d1=-2*strength - d;
2715 else if(d< strength) d1= d;
2716 else if(d< 2*strength) d1= 2*strength - d;
2717 else d1= 0;
115329f1 2718
332f9ac4
MN
2719 p1 += d1;
2720 p2 -= d1;
2721 if(p1&256) p1= ~(p1>>31);
2722 if(p2&256) p2= ~(p2>>31);
115329f1 2723
332f9ac4
MN
2724 src[y*stride-1] = p1;
2725 src[y*stride+0] = p2;
2726
2727 ad1= ABS(d1)>>1;
115329f1 2728
332f9ac4 2729 d2= clip((p0-p3)/4, -ad1, ad1);
115329f1 2730
332f9ac4
MN
2731 src[y*stride-2] = p0 - d2;
2732 src[y*stride+1] = p3 + d2;
2733 }
2734}
1457ab52 2735
fdbbf2e0
MN
/**
 * H.261 loop filter: separable (1, 2, 1)/4 smoothing of an 8x8 block in place.
 * The top and bottom rows are not filtered vertically and the left and right
 * columns are not filtered horizontally.
 * @param src    top-left pixel of the 8x8 block
 * @param stride distance in bytes between rows
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x, y;
    int temp[64]; /* vertically filtered samples, scaled by 4 */

    /* rows 0 and 7 bypass the vertical pass; scale them to match the rest */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        const uint8_t *row = src + y*stride;

        for(x=0; x<8; x++)
            temp[y*8 + x] = row[x - stride] + 2*row[x] + row[x + stride];
    }

    for(y=0; y<8; y++){
        uint8_t *row = src + y*stride;
        const int *t = temp + y*8;

        /* edge columns: only undo the vertical scale */
        row[0] = (t[0] + 2)>>2;
        row[7] = (t[7] + 2)>>2;
        for(x=1; x<7; x++)
            row[x] = (t[x-1] + 2*t[x] + t[x+1] + 8)>>4;
    }
}
2762
/**
 * H.264 luma deblocking of one 16-sample edge, processed as 4 groups of
 * 4 lines with one tc0 entry per group.
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge (p samples are at negative multiples)
 * @param ystride step along the edge, from one filtered line to the next
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 * @param tc0     4 clipping values; a negative entry skips its group
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* each side that is smooth enough gets p1/q1 adjusted and
                 * widens the clipping range of the p0/q0 delta by one */
                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( ABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/** Luma deblocking with xstride = stride and ystride = 1 (samples across the edge are `stride` bytes apart). */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/** Luma deblocking with xstride = 1 and ystride = stride (samples across the edge are adjacent bytes). */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2811
/**
 * H.264 chroma deblocking of one 8-sample edge, processed as 4 groups of
 * 2 lines with one tc0 entry per group. Only p0 and q0 are modified.
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge
 * @param ystride step along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 * @param tc0     4 clipping values; an entry <= 0 skips its group
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/** Chroma deblocking with xstride = stride and ystride = 1. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/** Chroma deblocking with xstride = 1 and ystride = stride. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2848
5cf08f23
LM
/**
 * H.264 chroma deblocking of one 8-sample edge without a tc0 table:
 * where the thresholds pass, p0 and q0 are replaced by fixed weighted
 * averages instead of a clipped delta.
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge
 * @param ystride step along the edge
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( ABS( p0 - q0 ) < alpha &&
            ABS( p1 - p0 ) < beta &&
            ABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/** Intra chroma deblocking with xstride = stride and ystride = 1. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/** Intra chroma deblocking with xstride = 1 and ystride = stride. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
2876
/**
 * Sum of absolute differences between two 16 pixel wide blocks.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows of both blocks
 * @param h         number of rows
 * @return the accumulated sum of |pix1 - pix2|
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    int x, y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++)
            s += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
2904
/**
 * Sum of absolute differences between a 16 pixel wide block and the
 * avg2() of horizontally adjacent pixels of the reference.
 * Reads 17 pixels per row of pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference
 * @param line_size distance in bytes between rows of both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    int x, y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++)
            s += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
2932
/**
 * Sum of absolute differences between a 16 pixel wide block and the
 * avg2() of vertically adjacent pixels of the reference.
 * Reads h+1 rows of pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference
 * @param line_size distance in bytes between rows of both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    int x, y;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    for(y=0; y<h; y++){
        for(x=0; x<16; x++)
            s += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2962
/**
 * Sum of absolute differences between a 16 pixel wide block and the
 * avg4() of each 2x2 neighbourhood of the reference.
 * Reads 17 pixels per row and h+1 rows of pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference
 * @param line_size distance in bytes between rows of both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    int x, y;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    for(y=0; y<h; y++){
        for(x=0; x<16; x++)
            s += abs(pix1[x] - avg4(pix2[x], pix2[x+1], pix3[x], pix3[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2992
/**
 * Sum of absolute differences between two 8 pixel wide blocks.
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows of both blocks
 * @param h         number of rows
 * @return the accumulated sum of |pix1 - pix2|
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    int x, y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++)
            s += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
3012
/**
 * Sum of absolute differences between an 8 pixel wide block and the
 * avg2() of horizontally adjacent pixels of the reference.
 * Reads 9 pixels per row of pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference
 * @param line_size distance in bytes between rows of both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    int x, y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++)
            s += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
3032
/**
 * Sum of absolute differences between an 8 pixel wide block and the
 * avg2() of vertically adjacent pixels of the reference.
 * Reads h+1 rows of pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference
 * @param line_size distance in bytes between rows of both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    int x, y;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    for(y=0; y<h; y++){
        for(x=0; x<8; x++)
            s += abs(pix1[x] - avg2(pix2[x], pix3[x]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
3054
/**
 * Sum of absolute differences between an 8 pixel wide block and the
 * avg4() of each 2x2 neighbourhood of the reference.
 * Reads 9 pixels per row and h+1 rows of pix2.
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference
 * @param line_size distance in bytes between rows of both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0;
    int x, y;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    for(y=0; y<h; y++){
        for(x=0; x<8; x++)
            s += abs(pix1[x] - avg4(pix2[x], pix2[x+1], pix3[x], pix3[x+1]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
3076
bf4e3bd2
MR
3077static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3078 MpegEncContext *c = v;
e6a2ac34
MN
3079 int score1=0;
3080 int score2=0;
3081 int x,y;
d4c5d2ad 3082
e6a2ac34
MN
3083 for(y=0; y<h; y++){
3084 for(x=0; x<16; x++){
3085 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3086 }
3087 if(y+1<h){
3088 for(x=0; x<15; x++){
3089 score2+= ABS( s1[x ] - s1[x +stride]
3090 - s1[x+1] + s1[x+1+stride])
3091 -ABS( s2[x ] - s2[x +stride]
3092 - s2[x+1] + s2[x+1+stride]);
3093 }
3094 }
3095 s1+= stride;
3096 s2+= stride;
3097 }
d4c5d2ad
MN
3098
3099 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3100 else return score1 + ABS(score2)*8;
e6a2ac34
MN
3101}
3102
bf4e3bd2
MR
3103static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3104 MpegEncContext *c = v;
e6a2ac34
MN
3105 int score1=0;
3106 int score2=0;
3107 int x,y;
115329f1 3108
e6a2ac34
MN
3109 for(y=0; y<h; y++){
3110 for(x=0; x<8; x++){
3111 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3112 }
3113 if(y+1<h){
3114 for(x=0; x<7; x++){
3115 score2+= ABS( s1[x ] - s1[x +stride]
3116 - s1[x+1] + s1[x+1+stride])
3117 -ABS( s2[x ] - s2[x +stride]
3118 - s2[x+1] + s2[x+1+stride]);
3119 }
3120 }
3121 s1+= stride;
3122 s2+= stride;
3123 }
115329f1 3124
d4c5d2ad
MN
3125 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3126 else return score1 + ABS(score2)*8;
e6a2ac34
MN
3127}
3128
364a1797
MN
3129static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3130 int i;
3131 unsigned int sum=0;
3132
3133 for(i=0; i<8*8; i++){
3134 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3135 int w= weight[i];
3136 b>>= RECON_SHIFT;
3137 assert(-512<b && b<512);
3138
3139 sum += (w*b)*(w*b)>>4;
3140 }
3141 return sum>>2;
3142}
3143
3144static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3145 int i;
3146
3147 for(i=0; i<8*8; i++){
3148 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
115329f1 3149 }
364a1797
MN
3150}
3151
a9badb51
MN
3152/**
3153 * permutes an 8x8 block.
2a5700de 3154 * @param block the block which will be permuted according to the given permutation vector
a9badb51
MN
3155 * @param permutation the permutation vector
3156 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
115329f1 3157 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2a5700de 3158 * (inverse) permutated to scantable order!
a9badb51 3159 */
0c1a9eda 3160void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
d962f6fd 3161{
7801d21d 3162 int i;
477ab036 3163 DCTELEM temp[64];
115329f1 3164
7801d21d 3165 if(last<=0) return;
9a7b310d 3166 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
d962f6fd 3167
7801d21d
MN
3168 for(i=0; i<=last; i++){
3169 const int j= scantable[i];
3170 temp[j]= block[j];
3171 block[j]=0;
3172 }
115329f1 3173
7801d21d
MN
3174 for(i=0; i<=last; i++){
3175 const int j= scantable[i];
3176 const int perm_j= permutation[j];
3177 block[perm_j]= temp[j];
3178 }
d962f6fd 3179}
e0eac44e 3180
622348f9
MN
/** Comparison function that ignores all its arguments and always returns 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3184
3185void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3186 int i;
115329f1 3187
622348f9 3188 memset(cmp, 0, sizeof(void*)*5);
115329f1 3189
622348f9
MN
3190 for(i=0; i<5; i++){
3191 switch(type&0xFF){
3192 case FF_CMP_SAD:
3193 cmp[i]= c->sad[i];
3194 break;
3195 case FF_CMP_SATD:
3196 cmp[i]= c->hadamard8_diff[i];
3197 break;
3198 case FF_CMP_SSE:
3199 cmp[i]= c->sse[i];
3200 break;
3201 case FF_CMP_DCT:
3202 cmp[i]= c->dct_sad[i];
3203 break;
27c61ac5
MN
3204 case FF_CMP_DCT264:
3205 cmp[i]= c->dct264_sad[i];
3206 break;
0fd6aea1
MN
3207 case FF_CMP_DCTMAX:
3208 cmp[i]= c->dct_max[i];
3209 break;
622348f9
MN
3210 case FF_CMP_PSNR:
3211 cmp[i]= c->quant_psnr[i];
3212 break;
3213 case FF_CMP_BIT:
3214 cmp[i]= c->bit[i];
3215 break;
3216 case FF_CMP_RD:
3217 cmp[i]= c->rd[i];
3218 break;
3219 case FF_CMP_VSAD:
3220 cmp[i]= c->vsad[i];
3221 break;
3222 case FF_CMP_VSSE:
3223 cmp[i]= c->vsse[i];
3224 break;
3225 case FF_CMP_ZERO:
3226 cmp[i]= zero_cmp;
3227 break;
e6a2ac34
MN
3228 case FF_CMP_NSSE:
3229 cmp[i]= c->nsse[i];
3230 break;
3a6fc8fa 3231#ifdef CONFIG_SNOW_ENCODER
26efc54e
MN
3232 case FF_CMP_W53:
3233 cmp[i]= c->w53[i];
3234 break;
3235 case FF_CMP_W97:
3236 cmp[i]= c->w97[i];
3237 break;
3a6fc8fa 3238#endif
622348f9
MN
3239 default:
3240 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3241 }
3242 }
3243}
3244
2a5700de
MN
3245/**
3246 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3247 */
eb4b3dd3 3248static void clear_blocks_c(DCTELEM *blocks)
649c00c9
MN
3249{
3250 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3251}
3252
11f18faf
MN
/**
 * Adds src to dst bytewise; each sum wraps modulo 256.
 * @param dst destination, updated in place
 * @param src bytes to add
 * @param w   number of bytes to process
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i] += src[i];
}
3268
/**
 * Stores the bytewise difference src1 - src2 in dst; each difference wraps
 * modulo 256.
 * @param dst  destination
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes to process
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i] = src1[i]-src2[i];
}
3284
84705403
MN
/**
 * Median-prediction residual: for each i the predictor is
 * mid_pred(left, src1[i], (left + src1[i] - left_top) & 0xFF), and
 * dst[i] receives src2[i] minus that predictor.
 * NOTE(review): src1 looks like the previous line and src2 like the current
 * line - confirm against the caller.
 * @param dst      receives the w residual bytes
 * @param src1     supplies the "top" sample and the next left_top
 * @param src2     samples being predicted; each becomes the next left
 * @param w        number of samples
 * @param left     in: initial left sample, out: last src2 sample processed
 * @param left_top in: initial top-left sample, out: last src1 sample processed
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
3302
1457ab52
MN
/*
 * Butterfly helpers for the Hadamard transforms.
 *
 * BUTTERFLY2(o1,o2,i1,i2) stores i1+i2 in o1 and i1-i2 in o2. It expands to
 *     two separate statements, so it must not be used as the unbraced body
 *     of an if/for/while.
 * BUTTERFLY1(x,y) replaces x by x+y and y by x-y in place. It declares
 *     locals named a and b, so the arguments must not use those names.
 * BUTTERFLYA(x,y) evaluates to |x+y| + |x-y| without storing anything.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3317
/**
 * Sum of absolute values of the 8x8 Hadamard transform of (src - dst).
 * The transform is done with three butterfly stages along the rows, then
 * three along the columns; the last column stage is folded into the
 * absolute-value sum via BUTTERFLYA.
 * @param s      unused
 * @param dst    first block
 * @param src    second block
 * @param stride distance in bytes between rows of both blocks
 * @param h      must be 8
 * @return the sum of absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* row transform */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* column transform and accumulation */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
3369
/**
 * Sum of the absolute values of the 8x8 butterfly (Hadamard) transform of
 * the pixels of src themselves, with the DC term removed.
 *
 * @param s      unused
 * @param src    8x8 block, rows are stride bytes apart
 * @param dummy  unused
 * @param stride distance in bytes between two rows
 * @param h      block height, must be 8
 * @return the accumulated absolute sum minus the absolute DC coefficient
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int coef[64];
    int score= 0;
    int row, col;

    assert(h==8);

    /* horizontal pass: three butterfly stages over each pixel row */
    for(row=0; row<8; row++){
        int * const t= coef + 8*row;
        const uint8_t * const px= src + stride*row;

        BUTTERFLY2(t[0], t[1], px[0], px[1]);
        BUTTERFLY2(t[2], t[3], px[2], px[3]);
        BUTTERFLY2(t[4], t[5], px[4], px[5]);
        BUTTERFLY2(t[6], t[7], px[6], px[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass: two explicit stages, the third one is folded into
     * BUTTERFLYA together with the absolute-value accumulation */
    for(col=0; col<8; col++){
        int * const t= coef + col;

        BUTTERFLY1(t[8*0], t[8*1]);
        BUTTERFLY1(t[8*2], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*5]);
        BUTTERFLY1(t[8*6], t[8*7]);

        BUTTERFLY1(t[8*0], t[8*2]);
        BUTTERFLY1(t[8*1], t[8*3]);
        BUTTERFLY1(t[8*4], t[8*6]);
        BUTTERFLY1(t[8*5], t[8*7]);

        score += BUTTERFLYA(t[8*0], t[8*4]);
        score += BUTTERFLYA(t[8*1], t[8*5]);
        score += BUTTERFLYA(t[8*2], t[8*6]);
        score += BUTTERFLYA(t[8*3], t[8*7]);
    }

    /* column 0, rows 0 and 4 still hold the inputs of the fused last stage;
     * their sum is the DC coefficient */
    score -= ABS(coef[8*0] + coef[8*4]); // -mean

    return score;
}
3417
bb198e19 3418static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 3419 MpegEncContext * const s= (MpegEncContext *)c;
68b51e58 3420 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
76fbb024 3421 DCTELEM * const temp= (DCTELEM*)aligned_temp;
1457ab52 3422 int sum=0, i;
115329f1 3423
bb198e19 3424 assert(h==8);
1457ab52
MN
3425
3426 s->dsp.diff_pixels(temp, src1, src2, stride);
b0368839 3427 s->dsp.fdct(temp);
1457ab52
MN
3428
3429 for(i=0; i<64; i++)
3430 sum+= ABS(temp[i]);
115329f1 3431
1457ab52
MN
3432 return sum;
3433}
3434
27c61ac5
MN
3435#ifdef CONFIG_GPL
/**
 * One 8-point pass of the integer transform used by dct264_sad8x8_c.
 *
 * The user must define SRC(x) to read input x (0..7) and DST(x,v) to
 * consume output x with value v before expanding this macro.
 * All eight inputs are read before the first DST() is issued, so DST may
 * store over the locations SRC reads from.
 */
#define DCT8_1D {\
    const int p0 = SRC(0), p1 = SRC(1), p2 = SRC(2), p3 = SRC(3);\
    const int p4 = SRC(4), p5 = SRC(5), p6 = SRC(6), p7 = SRC(7);\
    const int sum07 = p0 + p7, dif07 = p0 - p7;\
    const int sum16 = p1 + p6, dif16 = p1 - p6;\
    const int sum25 = p2 + p5, dif25 = p2 - p5;\
    const int sum34 = p3 + p4, dif34 = p3 - p4;\
    const int even0 = sum07 + sum34;\
    const int even1 = sum16 + sum25;\
    const int even2 = sum07 - sum34;\
    const int even3 = sum16 - sum25;\
    const int odd4 = dif16 + dif25 + (dif07 + (dif07>>1));\
    const int odd5 = dif07 - dif34 - (dif25 + (dif25>>1));\
    const int odd6 = dif07 + dif34 - (dif16 + (dif16>>1));\
    const int odd7 = dif16 - dif25 + (dif34 + (dif34>>1));\
    DST(0,  even0 + even1     ) ;\
    DST(1,  odd4 + (odd7>>2)  ) ;\
    DST(2,  even2 + (even3>>1)) ;\
    DST(3,  odd5 + (odd6>>2)  ) ;\
    DST(4,  even0 - even1     ) ;\
    DST(5,  odd6 - (odd5>>2)  ) ;\
    DST(6, (even2>>1) - even3 ) ;\
    DST(7, (odd4>>2) - odd7   ) ;\
}
3462
3463static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3464 MpegEncContext * const s= (MpegEncContext *)c;
3465 int16_t dct[8][8];
3466 int i;
3467 int sum=0;
3468
3469 s->dsp.diff_pixels(dct, src1, src2, stride);
3470
3471#define SRC(x) dct[i][x]
3472#define DST(x,v) dct[i][x]= v
3473 for( i = 0; i < 8; i++ )
3474 DCT8_1D
3475#undef SRC
3476#undef DST
3477
3478#define SRC(x) dct[x][i]
3479#define DST(x,v) sum += ABS(v)
3480 for( i = 0; i < 8; i++ )
3481 DCT8_1D
3482#undef SRC
3483#undef DST
3484 return sum;
3485}
3486#endif
3487
0fd6aea1
MN
3488static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3489 MpegEncContext * const s= (MpegEncContext *)c;
68b51e58 3490 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
0fd6aea1
MN
3491 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3492 int sum=0, i;
115329f1 3493
0fd6aea1
MN
3494 assert(h==8);
3495
3496 s->dsp.diff_pixels(temp, src1, src2, stride);
3497 s->dsp.fdct(temp);
3498
3499 for(i=0; i<64; i++)
3500 sum= FFMAX(sum, ABS(temp[i]));
115329f1 3501
0fd6aea1
MN
3502 return sum;
3503}
3504
0e15384d 3505void simple_idct(DCTELEM *block); //FIXME
1457ab52 3506
bb198e19 3507static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 3508 MpegEncContext * const s= (MpegEncContext *)c;
68b51e58 3509 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
76fbb024
MN
3510 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3511 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
1457ab52
MN
3512 int sum=0, i;
3513
bb198e19 3514 assert(h==8);
1457ab52 3515 s->mb_intra=0;
115329f1 3516
1457ab52 3517 s->dsp.diff_pixels(temp, src1, src2, stride);
115329f1 3518
1457ab52 3519 memcpy(bak, temp, 64*sizeof(DCTELEM));
115329f1 3520
67725183 3521 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
d50635cd 3522 s->dct_unquantize_inter(s, temp, 0, s->qscale);
115329f1
DB
3523 simple_idct(temp); //FIXME
3524
1457ab52
MN
3525 for(i=0; i<64; i++)
3526 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
115329f1 3527
1457ab52
MN
3528 return sum;
3529}
3530
bb198e19 3531static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 3532 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 3533 const uint8_t *scantable= s->intra_scantable.permutated;
68b51e58
SH
3534 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3535 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
76fbb024
MN
3536 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3537 uint8_t * const bak= (uint8_t*)aligned_bak;
3a87ac94
MN
3538 int i, last, run, bits, level, distoration, start_i;
3539 const int esc_length= s->ac_esc_length;
3540 uint8_t * length;
3541 uint8_t * last_length;
115329f1 3542
bb198e19
MN
3543 assert(h==8);
3544
67725183
MN
3545 for(i=0; i<8; i++){
3546 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3547 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3548 }
3a87ac94 3549
67725183
MN
3550 s->dsp.diff_pixels(temp, src1, src2, stride);
3551
3552 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3553
3554 bits=0;
115329f1 3555
3a87ac94 3556 if (s->mb_intra) {
115329f1 3557 start_i = 1;
3a87ac94
MN
3558 length = s->intra_ac_vlc_length;
3559 last_length= s->intra_ac_vlc_last_length;
67725183 3560 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
MN
3561 } else {
3562 start_i = 0;
3563 length = s->inter_ac_vlc_length;
3564 last_length= s->inter_ac_vlc_last_length;
3565 }
115329f1 3566
67725183 3567 if(last>=start_i){
3a87ac94
MN
3568 run=0;
3569 for(i=start_i; i<last; i++){
3570 int j= scantable[i];
3571 level= temp[j];
115329f1 3572
3a87ac94
MN
3573 if(level){
3574 level+=64;
3575 if((level&(~127)) == 0){
3576 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3577 }else
3578 bits+= esc_length;
3579 run=0;
3580 }else
3581 run++;
3582 }
3583 i= scantable[last];
115329f1 3584
3a87ac94 3585 level= temp[i] + 64;
1d0eab1d
MN
3586
3587 assert(level - 64);
115329f1 3588
3a87ac94
MN
3589 if((level&(~127)) == 0){
3590 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3591 }else
3592 bits+= esc_length;
115329f1 3593
67725183
MN
3594 }
3595
3596 if(last>=0){
d50635cd
MN
3597 if(s->mb_intra)
3598 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3599 else
3600 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3a87ac94 3601 }
115329f1 3602
b0368839 3603 s->dsp.idct_add(bak, stride, temp);
115329f1 3604
bb198e19 3605 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3a87ac94 3606
67725183 3607 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3a87ac94
MN
3608}
3609
bb198e19 3610static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 3611 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 3612 const uint8_t *scantable= s->intra_scantable.permutated;
68b51e58 3613 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
76fbb024 3614 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3a87ac94
MN
3615 int i, last, run, bits, level, start_i;
3616 const int esc_length= s->ac_esc_length;
3617 uint8_t * length;
3618 uint8_t * last_length;
bb