sh4 optimized idct & bswap patch by (BERO <bero at geocities dot co dot jp>)
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
de6d9b64 4 *
ff4ec49e
FB
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
de6d9b64 9 *
ff4ec49e 10 * This library is distributed in the hope that it will be useful,
de6d9b64 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
de6d9b64 14 *
ff4ec49e
FB
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
7ff037e9 18 *
59fe111e 19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
de6d9b64 20 */
983e3246
MN
21
22/**
23 * @file dsputil.c
24 * DSP utils
25 */
26
de6d9b64
FB
27#include "avcodec.h"
28#include "dsputil.h"
1457ab52 29#include "mpegvideo.h"
b0368839 30#include "simple_idct.h"
45553457 31
5596c60c 32
0c1a9eda
ZK
33uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
34uint32_t squareTbl[512];
de6d9b64 35
/* Zigzag scan order: entry i is the raster index (row*8 + column) of the
 * i-th coefficient of an 8x8 block. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
46
2f349de2 47/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
0c1a9eda 48uint16_t __align8 inv_zigzag_direct16[64];
2f349de2 49
/* Alternate (horizontal) scan order for 8x8 blocks: entry i is the raster
 * index of the i-th scanned coefficient. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
60
/* Alternate (vertical) scan order for 8x8 blocks: entry i is the raster
 * index of the i-th scanned coefficient. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
71
/* Fixed-point reciprocals: with the product evaluated in 64 bits,
 * a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
107
b0368839
MN
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
119
/**
 * Sum all samples of a 16x16 block.
 * @param pix       top-left sample of the block
 * @param line_size distance in bytes between the starts of consecutive rows
 * @return sum of the 256 sample values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int sum, row, col;

    sum = 0;
    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += pix[col];
        pix += line_size;
    }
    return sum;
}
141
0c1a9eda 142static int pix_norm1_c(uint8_t * pix, int line_size)
3aa102be
MN
143{
144 int s, i, j;
0c1a9eda 145 uint32_t *sq = squareTbl + 256;
3aa102be
MN
146
147 s = 0;
148 for (i = 0; i < 16; i++) {
149 for (j = 0; j < 16; j += 8) {
2a006cd3 150#if 0
3aa102be
MN
151 s += sq[pix[0]];
152 s += sq[pix[1]];
153 s += sq[pix[2]];
154 s += sq[pix[3]];
155 s += sq[pix[4]];
156 s += sq[pix[5]];
157 s += sq[pix[6]];
158 s += sq[pix[7]];
2a006cd3
FL
159#else
160#if LONG_MAX > 2147483647
161 register uint64_t x=*(uint64_t*)pix;
162 s += sq[x&0xff];
163 s += sq[(x>>8)&0xff];
164 s += sq[(x>>16)&0xff];
165 s += sq[(x>>24)&0xff];
166 s += sq[(x>>32)&0xff];
167 s += sq[(x>>40)&0xff];
168 s += sq[(x>>48)&0xff];
169 s += sq[(x>>56)&0xff];
170#else
171 register uint32_t x=*(uint32_t*)pix;
172 s += sq[x&0xff];
173 s += sq[(x>>8)&0xff];
174 s += sq[(x>>16)&0xff];
175 s += sq[(x>>24)&0xff];
176 x=*(uint32_t*)(pix+4);
177 s += sq[x&0xff];
178 s += sq[(x>>8)&0xff];
179 s += sq[(x>>16)&0xff];
180 s += sq[(x>>24)&0xff];
181#endif
182#endif
3aa102be
MN
183 pix += 8;
184 }
185 pix += line_size - 16;
186 }
187 return s;
188}
189
190
0c1a9eda 191static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
1457ab52
MN
192{
193 int s, i;
0c1a9eda 194 uint32_t *sq = squareTbl + 256;
1457ab52
MN
195
196 s = 0;
197 for (i = 0; i < 8; i++) {
198 s += sq[pix1[0] - pix2[0]];
199 s += sq[pix1[1] - pix2[1]];
200 s += sq[pix1[2] - pix2[2]];
201 s += sq[pix1[3] - pix2[3]];
202 s += sq[pix1[4] - pix2[4]];
203 s += sq[pix1[5] - pix2[5]];
204 s += sq[pix1[6] - pix2[6]];
205 s += sq[pix1[7] - pix2[7]];
206 pix1 += line_size;
207 pix2 += line_size;
208 }
209 return s;
210}
211
6b026927 212static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
9c76bd48 213{
6b026927
FH
214 int s, i;
215 uint32_t *sq = squareTbl + 256;
9c76bd48
BF
216
217 s = 0;
218 for (i = 0; i < 16; i++) {
6b026927
FH
219 s += sq[pix1[ 0] - pix2[ 0]];
220 s += sq[pix1[ 1] - pix2[ 1]];
221 s += sq[pix1[ 2] - pix2[ 2]];
222 s += sq[pix1[ 3] - pix2[ 3]];
223 s += sq[pix1[ 4] - pix2[ 4]];
224 s += sq[pix1[ 5] - pix2[ 5]];
225 s += sq[pix1[ 6] - pix2[ 6]];
226 s += sq[pix1[ 7] - pix2[ 7]];
227 s += sq[pix1[ 8] - pix2[ 8]];
228 s += sq[pix1[ 9] - pix2[ 9]];
229 s += sq[pix1[10] - pix2[10]];
230 s += sq[pix1[11] - pix2[11]];
231 s += sq[pix1[12] - pix2[12]];
232 s += sq[pix1[13] - pix2[13]];
233 s += sq[pix1[14] - pix2[14]];
234 s += sq[pix1[15] - pix2[15]];
2a006cd3 235
6b026927
FH
236 pix1 += line_size;
237 pix2 += line_size;
9c76bd48
BF
238 }
239 return s;
240}
241
0c1a9eda 242static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
de6d9b64 243{
de6d9b64
FB
244 int i;
245
246 /* read the pixels */
de6d9b64 247 for(i=0;i<8;i++) {
c13e1abd
FH
248 block[0] = pixels[0];
249 block[1] = pixels[1];
250 block[2] = pixels[2];
251 block[3] = pixels[3];
252 block[4] = pixels[4];
253 block[5] = pixels[5];
254 block[6] = pixels[6];
255 block[7] = pixels[7];
256 pixels += line_size;
257 block += 8;
de6d9b64
FB
258 }
259}
260
0c1a9eda
ZK
261static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
262 const uint8_t *s2, int stride){
9dbcbd92
MN
263 int i;
264
265 /* read the pixels */
9dbcbd92 266 for(i=0;i<8;i++) {
c13e1abd
FH
267 block[0] = s1[0] - s2[0];
268 block[1] = s1[1] - s2[1];
269 block[2] = s1[2] - s2[2];
270 block[3] = s1[3] - s2[3];
271 block[4] = s1[4] - s2[4];
272 block[5] = s1[5] - s2[5];
273 block[6] = s1[6] - s2[6];
274 block[7] = s1[7] - s2[7];
9dbcbd92
MN
275 s1 += stride;
276 s2 += stride;
c13e1abd 277 block += 8;
9dbcbd92
MN
278 }
279}
280
281
0c1a9eda 282static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
eb4b3dd3 283 int line_size)
de6d9b64 284{
de6d9b64 285 int i;
0c1a9eda 286 uint8_t *cm = cropTbl + MAX_NEG_CROP;
de6d9b64
FB
287
288 /* read the pixels */
de6d9b64 289 for(i=0;i<8;i++) {
c13e1abd
FH
290 pixels[0] = cm[block[0]];
291 pixels[1] = cm[block[1]];
292 pixels[2] = cm[block[2]];
293 pixels[3] = cm[block[3]];
294 pixels[4] = cm[block[4]];
295 pixels[5] = cm[block[5]];
296 pixels[6] = cm[block[6]];
297 pixels[7] = cm[block[7]];
298
299 pixels += line_size;
300 block += 8;
de6d9b64
FB
301 }
302}
303
0c1a9eda 304static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
c13e1abd 305 int line_size)
de6d9b64 306{
de6d9b64 307 int i;
0c1a9eda 308 uint8_t *cm = cropTbl + MAX_NEG_CROP;
de6d9b64
FB
309
310 /* read the pixels */
de6d9b64 311 for(i=0;i<8;i++) {
c13e1abd
FH
312 pixels[0] = cm[pixels[0] + block[0]];
313 pixels[1] = cm[pixels[1] + block[1]];
314 pixels[2] = cm[pixels[2] + block[2]];
315 pixels[3] = cm[pixels[3] + block[3]];
316 pixels[4] = cm[pixels[4] + block[4]];
317 pixels[5] = cm[pixels[5] + block[5]];
318 pixels[6] = cm[pixels[6] + block[6]];
319 pixels[7] = cm[pixels[7] + block[7]];
320 pixels += line_size;
321 block += 8;
de6d9b64
FB
322 }
323}
59fe111e
MN
324#if 0
325
326#define PIXOP2(OPNAME, OP) \
b3184779 327static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
328{\
329 int i;\
330 for(i=0; i<h; i++){\
331 OP(*((uint64_t*)block), LD64(pixels));\
332 pixels+=line_size;\
333 block +=line_size;\
334 }\
335}\
336\
45553457 337static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
338{\
339 int i;\
340 for(i=0; i<h; i++){\
341 const uint64_t a= LD64(pixels );\
342 const uint64_t b= LD64(pixels+1);\
343 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
344 pixels+=line_size;\
345 block +=line_size;\
346 }\
347}\
348\
45553457 349static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
350{\
351 int i;\
352 for(i=0; i<h; i++){\
353 const uint64_t a= LD64(pixels );\
354 const uint64_t b= LD64(pixels+1);\
355 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
356 pixels+=line_size;\
357 block +=line_size;\
358 }\
359}\
360\
45553457 361static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
362{\
363 int i;\
364 for(i=0; i<h; i++){\
365 const uint64_t a= LD64(pixels );\
366 const uint64_t b= LD64(pixels+line_size);\
367 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
368 pixels+=line_size;\
369 block +=line_size;\
370 }\
371}\
372\
45553457 373static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
374{\
375 int i;\
376 for(i=0; i<h; i++){\
377 const uint64_t a= LD64(pixels );\
378 const uint64_t b= LD64(pixels+line_size);\
379 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
380 pixels+=line_size;\
381 block +=line_size;\
382 }\
383}\
384\
45553457 385static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
386{\
387 int i;\
388 const uint64_t a= LD64(pixels );\
389 const uint64_t b= LD64(pixels+1);\
390 uint64_t l0= (a&0x0303030303030303ULL)\
391 + (b&0x0303030303030303ULL)\
392 + 0x0202020202020202ULL;\
393 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
394 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
395 uint64_t l1,h1;\
396\
397 pixels+=line_size;\
398 for(i=0; i<h; i+=2){\
399 uint64_t a= LD64(pixels );\
400 uint64_t b= LD64(pixels+1);\
401 l1= (a&0x0303030303030303ULL)\
402 + (b&0x0303030303030303ULL);\
403 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
404 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
405 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
406 pixels+=line_size;\
407 block +=line_size;\
408 a= LD64(pixels );\
409 b= LD64(pixels+1);\
410 l0= (a&0x0303030303030303ULL)\
411 + (b&0x0303030303030303ULL)\
412 + 0x0202020202020202ULL;\
413 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
414 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
415 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
416 pixels+=line_size;\
417 block +=line_size;\
418 }\
419}\
420\
45553457 421static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
422{\
423 int i;\
424 const uint64_t a= LD64(pixels );\
425 const uint64_t b= LD64(pixels+1);\
426 uint64_t l0= (a&0x0303030303030303ULL)\
427 + (b&0x0303030303030303ULL)\
428 + 0x0101010101010101ULL;\
429 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
430 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
431 uint64_t l1,h1;\
432\
433 pixels+=line_size;\
434 for(i=0; i<h; i+=2){\
435 uint64_t a= LD64(pixels );\
436 uint64_t b= LD64(pixels+1);\
437 l1= (a&0x0303030303030303ULL)\
438 + (b&0x0303030303030303ULL);\
439 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
440 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
441 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
442 pixels+=line_size;\
443 block +=line_size;\
444 a= LD64(pixels );\
445 b= LD64(pixels+1);\
446 l0= (a&0x0303030303030303ULL)\
447 + (b&0x0303030303030303ULL)\
448 + 0x0101010101010101ULL;\
449 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
450 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
451 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
452 pixels+=line_size;\
453 block +=line_size;\
454 }\
455}\
456\
45553457
ZK
457CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
458CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
459CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
460CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
461CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
462CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
463CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
59fe111e
MN
464
465#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
466#else // 64 bit variant
467
468#define PIXOP2(OPNAME, OP) \
0da71265
MN
469static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
470 int i;\
471 for(i=0; i<h; i++){\
472 OP(*((uint32_t*)(block )), LD32(pixels ));\
473 pixels+=line_size;\
474 block +=line_size;\
475 }\
476}\
45553457 477static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
59fe111e
MN
478 int i;\
479 for(i=0; i<h; i++){\
480 OP(*((uint32_t*)(block )), LD32(pixels ));\
481 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
482 pixels+=line_size;\
483 block +=line_size;\
484 }\
485}\
45553457
ZK
486static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
487 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
b3184779 488}\
59fe111e 489\
b3184779
MN
490static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
491 int src_stride1, int src_stride2, int h){\
59fe111e
MN
492 int i;\
493 for(i=0; i<h; i++){\
b3184779
MN
494 uint32_t a,b;\
495 a= LD32(&src1[i*src_stride1 ]);\
496 b= LD32(&src2[i*src_stride2 ]);\
497 OP(*((uint32_t*)&dst[i*dst_stride ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
498 a= LD32(&src1[i*src_stride1+4]);\
499 b= LD32(&src2[i*src_stride2+4]);\
500 OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
59fe111e
MN
501 }\
502}\
503\
b3184779
MN
504static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
505 int src_stride1, int src_stride2, int h){\
59fe111e
MN
506 int i;\
507 for(i=0; i<h; i++){\
b3184779
MN
508 uint32_t a,b;\
509 a= LD32(&src1[i*src_stride1 ]);\
510 b= LD32(&src2[i*src_stride2 ]);\
511 OP(*((uint32_t*)&dst[i*dst_stride ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
512 a= LD32(&src1[i*src_stride1+4]);\
513 b= LD32(&src2[i*src_stride2+4]);\
514 OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
59fe111e
MN
515 }\
516}\
517\
0da71265
MN
518static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
519 int src_stride1, int src_stride2, int h){\
520 int i;\
521 for(i=0; i<h; i++){\
522 uint32_t a,b;\
523 a= LD32(&src1[i*src_stride1 ]);\
524 b= LD32(&src2[i*src_stride2 ]);\
525 OP(*((uint32_t*)&dst[i*dst_stride ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
526 }\
527}\
528\
b3184779
MN
529static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
530 int src_stride1, int src_stride2, int h){\
531 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
532 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
533}\
534\
535static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
536 int src_stride1, int src_stride2, int h){\
537 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
538 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
539}\
540\
45553457 541static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
542 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
543}\
544\
45553457 545static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
546 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
547}\
548\
45553457 549static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
550 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
551}\
552\
45553457 553static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
554 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
555}\
556\
557static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
558 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
559 int i;\
560 for(i=0; i<h; i++){\
b3184779
MN
561 uint32_t a, b, c, d, l0, l1, h0, h1;\
562 a= LD32(&src1[i*src_stride1]);\
563 b= LD32(&src2[i*src_stride2]);\
564 c= LD32(&src3[i*src_stride3]);\
565 d= LD32(&src4[i*src_stride4]);\
566 l0= (a&0x03030303UL)\
567 + (b&0x03030303UL)\
568 + 0x02020202UL;\
569 h0= ((a&0xFCFCFCFCUL)>>2)\
570 + ((b&0xFCFCFCFCUL)>>2);\
571 l1= (c&0x03030303UL)\
572 + (d&0x03030303UL);\
573 h1= ((c&0xFCFCFCFCUL)>>2)\
574 + ((d&0xFCFCFCFCUL)>>2);\
575 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
576 a= LD32(&src1[i*src_stride1+4]);\
577 b= LD32(&src2[i*src_stride2+4]);\
578 c= LD32(&src3[i*src_stride3+4]);\
579 d= LD32(&src4[i*src_stride4+4]);\
580 l0= (a&0x03030303UL)\
581 + (b&0x03030303UL)\
582 + 0x02020202UL;\
583 h0= ((a&0xFCFCFCFCUL)>>2)\
584 + ((b&0xFCFCFCFCUL)>>2);\
585 l1= (c&0x03030303UL)\
586 + (d&0x03030303UL);\
587 h1= ((c&0xFCFCFCFCUL)>>2)\
588 + ((d&0xFCFCFCFCUL)>>2);\
589 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
590 }\
591}\
b3184779
MN
592static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
593 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
594 int i;\
595 for(i=0; i<h; i++){\
b3184779
MN
596 uint32_t a, b, c, d, l0, l1, h0, h1;\
597 a= LD32(&src1[i*src_stride1]);\
598 b= LD32(&src2[i*src_stride2]);\
599 c= LD32(&src3[i*src_stride3]);\
600 d= LD32(&src4[i*src_stride4]);\
601 l0= (a&0x03030303UL)\
602 + (b&0x03030303UL)\
603 + 0x01010101UL;\
604 h0= ((a&0xFCFCFCFCUL)>>2)\
605 + ((b&0xFCFCFCFCUL)>>2);\
606 l1= (c&0x03030303UL)\
607 + (d&0x03030303UL);\
608 h1= ((c&0xFCFCFCFCUL)>>2)\
609 + ((d&0xFCFCFCFCUL)>>2);\
610 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
611 a= LD32(&src1[i*src_stride1+4]);\
612 b= LD32(&src2[i*src_stride2+4]);\
613 c= LD32(&src3[i*src_stride3+4]);\
614 d= LD32(&src4[i*src_stride4+4]);\
615 l0= (a&0x03030303UL)\
616 + (b&0x03030303UL)\
617 + 0x01010101UL;\
618 h0= ((a&0xFCFCFCFCUL)>>2)\
619 + ((b&0xFCFCFCFCUL)>>2);\
620 l1= (c&0x03030303UL)\
621 + (d&0x03030303UL);\
622 h1= ((c&0xFCFCFCFCUL)>>2)\
623 + ((d&0xFCFCFCFCUL)>>2);\
624 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
625 }\
626}\
b3184779
MN
627static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
628 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
629 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
630 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
631}\
632static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
633 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
634 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
635 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
636}\
59fe111e 637\
45553457 638static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
639{\
640 int j;\
641 for(j=0; j<2; j++){\
642 int i;\
643 const uint32_t a= LD32(pixels );\
644 const uint32_t b= LD32(pixels+1);\
645 uint32_t l0= (a&0x03030303UL)\
646 + (b&0x03030303UL)\
647 + 0x02020202UL;\
648 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
649 + ((b&0xFCFCFCFCUL)>>2);\
650 uint32_t l1,h1;\
651\
652 pixels+=line_size;\
653 for(i=0; i<h; i+=2){\
654 uint32_t a= LD32(pixels );\
655 uint32_t b= LD32(pixels+1);\
656 l1= (a&0x03030303UL)\
657 + (b&0x03030303UL);\
658 h1= ((a&0xFCFCFCFCUL)>>2)\
659 + ((b&0xFCFCFCFCUL)>>2);\
660 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
661 pixels+=line_size;\
662 block +=line_size;\
663 a= LD32(pixels );\
664 b= LD32(pixels+1);\
665 l0= (a&0x03030303UL)\
666 + (b&0x03030303UL)\
667 + 0x02020202UL;\
668 h0= ((a&0xFCFCFCFCUL)>>2)\
669 + ((b&0xFCFCFCFCUL)>>2);\
670 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
671 pixels+=line_size;\
672 block +=line_size;\
673 }\
674 pixels+=4-line_size*(h+1);\
675 block +=4-line_size*h;\
676 }\
677}\
678\
45553457 679static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
680{\
681 int j;\
682 for(j=0; j<2; j++){\
683 int i;\
684 const uint32_t a= LD32(pixels );\
685 const uint32_t b= LD32(pixels+1);\
686 uint32_t l0= (a&0x03030303UL)\
687 + (b&0x03030303UL)\
688 + 0x01010101UL;\
689 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
690 + ((b&0xFCFCFCFCUL)>>2);\
691 uint32_t l1,h1;\
692\
693 pixels+=line_size;\
694 for(i=0; i<h; i+=2){\
695 uint32_t a= LD32(pixels );\
696 uint32_t b= LD32(pixels+1);\
697 l1= (a&0x03030303UL)\
698 + (b&0x03030303UL);\
699 h1= ((a&0xFCFCFCFCUL)>>2)\
700 + ((b&0xFCFCFCFCUL)>>2);\
701 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
702 pixels+=line_size;\
703 block +=line_size;\
704 a= LD32(pixels );\
705 b= LD32(pixels+1);\
706 l0= (a&0x03030303UL)\
707 + (b&0x03030303UL)\
708 + 0x01010101UL;\
709 h0= ((a&0xFCFCFCFCUL)>>2)\
710 + ((b&0xFCFCFCFCUL)>>2);\
711 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
712 pixels+=line_size;\
713 block +=line_size;\
714 }\
715 pixels+=4-line_size*(h+1);\
716 block +=4-line_size*h;\
717 }\
718}\
719\
45553457
ZK
720CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
721CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
722CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
723CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
724CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
725CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
726CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
727CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
b3184779 728
59fe111e
MN
729#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
730#endif
59fe111e
MN
731#define op_put(a, b) a = b
732
733PIXOP2(avg, op_avg)
734PIXOP2(put, op_put)
735#undef op_avg
736#undef op_put
737
de6d9b64
FB
/* Rounded-up average of two / four values. Arguments are parenthesized so
 * that operands containing low-precedence operators expand correctly. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
740
073b013d 741
/**
 * Bilinear interpolation of an 8-sample-wide block at a fixed sub-sample
 * offset given in 1/16 sample units.
 *
 * The four weights sum to 256, so the weighted sum plus rounder is
 * shifted right by 8. Each row reads 9 samples from the current source row
 * and 9 from the row below it.
 *
 * @param dst     top-left destination sample
 * @param src     top-left source sample
 * @param stride  row stride in bytes, shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal fraction, weights are (16-x16) and x16
 * @param y16     vertical fraction, weights are (16-y16) and y16
 * @param rounder value added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i, j;

    for(i=0; i<h; i++)
    {
        for(j=0; j<8; j++)
            dst[j]= (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
764
/**
 * Motion compensation of an 8-sample-wide block where the source position is
 * an affine function of the destination position.
 *
 * For destination sample (x, y) the source position is
 *   vx = ox + x*dxx + y*dxy,   vy = oy + x*dyx + y*dyy,
 * with 16 fractional bits that are dropped first; of the remainder the low
 * 'shift' bits are the interpolation fraction and the rest is the integer
 * source coordinate. Positions outside [0, width-1) x [0, height-1) are
 * clipped to the edge and interpolated only along the axis that is still
 * inside (or not at all).
 *
 * @param dst    top-left destination sample
 * @param src    source picture origin
 * @param stride row stride in bytes, shared by dst and src
 * @param h      number of rows to produce
 * @param r      rounding value added before the final shift by 2*shift
 * @param width  source width in samples
 * @param height source height in samples
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* last coordinate that still has a right / bottom neighbour */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* the unsigned compare rejects negative coordinates as well */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
0da71265
MN
/*
 * H264_CHROMA_MC(OPNAME, OP) generates bilinear chroma interpolation
 * functions for blocks 2, 4 and 8 samples wide. x and y are the fractional
 * offset in 1/8 sample units (0..7, asserted). The four weights sum to 64;
 * OP receives the unscaled weighted sum and does the rounding and shift.
 * Each row reads width+1 samples from the current source row and from the
 * row below it; dst and src share the same stride.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}

/* b is the weighted sum scaled by 64: round, shift, then store (put) or
 * average with the existing destination sample, rounding up (avg). */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
892
/**
 * Copy h rows from src to dst, one LD32/ST32 transfer per row
 * (presumably a 32-bit, i.e. 4-byte, load/store -- the helpers are defined
 * outside this chunk).  Source and destination advance by their own strides.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;

    for (row = h; row > 0; row--) {
        ST32(dst, LD32(src));
        src += srcStride;
        dst += dstStride;
    }
}
903
/**
 * Copy h rows from src to dst, two LD32/ST32 transfers per row at byte
 * offsets 0 and 4 (presumably 32-bit each, i.e. 8 bytes per row -- the
 * helpers are defined outside this chunk).
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = h; row > 0; row--) {
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        src += srcStride;
        dst += dstStride;
    }
}
915
/**
 * Copy h rows from src to dst, four LD32/ST32 transfers per row at byte
 * offsets 0, 4, 8 and 12 (presumably 32-bit each, i.e. 16 bytes per row --
 * the helpers are defined outside this chunk).
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = h; row > 0; row--) {
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        src += srcStride;
        dst += dstStride;
    }
}
073b013d 929
/**
 * Copy h rows from src to dst: four LD32/ST32 transfers at byte offsets
 * 0, 4, 8 and 12 (presumably 32-bit each) followed by the single byte at
 * index 16, i.e. 17 bytes per row if LD32/ST32 move 4 bytes.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = h; row > 0; row--) {
        for (off = 0; off < 16; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[16] = src[16];   /* trailing odd column */
        src += srcStride;
        dst += dstStride;
    }
}
944
/**
 * Copy h rows from src to dst: two LD32/ST32 transfers at byte offsets
 * 0 and 4 (presumably 32-bit each) followed by the single byte at index 8,
 * i.e. 9 bytes per row if LD32/ST32 move 4 bytes.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row, off;

    for (row = h; row > 0; row--) {
        for (off = 0; off < 8; off += 4) {
            ST32(dst + off, LD32(src + off));
        }
        dst[8] = src[8];     /* trailing odd column */
        src += srcStride;
        dst += dstStride;
    }
}
957
826f429a 958
/**
 * QPEL_MC(r, OPNAME, RND, OP) expands to one family of MPEG-4 qpel functions.
 *
 *  - r       is not referenced anywhere in the expansion.
 *  - OPNAME  is pasted in front of every generated function name and of the
 *            pixelsN_c / pixelsN_l2 / pixelsN_l4 helpers used for the final store.
 *  - RND     is pasted between "put" and the helper name for intermediate
 *            passes (put ## RND ## mpeg4_qpel8_h_lowpass, put ## RND ## pixels8_l2, ...).
 *  - OP      is invoked as OP(destination lvalue, filtered sum); a local
 *            "cm" (cropTbl + MAX_NEG_CROP) is in scope at every call site.
 *
 * The lowpass helpers apply the taps 20, -6, 3, -1 to symmetric sample pairs.
 * An 8-wide (16-wide) filter reads 9 (17) input samples; taps that would fall
 * outside that window are reflected back inside it.
 *
 * The qpelN_mcXY_c wrappers combine full-sample data with horizontally and/or
 * vertically filtered data (XY presumably encodes the quarter-sample offset --
 * confirm against the callers).  The ff_*_old_c variants are non-static.
 *
 * Scratch buffers: blk is a 9x9 (17x17) copy of the source with stride 16 (24);
 * filtH holds 9 (17) rows of horizontally filtered samples.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int y;\
    \
    for(y = h; y > 0; y--){\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int x;\
    \
    for(x = 0; x < 8; x++, dst++, src++){\
        /* fetch the whole column before anything is stored */\
        const int src0 = src[0*srcStride];\
        const int src1 = src[1*srcStride];\
        const int src2 = src[2*srcStride];\
        const int src3 = src[3*srcStride];\
        const int src4 = src[4*srcStride];\
        const int src5 = src[5*srcStride];\
        const int src6 = src[6*srcStride];\
        const int src7 = src[7*srcStride];\
        const int src8 = src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int y;\
    \
    for(y = h; y > 0; y--){\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst += dstStride;\
        src += srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int x;\
    \
    for(x = 0; x < 16; x++, dst++, src++){\
        /* fetch the whole column before anything is stored */\
        const int src0  = src[ 0*srcStride];\
        const int src1  = src[ 1*srcStride];\
        const int src2  = src[ 2*srcStride];\
        const int src3  = src[ 3*srcStride];\
        const int src4  = src[ 4*srcStride];\
        const int src5  = src[ 5*srcStride];\
        const int src6  = src[ 6*srcStride];\
        const int src7  = src[ 7*srcStride];\
        const int src8  = src[ 8*srcStride];\
        const int src9  = src[ 9*srcStride];\
        const int src10 = src[10*srcStride];\
        const int src11 = src[11*srcStride];\
        const int src12 = src[12*srcStride];\
        const int src13 = src[13*srcStride];\
        const int src14 = src[14*srcStride];\
        const int src15 = src[15*srcStride];\
        const int src16 = src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
    }\
}\
\
/* ---------------------------- 8x8 variants ---------------------------- */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filt[8*8];\
    \
    put ## RND ## mpeg4_qpel8_h_lowpass(filt, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, filt, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filt[8*8];\
    \
    put ## RND ## mpeg4_qpel8_h_lowpass(filt, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, filt, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t filt[8*8];\
    \
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filt, blk, 8, 16);\
    OPNAME ## pixels8_l2(dst, blk, filt, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    \
    copy_block9(blk, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, blk, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t filt[8*8];\
    \
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filt, blk, 8, 16);\
    OPNAME ## pixels8_l2(dst, blk+16, filt, stride, 16, 8, 8);\
}\
\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t filtH[8*9];\
    uint8_t filtV[8*8];\
    uint8_t filtHV[8*8];\
    \
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtV, blk, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l4(dst, blk, filtH, filtV, filtHV, stride, 16, 8, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t filtH[8*9];\
    uint8_t filtHV[8*8];\
    \
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(filtH, filtH, blk, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l2(dst, filtH, filtHV, stride, 8, 8, 8);\
}\
\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t filtH[8*9];\
    uint8_t filtV[8*8];\
    uint8_t filtHV[8*8];\
    \
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtV, blk+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l4(dst, blk+1, filtH, filtV, filtHV, stride, 16, 8, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t filtH[8*9];\
    uint8_t filtHV[8*8];\
    \
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(filtH, filtH, blk+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l2(dst, filtH, filtHV, stride, 8, 8, 8);\
}\
\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t filtH[8*9];\
    uint8_t filtV[8*8];\
    uint8_t filtHV[8*8];\
    \
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtV, blk, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l4(dst, blk+16, filtH+8, filtV, filtHV, stride, 16, 8, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t filtH[8*9];\
    uint8_t filtHV[8*8];\
    \
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(filtH, filtH, blk, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l2(dst, filtH+8, filtHV, stride, 8, 8, 8);\
}\
\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t filtH[8*9];\
    uint8_t filtV[8*8];\
    uint8_t filtHV[8*8];\
    \
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtV, blk+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l4(dst, blk+17, filtH+8, filtV, filtHV, stride, 16, 8, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t filtH[8*9];\
    uint8_t filtHV[8*8];\
    \
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(filtH, filtH, blk+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l2(dst, filtH+8, filtHV, stride, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filtH[8*9];\
    uint8_t filtHV[8*8];\
    \
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l2(dst, filtH, filtHV, stride, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filtH[8*9];\
    uint8_t filtHV[8*8];\
    \
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l2(dst, filtH+8, filtHV, stride, 8, 8, 8);\
}\
\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t filtH[8*9];\
    uint8_t filtV[8*8];\
    uint8_t filtHV[8*8];\
    \
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtV, blk, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l2(dst, filtV, filtHV, stride, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t filtH[8*9];\
    \
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(filtH, filtH, blk, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, filtH, stride, 8);\
}\
\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t filtH[8*9];\
    uint8_t filtV[8*8];\
    uint8_t filtHV[8*8];\
    \
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, blk, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtV, blk+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(filtHV, filtH, 8, 8);\
    OPNAME ## pixels8_l2(dst, filtV, filtHV, stride, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[16*9];\
    uint8_t filtH[8*9];\
    \
    copy_block9(blk, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, blk, 8, 16, 9);\
    put ## RND ## pixels8_l2(filtH, filtH, blk+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, filtH, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filtH[8*9];\
    \
    put ## RND ## mpeg4_qpel8_h_lowpass(filtH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, filtH, stride, 8);\
}\
\
/* --------------------------- 16x16 variants --------------------------- */\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filt[16*16];\
    \
    put ## RND ## mpeg4_qpel16_h_lowpass(filt, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, filt, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filt[16*16];\
    \
    put ## RND ## mpeg4_qpel16_h_lowpass(filt, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, filt, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t filt[16*16];\
    \
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filt, blk, 16, 24);\
    OPNAME ## pixels16_l2(dst, blk, filt, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    \
    copy_block17(blk, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, blk, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t filt[16*16];\
    \
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filt, blk, 16, 24);\
    OPNAME ## pixels16_l2(dst, blk+24, filt, stride, 24, 16, 16);\
}\
\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t filtH[16*17];\
    uint8_t filtV[16*16];\
    uint8_t filtHV[16*16];\
    \
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtV, blk, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l4(dst, blk, filtH, filtV, filtHV, stride, 24, 16, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t filtH[16*17];\
    uint8_t filtHV[16*16];\
    \
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(filtH, filtH, blk, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l2(dst, filtH, filtHV, stride, 16, 16, 16);\
}\
\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t filtH[16*17];\
    uint8_t filtV[16*16];\
    uint8_t filtHV[16*16];\
    \
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtV, blk+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l4(dst, blk+1, filtH, filtV, filtHV, stride, 24, 16, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t filtH[16*17];\
    uint8_t filtHV[16*16];\
    \
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(filtH, filtH, blk+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l2(dst, filtH, filtHV, stride, 16, 16, 16);\
}\
\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t filtH[16*17];\
    uint8_t filtV[16*16];\
    uint8_t filtHV[16*16];\
    \
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtV, blk, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l4(dst, blk+24, filtH+16, filtV, filtHV, stride, 24, 16, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t filtH[16*17];\
    uint8_t filtHV[16*16];\
    \
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(filtH, filtH, blk, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l2(dst, filtH+16, filtHV, stride, 16, 16, 16);\
}\
\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t filtH[16*17];\
    uint8_t filtV[16*16];\
    uint8_t filtHV[16*16];\
    \
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtV, blk+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l4(dst, blk+25, filtH+16, filtV, filtHV, stride, 24, 16, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t filtH[16*17];\
    uint8_t filtHV[16*16];\
    \
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(filtH, filtH, blk+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l2(dst, filtH+16, filtHV, stride, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filtH[16*17];\
    uint8_t filtHV[16*16];\
    \
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l2(dst, filtH, filtHV, stride, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filtH[16*17];\
    uint8_t filtHV[16*16];\
    \
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l2(dst, filtH+16, filtHV, stride, 16, 16, 16);\
}\
\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t filtH[16*17];\
    uint8_t filtV[16*16];\
    uint8_t filtHV[16*16];\
    \
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtV, blk, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l2(dst, filtV, filtHV, stride, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t filtH[16*17];\
    \
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(filtH, filtH, blk, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, filtH, stride, 16);\
}\
\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t filtH[16*17];\
    uint8_t filtV[16*16];\
    uint8_t filtHV[16*16];\
    \
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, blk, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtV, blk+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(filtHV, filtH, 16, 16);\
    OPNAME ## pixels16_l2(dst, filtV, filtHV, stride, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t blk[24*17];\
    uint8_t filtH[16*17];\
    \
    copy_block17(blk, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, blk, 16, 24, 17);\
    put ## RND ## pixels16_l2(filtH, filtH, blk+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, filtH, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t filtH[16*17];\
    \
    put ## RND ## mpeg4_qpel16_h_lowpass(filtH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, filtH, stride, 16);\
}
44eb4951 1441
b3184779
MN
1442#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1443#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1444#define op_put(a, b) a = cm[((b) + 16)>>5]
1445#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1446
1447QPEL_MC(0, put_ , _ , op_put)
1448QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1449QPEL_MC(0, avg_ , _ , op_avg)
1450//QPEL_MC(1, avg_no_rnd , _ , op_avg)
1451#undef op_avg
1452#undef op_avg_no_rnd
1453#undef op_put
1454#undef op_put_no_rnd
44eb4951 1455
0da71265
MN
1456#if 1
1457#define H264_LOWPASS(OPNAME, OP, OP2) \
1458static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1459 const int h=4;\
1460 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1461 int i;\
1462 for(i=0; i<h; i++)\
1463 {\
1464 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
1465 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1466 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1467 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
1468 dst+=dstStride;\
1469 src+=srcStride;\
1470 }\
1471}\
1472\
1473static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1474 const int w=4;\
1475 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1476 int i;\
1477 for(i=0; i<w; i++)\
1478 {\
1479 const int srcB= src[-2*srcStride];\
1480 const int srcA= src[-1*srcStride];\
1481 const int src0= src[0 *srcStride];\
1482 const int src1= src[1 *srcStride];\
1483 const int src2= src[2 *srcStride];\
1484 const int src3= src[3 *srcStride];\
1485 const int src4= src[4 *srcStride];\
1486 const int src5= src[5 *srcStride];\
1487 const int src6= src[6 *srcStride];\
1488 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1489 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1490 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1491 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1492 dst++;\
1493 src++;\
1494 }\
1495}\
1496\
1497static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1498 const int h=4;\
1499 const int w=4;\
1500 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1501 int i;\
1502 src -= 2*srcStride;\
1503 for(i=0; i<h+5; i++)\
1504 {\
1505 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
1506 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
1507 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
1508 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
1509 tmp+=tmpStride;\
1510 src+=srcStride;\
1511 }\
1512 tmp -= tmpStride*(h+5-2);\
1513 for(i=0; i<w; i++)\
1514 {\
1515 const int tmpB= tmp[-2*tmpStride];\
1516 const int tmpA= tmp[-1*tmpStride];\
1517 const int tmp0= tmp[0 *tmpStride];\
1518 const int tmp1= tmp[1 *tmpStride];\
1519 const int tmp2= tmp[2 *tmpStride];\
1520 const int tmp3= tmp[3 *tmpStride];\
1521 const int tmp4= tmp[4 *tmpStride];\
1522 const int tmp5= tmp[5 *tmpStride];\
1523 const int tmp6= tmp[6 *tmpStride];\
1524 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1525 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1526 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1527 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1528 dst++;\
1529 tmp++;\
1530 }\
1531}\
1532\
1533static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1534 const int h=8;\
1535 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1536 int i;\
1537 for(i=0; i<h; i++)\
1538 {\
1539 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
1540 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
1541 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
1542 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
1543 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
1544 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
1545 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
1546 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
1547 dst+=dstStride;\
1548 src+=srcStride;\
1549 }\
1550}\
1551\
1552static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1553 const int w=8;\
1554 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1555 int i;\
1556 for(i=0; i<w; i++)\
1557 {\
1558 const int srcB= src[-2*srcStride];\
1559 const int srcA= src[-1*srcStride];\
1560 const int src0= src[0 *srcStride];\
1561 const int src1= src[1 *srcStride];\
1562 const int src2= src[2 *srcStride];\
1563 const int src3= src[3 *srcStride];\
1564 const int src4= src[4 *srcStride];\
1565 const int src5= src[5 *srcStride];\
1566 const int src6= src[6 *srcStride];\
1567 const int src7= src[7 *srcStride];\
1568 const int src8= src[8 *srcStride];\
1569 const int src9= src[9 *srcStride];\
1570 const int src10=src[10*srcStride];\
1571 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1572 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1573 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1574 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1575 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1576 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1577 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1578 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
1579 dst++;\
1580 src++;\
1581 }\
1582}\
1583\
1584static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1585 const int h=8;\
1586 const int w=8;\
1587 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1588 int i;\
1589 src -= 2*srcStride;\
1590 for(i=0; i<h+5; i++)\
1591 {\
1592 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
1593 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
1594 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
1595 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
1596 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
1597 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
1598 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
1599 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
1600 tmp+=tmpStride;\
1601 src+=srcStride;\
1602 }\
1603 tmp -= tmpStride*(h+5-2);\
1604 for(i=0; i<w; i++)\
1605 {\
1606 const int tmpB= tmp[-2*tmpStride];\
1607 const int tmpA= tmp[-1*tmpStride];\
1608 const int tmp0= tmp[0 *tmpStride];\
1609 const int tmp1= tmp[1 *tmpStride];\
1610 const int tmp2= tmp[2 *tmpStride];\
1611 const int tmp3= tmp[3 *tmpStride];\
1612 const int tmp4= tmp[4 *tmpStride];\
1613 const int tmp5= tmp[5 *tmpStride];\
1614 const int tmp6= tmp[6 *tmpStride];\
1615 const int tmp7= tmp[7 *tmpStride];\
1616 const int tmp8= tmp[8 *tmpStride];\
1617 const int tmp9= tmp[9 *tmpStride];\
1618 const int tmp10=tmp[10*tmpStride];\
1619 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1620 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1621 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1622 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1623 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
1624 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
1625 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
1626 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
1627 dst++;\
1628 tmp++;\
1629 }\
1630}\
1631\
1632static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1633 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
1634 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1635 src += 8*srcStride;\
1636 dst += 8*dstStride;\
1637 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
1638 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1639}\
1640\
1641static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1642 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
1643 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1644 src += 8*srcStride;\
1645 dst += 8*dstStride;\
1646 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
1647 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1648}\
1649\
1650static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1651 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
1652 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
1653 src += 8*srcStride;\
1654 tmp += 8*tmpStride;\
1655 dst += 8*dstStride;\
1656 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
1657 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
1658}\
1659
/**
 * H264_MC(OPNAME, SIZE) expands to the sixteen functions
 * OPNAME##h264_qpel##SIZE##_mcXY_c, one per quarter-sample position.
 * As the bodies show, X selects the horizontal and Y the vertical offset:
 * the mcX0 variants only use the horizontal lowpass, the mc0Y variants only
 * the vertical one.
 *
 * Intermediate planes are always produced with the put_ lowpass functions
 * into packed (stride == SIZE) scratch buffers; OPNAME is applied only on the
 * final write to dst.
 *
 * column/column_mid: SIZE+5 source rows starting two rows above src, packed
 * at stride SIZE; column_mid points at the row that corresponds to src.
 */
#define H264_MC(OPNAME, SIZE) \
/* (0,0): full-sample position, forwarded to the plain pixel routine */\
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
/* (1,0): src combined with the horizontal half-sample plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, hpel, stride, stride, SIZE, SIZE);\
}\
\
/* (2,0): horizontal half-sample plane written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
/* (3,0): src+1 combined with the horizontal half-sample plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t hpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, hpel, stride, stride, SIZE, SIZE);\
}\
\
/* (0,1): source row 0 combined with the vertical half-sample plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const column_mid= column + SIZE*2;\
    uint8_t vpel[SIZE*SIZE];\
    copy_block ## SIZE (column, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, column_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, column_mid, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (0,2): vertical half-sample plane written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const column_mid= column + SIZE*2;\
    copy_block ## SIZE (column, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, column_mid, stride, SIZE);\
}\
\
/* (0,3): source row 1 combined with the vertical half-sample plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const column_mid= column + SIZE*2;\
    uint8_t vpel[SIZE*SIZE];\
    copy_block ## SIZE (column, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, column_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, column_mid+SIZE, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,1): horizontal plane of row 0 combined with vertical plane of column 0 */\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const column_mid= column + SIZE*2;\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t vpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    copy_block ## SIZE (column, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, column_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,1): horizontal plane of row 0 combined with vertical plane of column 1 */\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const column_mid= column + SIZE*2;\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t vpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    copy_block ## SIZE (column, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, column_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,3): horizontal plane of row 1 combined with vertical plane of column 0 */\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const column_mid= column + SIZE*2;\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t vpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src + stride, SIZE, stride);\
    copy_block ## SIZE (column, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, column_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,3): horizontal plane of row 1 combined with vertical plane of column 1 */\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const column_mid= column + SIZE*2;\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t vpel[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src + stride, SIZE, stride);\
    copy_block ## SIZE (column, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, column_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, vpel, stride, SIZE, SIZE, SIZE);\
}\
\
/* (2,2): two-dimensional half-sample plane written straight to dst */\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t inter[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, inter, src, stride, SIZE, stride);\
}\
\
/* (2,1): horizontal plane of row 0 combined with the 2-D plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t inter[SIZE*(SIZE+5)];\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t center[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(center, inter, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, center, stride, SIZE, SIZE, SIZE);\
}\
\
/* (2,3): horizontal plane of row 1 combined with the 2-D plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t inter[SIZE*(SIZE+5)];\
    uint8_t hpel[SIZE*SIZE];\
    uint8_t center[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(hpel, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(center, inter, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, hpel, center, stride, SIZE, SIZE, SIZE);\
}\
\
/* (1,2): vertical plane of column 0 combined with the 2-D plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const column_mid= column + SIZE*2;\
    int16_t inter[SIZE*(SIZE+5)];\
    uint8_t vpel[SIZE*SIZE];\
    uint8_t center[SIZE*SIZE];\
    copy_block ## SIZE (column, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, column_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(center, inter, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vpel, center, stride, SIZE, SIZE, SIZE);\
}\
\
/* (3,2): vertical plane of column 1 combined with the 2-D plane */\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t column[SIZE*(SIZE+5)];\
    uint8_t * const column_mid= column + SIZE*2;\
    int16_t inter[SIZE*(SIZE+5)];\
    uint8_t vpel[SIZE*SIZE];\
    uint8_t center[SIZE*SIZE];\
    copy_block ## SIZE (column, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(vpel, column_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(center, inter, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, vpel, center, stride, SIZE, SIZE, SIZE);\
}

1797#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1798//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
1799#define op_put(a, b) a = cm[((b) + 16)>>5]
1800#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
1801#define op2_put(a, b) a = cm[((b) + 512)>>10]
1802
1803H264_LOWPASS(put_ , op_put, op2_put)
1804H264_LOWPASS(avg_ , op_avg, op2_avg)
1805H264_MC(put_, 4)
1806H264_MC(put_, 8)
1807H264_MC(put_, 16)
1808H264_MC(avg_, 4)
1809H264_MC(avg_, 8)
1810H264_MC(avg_, 16)
1811
1812#undef op_avg
1813#undef op_put
1814#undef op2_avg
1815#undef op2_put
1816#endif
1817
1457ab52
MN
1818static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1819 uint8_t *cm = cropTbl + MAX_NEG_CROP;
1820 int i;
1821
1822 for(i=0; i<h; i++){
1823 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1824 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1825 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1826 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1827 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1828 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1829 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1830 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1831 dst+=dstStride;
1832 src+=srcStride;
1833 }
1834}
1835
1836static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1837 uint8_t *cm = cropTbl + MAX_NEG_CROP;
1838 int i;
1839
1840 for(i=0; i<w; i++){
1841 const int src_1= src[ -srcStride];
1842 const int src0 = src[0 ];
1843 const int src1 = src[ srcStride];
1844 const int src2 = src[2*srcStride];
1845 const int src3 = src[3*srcStride];
1846 const int src4 = src[4*srcStride];
1847 const int src5 = src[5*srcStride];
1848 const int src6 = src[6*srcStride];
1849 const int src7 = src[7*srcStride];
1850 const int src8 = src[8*srcStride];
1851 const int src9 = src[9*srcStride];
1852 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1853 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1854 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1855 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1856 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1857 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1858 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1859 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
1860 src++;
1861 dst++;
1862 }
1863}
1864
/** mspel position (0,0): forwards to put_pixels8_c for 8 rows. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride)
{
    put_pixels8_c(dst, src, stride, 8);
}
1868
/** mspel position (1,0): src combined with the horizontally filtered block. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[64]; /* 8x8, packed at stride 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
1874
/** mspel position (2,0): horizontally filtered block written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1878
/** mspel position (3,0): src+1 combined with the horizontally filtered block. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t filtered[64]; /* 8x8, packed at stride 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
1884
/** mspel position (0,2): vertically filtered block written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1888
/**
 * mspel position (1,2): vertical filter of src combined with the
 * horizontal-then-vertical filtered block.
 * rows_h holds 11 horizontally filtered rows (source rows -1 .. 9) at
 * stride 8; rows_h+8 is the row that corresponds to src.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[88];
    uint8_t vert[64];
    uint8_t both[64];

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, rows_h+8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mspel position (3,2): vertical filter of src+1 combined with the
 * horizontal-then-vertical filtered block.
 * rows_h holds 11 horizontally filtered rows (source rows -1 .. 9) at
 * stride 8; rows_h+8 is the row that corresponds to src.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[88];
    uint8_t vert[64];
    uint8_t both[64];

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vert, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(both, rows_h+8, 8, 8, 8);
    put_pixels8_l2(dst, vert, both, stride, 8, 8, 8);
}
/**
 * mspel position (2,2): horizontal filter into an 11-row scratch buffer
 * (source rows -1 .. 9, stride 8), then vertical filter from its second
 * row straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride)
{
    uint8_t rows_h[88];

    wmv2_mspel8_h_lowpass(rows_h, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, rows_h+8, stride, 8, 8);
}
1912
1913
/**
 * Sum of absolute differences between two 16x16 pixel blocks.
 * @param pix1 top-left pixel of the first block
 * @param pix2 top-left pixel of the second block
 * @param line_size distance in bytes between successive rows of both blocks
 * @return sum over all 256 positions of |pix1 - pix2|
 */
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int total = 0;
    int row, x;

    for (row = 0; row < 16; row++) {
        for (x = 0; x < 16; x++) {
            total += abs(pix1[x] - pix2[x]);
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
1941
/**
 * 16x16 sum of absolute differences against a horizontally interpolated
 * reference: each pix1 sample is compared with avg2() of the pix2 sample
 * and its right-hand neighbour, so every row reads pix2[0] .. pix2[16].
 */
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int total = 0;
    int row, x;

    for (row = 0; row < 16; row++) {
        for (x = 0; x < 16; x++) {
            total += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
1969
/**
 * 16x16 sum of absolute differences against a vertically interpolated
 * reference: each pix1 sample is compared with avg2() of the pix2 sample
 * and the one directly below it, so 17 rows of pix2 are read.
 */
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, x;

    for (row = 0; row < 16; row++) {
        for (x = 0; x < 16; x++) {
            total += abs(pix1[x] - avg2(pix2[x], below[x]));
        }
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
1999
/**
 * 16x16 sum of absolute differences against a diagonally interpolated
 * reference: each pix1 sample is compared with avg4() of the 2x2 pix2
 * neighbourhood starting at the same position, so 17 rows of 17 pix2
 * samples are read.
 */
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, x;

    for (row = 0; row < 16; row++) {
        for (x = 0; x < 16; x++) {
            total += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        }
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2029
/**
 * Sum of absolute differences between two 8x8 pixel blocks.
 * @param pix1 top-left pixel of the first block
 * @param pix2 top-left pixel of the second block
 * @param line_size distance in bytes between successive rows of both blocks
 * @return sum over all 64 positions of |pix1 - pix2|
 */
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int total = 0;
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++) {
            total += abs(pix1[x] - pix2[x]);
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2049
/**
 * 8x8 sum of absolute differences against a horizontally interpolated
 * reference: each pix1 sample is compared with avg2() of the pix2 sample
 * and its right-hand neighbour, so every row reads pix2[0] .. pix2[8].
 */
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int total = 0;
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++) {
            total += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2069
/**
 * 8x8 sum of absolute differences against a vertically interpolated
 * reference: each pix1 sample is compared with avg2() of the pix2 sample
 * and the one directly below it, so 9 rows of pix2 are read.
 */
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++) {
            total += abs(pix1[x] - avg2(pix2[x], below[x]));
        }
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2091
/**
 * 8x8 sum of absolute differences against a diagonally interpolated
 * reference: each pix1 sample is compared with avg4() of the 2x2 pix2
 * neighbourhood starting at the same position, so 9 rows of 9 pix2
 * samples are read.
 */
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int total = 0;
    int row, x;

    for (row = 0; row < 8; row++) {
        for (x = 0; x < 8; x++) {
            total += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        }
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return total;
}
2113
1457ab52
MN
/**
 * 16x16 SAD with the (context, a, b, stride) signature used by the other
 * comparison functions in this file; the context argument is not used.
 */
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride)
{
    return pix_abs16x16_c(a, b, stride);
}
2117
/**
 * 8x8 SAD with the (context, a, b, stride) signature used by the other
 * comparison functions in this file; the context argument is not used.
 */
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride)
{
    return pix_abs8x8_c(a, b, stride);
}
2121
a9badb51
MN
2122/**
2123 * permutes an 8x8 block.
2a5700de 2124 * @param block the block which will be permuted according to the given permutation vector
a9badb51
MN
2125 * @param permutation the permutation vector
2126 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2a5700de
MN
2127 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2128 * (inverse) permutated to scantable order!
a9badb51 2129 */
0c1a9eda 2130void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
d962f6fd 2131{
7801d21d 2132 int i;
477ab036 2133 DCTELEM temp[64];
7801d21d
MN
2134
2135 if(last<=0) return;
9a7b310d 2136 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
d962f6fd 2137
7801d21d
MN
2138 for(i=0; i<=last; i++){
2139 const int j= scantable[i];
2140 temp[j]= block[j];
2141 block[j]=0;
2142 }
2143
2144 for(i=0; i<=last; i++){
2145 const int j= scantable[i];
2146 const int perm_j= permutation[j];
2147 block[perm_j]= temp[j];
2148 }
d962f6fd 2149}
e0eac44e 2150
2a5700de
MN
2151/**
2152 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2153 */
eb4b3dd3 2154static void clear_blocks_c(DCTELEM *blocks)
649c00c9
MN
2155{
2156 memset(blocks, 0, sizeof(DCTELEM)*6*64);
2157}
2158
11f18faf
MN
/**
 * Adds src to dst byte by byte (dst[i] += src[i], wrapping modulo 256)
 * for the first w bytes, in increasing index order.
 * Nothing is done when w <= 0.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
{
    int k;

    for (k = 0; k < w; k++) {
        dst[k] += src[k];
    }
}
2174
/**
 * Stores the byte-wise difference dst[i] = src1[i] - src2[i]
 * (wrapping modulo 256) for the first w bytes, in increasing index order.
 * Nothing is done when w <= 0.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
{
    int k;

    for (k = 0; k < w; k++) {
        dst[k] = src1[k] - src2[k];
    }
}
2190
1457ab52
MN
/* o1 = i1 + i2, o2 = i1 - i2. Expands to two statements; i1 and i2 are
 * each evaluated twice. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* In-place butterfly: x becomes x + y, y becomes x - y (old values).
 * Expands to a braced block with locals named a and b, so the arguments
 * must not themselves refer to variables called a or b. */
#define BUTTERFLY1(x,y) \
{\
    const int a= x;\
    const int b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x + y| + |x - y|, i.e. the absolute sum of a final butterfly stage
 * without storing its outputs. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2205
/**
 * Sum of the absolute values of the 8x8 butterfly (Hadamard-style) transform
 * of the pixel difference src - dst.
 * The first loop runs three butterfly stages along every row, the second
 * runs two stages down every column and folds the third one into the
 * absolute sum through BUTTERFLYA. The context argument s is not used.
 */
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    int temp[64];
    int sum=0;
    int i;

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        int     * const t = temp + 8*i;
        uint8_t * const sp= src + stride*i;
        uint8_t * const dp= dst + stride*i;

        BUTTERFLY2(t[0], t[1], sp[0]-dp[0],sp[1]-dp[1]);
        BUTTERFLY2(t[2], t[3], sp[2]-dp[2],sp[3]-dp[3]);
        BUTTERFLY2(t[4], t[5], sp[4]-dp[4],sp[5]-dp[5]);
        BUTTERFLY2(t[6], t[7], sp[6]-dp[6],sp[7]-dp[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    for(i=0; i<8; i++){
        int * const col= temp + i; /* col[8*r] is row r of column i */

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }
    return sum;
}
2255
/**
 * Sum of the absolute values of the 8x8 butterfly (Hadamard-style) transform
 * of src with the constant `mean` subtracted from every pixel.
 * Same structure as hadamard8_diff_c: three butterfly stages per row, then
 * two stages per column with the last one folded into BUTTERFLYA.
 */
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    int temp[64];
    int sum=0;
    int i;

    //FIXME OOOPS ignore 0 term instead of mean mess
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        int     * const t = temp + 8*i;
        uint8_t * const sp= src + stride*i;

        BUTTERFLY2(t[0], t[1], sp[0]-mean,sp[1]-mean);
        BUTTERFLY2(t[2], t[3], sp[2]-mean,sp[3]-mean);
        BUTTERFLY2(t[4], t[5], sp[4]-mean,sp[5]-mean);
        BUTTERFLY2(t[6], t[7], sp[6]-mean,sp[7]-mean);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    for(i=0; i<8; i++){
        int * const col= temp + i; /* col[8*r] is row r of column i */

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    return sum;
}
2299
2300static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2301 MpegEncContext * const s= (MpegEncContext *)c;
76fbb024
MN
2302 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2303 DCTELEM * const temp= (DCTELEM*)aligned_temp;
1457ab52
MN
2304 int sum=0, i;
2305
2306 s->dsp.diff_pixels(temp, src1, src2, stride);
b0368839 2307 s->dsp.fdct(temp);
1457ab52
MN
2308
2309 for(i=0; i<64; i++)
2310 sum+= ABS(temp[i]);
2311
2312 return sum;
2313}
2314
0e15384d 2315void simple_idct(DCTELEM *block); //FIXME
1457ab52
MN
2316
2317static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2318 MpegEncContext * const s= (MpegEncContext *)c;
76fbb024
MN
2319 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
2320 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2321 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
1457ab52
MN
2322 int sum=0, i;
2323
2324 s->mb_intra=0;
2325
2326 s->dsp.diff_pixels(temp, src1, src2, stride);
2327
2328 memcpy(bak, temp, 64*sizeof(DCTELEM));
2329
67725183 2330 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
1457ab52
MN
2331 s->dct_unquantize(s, temp, 0, s->qscale);
2332 simple_idct(temp); //FIXME
2333
2334 for(i=0; i<64; i++)
2335 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2336
2337 return sum;
2338}
2339
3a87ac94
MN
2340static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2341 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 2342 const uint8_t *scantable= s->intra_scantable.permutated;
76fbb024
MN
2343 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2344 uint64_t __align8 aligned_bak[stride];
2345 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2346 uint8_t * const bak= (uint8_t*)aligned_bak;
3a87ac94
MN
2347 int i, last, run, bits, level, distoration, start_i;
2348 const int esc_length= s->ac_esc_length;
2349 uint8_t * length;
2350 uint8_t * last_length;
67725183
MN
2351
2352 for(i=0; i<8; i++){
2353 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2354 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2355 }
3a87ac94 2356
67725183
MN
2357 s->dsp.diff_pixels(temp, src1, src2, stride);
2358
2359 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2360
2361 bits=0;
3a87ac94
MN
2362
2363 if (s->mb_intra) {
67725183 2364 start_i = 1;
3a87ac94
MN
2365 length = s->intra_ac_vlc_length;
2366 last_length= s->intra_ac_vlc_last_length;
67725183 2367 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
MN
2368 } else {
2369 start_i = 0;
2370 length = s->inter_ac_vlc_length;
2371 last_length= s->inter_ac_vlc_last_length;
2372 }
3a87ac94 2373
67725183 2374 if(last>=start_i){
3a87ac94
MN
2375 run=0;
2376 for(i=start_i; i<last; i++){
2377 int j= scantable[i];
2378 level= temp[j];
2379
2380 if(level){
2381 level+=64;
2382 if((level&(~127)) == 0){
2383 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2384 }else
2385 bits+= esc_length;
2386 run=0;
2387 }else
2388 run++;
2389 }
2390 i= scantable[last];
1d0eab1d 2391
3a87ac94 2392 level= temp[i] + 64;
1d0eab1d
MN
2393
2394 assert(level - 64);
2395
3a87ac94
MN
2396 if((level&(~127)) == 0){
2397 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2398 }else
2399 bits+= esc_length;
2400
67725183
MN
2401 }
2402
2403 if(last>=0){
3a87ac94
MN
2404 s->dct_unquantize(s, temp, 0, s->qscale);
2405 }
2406
b0368839 2407 s->dsp.idct_add(bak, stride, temp);
3a87ac94
MN
2408
2409 distoration= s->dsp.sse[1](NULL, bak, src1, stride);
2410
67725183 2411 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3a87ac94
MN
2412}
2413
2414static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2415 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 2416 const uint8_t *scantable= s->intra_scantable.permutated;
76fbb024
MN
2417 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2418 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3a87ac94
MN
2419 int i, last, run, bits, level, start_i;
2420 const int esc_length= s->ac_esc_length;
2421 uint8_t * length;
2422 uint8_t * last_length;
67725183
MN
2423
2424 s->dsp.diff_pixels(temp, src1, src2, stride);
3a87ac94 2425
67725183
MN
2426 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2427
2428 bits=0;
3a87ac94
MN
2429
2430 if (s->mb_intra) {
67725183 2431 start_i = 1;
3a87ac94
MN
2432 length = s->intra_ac_vlc_length;
2433 last_length= s->intra_ac_vlc_last_length;
67725183 2434 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
MN
2435 } else {
2436 start_i = 0;
2437 length = s->inter_ac_vlc_length;
2438 last_length= s->inter_ac_vlc_last_length;
2439 }
3a87ac94 2440
67725183 2441 if(last>=start_i){
3a87ac94
MN
2442 run=0;
2443 for(i=start_i; i<last; i++){
2444 int j= scantable[i];
2445 level= temp[j];
2446
2447 if(level){
2448 level+=64;
2449 if((level&(~127)) == 0){
2450 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2451 }else
2452 bits+= esc_length;
2453 run=0;
2454 }else
2455 run++;
2456 }
2457 i= scantable[last];
67725183
MN
2458
2459 level= temp[i] + 64;
3a87ac94 2460
67725183 2461 assert(level - 64);
3a87ac94 2462
3a87ac94
MN
2463 if((level&(~127)) == 0){
2464 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2465 }else
2466 bits+= esc_length;
2467 }
2468
2469 return bits;
2470}
2471
2472
1457ab52
MN
/* NOTE(review): WARPER88_1616 is defined outside this part of the file.
 * Presumably each invocation defines the function named by the second
 * argument (a 16x16 comparator) in terms of the 8x8 one named by the first;
 * the second names are the ones stored in index [0] of the comparator tables
 * by dsputil_init(). Confirm against the macro definition. */
2473WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
2474WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
2475WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
3a87ac94
MN
2476WARPER88_1616(rd8x8_c, rd16x16_c)
2477WARPER88_1616(bit8x8_c, bit16x16_c)
1457ab52 2478
b0368839
MN
2479/* XXX: those functions should be suppressed ASAP when all IDCTs are
2480 converted */
2481static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2482{
2483 j_rev_dct (block);
2484 put_pixels_clamped_c(block, dest, line_size);
2485}
2486static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2487{
2488 j_rev_dct (block);
2489 add_pixels_clamped_c(block, dest, line_size);
2490}
2491
59cf08ce
FB
2492/* init static data */
2493void dsputil_static_init(void)
e0eac44e 2494{
d2975f8d 2495 int i;
e0eac44e 2496
59cf08ce
FB
2497 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
2498 for(i=0;i<MAX_NEG_CROP;i++) {
2499 cropTbl[i] = 0;
2500 cropTbl[i + MAX_NEG_CROP + 256] = 255;
2501 }
2502
2503 for(i=0;i<512;i++) {
2504 squareTbl[i] = (i - 256) * (i - 256);
2505 }
2506
2507 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2508}
92ddb692 2509
92ddb692 2510
59cf08ce
FB
2511void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2512{
2513 int i;
de6d9b64 2514
b0368839
MN
2515#ifdef CONFIG_ENCODERS
2516 if(avctx->dct_algo==FF_DCT_FASTINT)
2517 c->fdct = fdct_ifast;
2518 else
2519 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2520#endif //CONFIG_ENCODERS
2521
2522 if(avctx->idct_algo==FF_IDCT_INT){
2523 c->idct_put= ff_jref_idct_put;
2524 c->idct_add= ff_jref_idct_add;
2525 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2526 }else{ //accurate/default
2527 c->idct_put= simple_idct_put;
2528 c->idct_add= simple_idct_add;
2529 c->idct_permutation_type= FF_NO_IDCT_PERM;
2530 }
2531
eb4b3dd3
ZK
2532 c->get_pixels = get_pixels_c;
2533 c->diff_pixels = diff_pixels_c;
2534 c->put_pixels_clamped = put_pixels_clamped_c;
2535 c->add_pixels_clamped = add_pixels_clamped_c;
2536 c->gmc1 = gmc1_c;
2537 c->gmc = gmc_c;
2538 c->clear_blocks = clear_blocks_c;
2539 c->pix_sum = pix_sum_c;
2540 c->pix_norm1 = pix_norm1_c;
1457ab52
MN
2541 c->sse[0]= sse16_c;
2542 c->sse[1]= sse8_c;
eb4b3dd3 2543
45553457 2544 /* TODO [0] 16 [1] 8 */
eb4b3dd3
ZK
2545 c->pix_abs16x16 = pix_abs16x16_c;
2546 c->pix_abs16x16_x2 = pix_abs16x16_x2_c;
2547 c->pix_abs16x16_y2 = pix_abs16x16_y2_c;
2548 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
2549 c->pix_abs8x8 = pix_abs8x8_c;
2550 c->pix_abs8x8_x2 = pix_abs8x8_x2_c;
2551 c->pix_abs8x8_y2 = pix_abs8x8_y2_c;
2552 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
2553
45553457
ZK
2554#define dspfunc(PFX, IDX, NUM) \
2555 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
2556 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
2557 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
2558 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
2559
2560 dspfunc(put, 0, 16);
2561 dspfunc(put_no_rnd, 0, 16);
2562 dspfunc(put, 1, 8);
2563 dspfunc(put_no_rnd, 1, 8);
2564
2565 dspfunc(avg, 0, 16);
2566 dspfunc(avg_no_rnd, 0, 16);
2567 dspfunc(avg, 1, 8);
2568 dspfunc(avg_no_rnd, 1, 8);
2569#undef dspfunc
2570
2571#define dspfunc(PFX, IDX, NUM) \
2572 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2573 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2574 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2575 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2576 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2577 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2578 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2579 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2580 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2581 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2582 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2583 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2584 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2585 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2586 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2587 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2588
2589 dspfunc(put_qpel, 0, 16);
2590 dspfunc(put_no_rnd_qpel, 0, 16);
2591
2592 dspfunc(avg_qpel, 0, 16);
2593 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2594
2595 dspfunc(put_qpel, 1, 8);
2596 dspfunc(put_no_rnd_qpel, 1, 8);
2597
2598 dspfunc(avg_qpel, 1, 8);
2599 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
0da71265
MN
2600
2601 dspfunc(put_h264_qpel, 0, 16);
2602 dspfunc(put_h264_qpel, 1, 8);
2603 dspfunc(put_h264_qpel, 2, 4);
2604 dspfunc(avg_h264_qpel, 0, 16);
2605 dspfunc(avg_h264_qpel, 1, 8);
2606 dspfunc(avg_h264_qpel, 2, 4);
2607
45553457 2608#undef dspfunc
0da71265
MN
2609 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
2610 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
2611 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
2612 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
2613 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
2614 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
c9a2ebc4 2615
1457ab52
MN
2616 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
2617 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2618 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2619 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2620 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2621 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2622 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2623 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2624
2625 c->hadamard8_diff[0]= hadamard8_diff16_c;
2626 c->hadamard8_diff[1]= hadamard8_diff_c;
2627 c->hadamard8_abs = hadamard8_abs_c;
2628
2629 c->dct_sad[0]= dct_sad16x16_c;
2630 c->dct_sad[1]= dct_sad8x8_c;
2631
2632 c->sad[0]= sad16x16_c;
2633 c->sad[1]= sad8x8_c;
2634
2635 c->quant_psnr[0]= quant_psnr16x16_c;
2636 c->quant_psnr[1]= quant_psnr8x8_c;
3a87ac94
MN
2637
2638 c->rd[0]= rd16x16_c;
2639 c->rd[1]= rd8x8_c;
2640
2641 c->bit[0]= bit16x16_c;
2642 c->bit[1]= bit8x8_c;
2643
11f18faf
MN
2644 c->add_bytes= add_bytes_c;
2645 c->diff_bytes= diff_bytes_c;
2646
980fc7b8 2647#ifdef HAVE_MMX
b0368839 2648 dsputil_init_mmx(c, avctx);
de6d9b64 2649#endif
3d03c0a2 2650#ifdef ARCH_ARMV4L
b0368839 2651 dsputil_init_armv4l(c, avctx);
3d03c0a2 2652#endif
c34270f5 2653#ifdef HAVE_MLIB
b0368839 2654 dsputil_init_mlib(c, avctx);
c34270f5 2655#endif
1e98dffb 2656#ifdef ARCH_ALPHA
b0368839 2657 dsputil_init_alpha(c, avctx);
1e98dffb 2658#endif
59925ef2 2659#ifdef ARCH_POWERPC
b0368839 2660 dsputil_init_ppc(c, avctx);
a43bd1d7 2661#endif
d46aba26 2662#ifdef HAVE_MMI
b0368839 2663 dsputil_init_mmi(c, avctx);
d46aba26 2664#endif
0c6bd2ea
B
2665#ifdef ARCH_SH4
2666 dsputil_init_sh4(c,avctx);
2667#endif
43f1708f 2668
b0368839
MN
2669 switch(c->idct_permutation_type){
2670 case FF_NO_IDCT_PERM:
2671 for(i=0; i<64; i++)
2672 c->idct_permutation[i]= i;
2673 break;
2674 case FF_LIBMPEG2_IDCT_PERM:
2675 for(i=0; i<64; i++)
2676 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
2677 break;
2678 case FF_SIMPLE_IDCT_PERM:
2679 for(i=0; i<64; i++)
2680 c->idct_permutation[i]= simple_mmx_permutation[i];
2681 break;
2682 case FF_TRANSPOSE_IDCT_PERM:
2683 for(i=0; i<64; i++)
2684 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
2685 break;
2686 default:
2687 fprintf(stderr, "Internal error, IDCT permutation not set\n");
2688 }
57060b1e 2689}
b0368839 2690