improved stack misalignment warning
[libav.git] / libavcodec / dsputil.c
CommitLineData
de6d9b64
FB
1/*
2 * DSP utils
ff4ec49e 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
8f2ab833 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
de6d9b64 5 *
b78e7197
DB
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
ff4ec49e
FB
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
b78e7197 11 * version 2.1 of the License, or (at your option) any later version.
de6d9b64 12 *
b78e7197 13 * FFmpeg is distributed in the hope that it will be useful,
de6d9b64 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
ff4ec49e
FB
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
de6d9b64 17 *
ff4ec49e 18 * You should have received a copy of the GNU Lesser General Public
b78e7197 19 * License along with FFmpeg; if not, write to the Free Software
5509bffa 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
7ff037e9 21 *
59fe111e 22 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
de6d9b64 23 */
115329f1 24
983e3246
MN
25/**
26 * @file dsputil.c
27 * DSP utils
28 */
115329f1 29
de6d9b64
FB
30#include "avcodec.h"
31#include "dsputil.h"
1457ab52 32#include "mpegvideo.h"
b0368839 33#include "simple_idct.h"
65e4c8c9 34#include "faandct.h"
059715a4 35#include "snow.h"
5596c60c 36
88730be6
MR
37/* snow.c */
38void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
39
2dac4acf
LM
40/* vorbis.c */
41void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
42
55fde95e 43uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
1d503957 44uint32_t ff_squareTbl[512] = {0, };
de6d9b64 45
/* Zigzag scan order for an 8x8 block: entry i is the raster index
 * (row * 8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
56
10acc479
RS
/* Zigzag scan used with the 248 IDCT. Note that, unlike the specification,
 * the two fields are interleaved here. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
69
2f349de2 70/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
486497e0 71DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
2f349de2 72
/* Alternate (horizontal) scan order for an 8x8 block: entry i is the raster
 * index of the i-th coefficient in scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
83
/* Alternate (vertical) scan order for an 8x8 block: entry i is the raster
 * index of the i-th coefficient in scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
94
/* Fixed-point reciprocals: (a * ff_inverse[b]) >> 32 == a / b
 * for all 0 <= a <= 65536 and 2 <= b <= 255.
 * Each row below holds eight consecutive divisors. */
const uint32_t ff_inverse[256] = {
    /*   0 */          0, 4294967295U, 2147483648U,  1431655766,  1073741824,   858993460,   715827883,   613566757,
    /*   8 */  536870912,   477218589,   429496730,   390451573,   357913942,   330382100,   306783379,   286331154,
    /*  16 */  268435456,   252645136,   238609295,   226050911,   214748365,   204522253,   195225787,   186737709,
    /*  24 */  178956971,   171798692,   165191050,   159072863,   153391690,   148102321,   143165577,   138547333,
    /*  32 */  134217728,   130150525,   126322568,   122713352,   119304648,   116080198,   113025456,   110127367,
    /*  40 */  107374183,   104755300,   102261127,    99882961,    97612894,    95443718,    93368855,    91382283,
    /*  48 */   89478486,    87652394,    85899346,    84215046,    82595525,    81037119,    79536432,    78090315,
    /*  56 */   76695845,    75350304,    74051161,    72796056,    71582789,    70409300,    69273667,    68174085,
    /*  64 */   67108864,    66076420,    65075263,    64103990,    63161284,    62245903,    61356676,    60492498,
    /*  72 */   59652324,    58835169,    58040099,    57266231,    56512728,    55778797,    55063684,    54366675,
    /*  80 */   53687092,    53024288,    52377650,    51746594,    51130564,    50529028,    49941481,    49367441,
    /*  88 */   48806447,    48258060,    47721859,    47197443,    46684428,    46182445,    45691142,    45210183,
    /*  96 */   44739243,    44278014,    43826197,    43383509,    42949673,    42524429,    42107523,    41698712,
    /* 104 */   41297763,    40904451,    40518560,    40139882,    39768216,    39403370,    39045158,    38693400,
    /* 112 */   38347923,    38008561,    37675152,    37347542,    37025581,    36709123,    36398028,    36092163,
    /* 120 */   35791395,    35495598,    35204650,    34918434,    34636834,    34359739,    34087043,    33818641,
    /* 128 */   33554432,    33294321,    33038210,    32786010,    32537632,    32292988,    32051995,    31814573,
    /* 136 */   31580642,    31350127,    31122952,    30899046,    30678338,    30460761,    30246249,    30034737,
    /* 144 */   29826162,    29620465,    29417585,    29217465,    29020050,    28825284,    28633116,    28443493,
    /* 152 */   28256364,    28071682,    27889399,    27709467,    27531842,    27356480,    27183338,    27012373,
    /* 160 */   26843546,    26676816,    26512144,    26349493,    26188825,    26030105,    25873297,    25718368,
    /* 168 */   25565282,    25414008,    25264514,    25116768,    24970741,    24826401,    24683721,    24542671,
    /* 176 */   24403224,    24265352,    24129030,    23994231,    23860930,    23729102,    23598722,    23469767,
    /* 184 */   23342214,    23216040,    23091223,    22967740,    22845571,    22724695,    22605092,    22486740,
    /* 192 */   22369622,    22253717,    22139007,    22025474,    21913099,    21801865,    21691755,    21582751,
    /* 200 */   21474837,    21367997,    21262215,    21157475,    21053762,    20951060,    20849356,    20748635,
    /* 208 */   20648882,    20550083,    20452226,    20355296,    20259280,    20164166,    20069941,    19976593,
    /* 216 */   19884108,    19792477,    19701685,    19611723,    19522579,    19434242,    19346700,    19259944,
    /* 224 */   19173962,    19088744,    19004281,    18920561,    18837576,    18755316,    18673771,    18592933,
    /* 232 */   18512791,    18433337,    18354562,    18276457,    18199014,    18122225,    18046082,    17970575,
    /* 240 */   17895698,    17821442,    17747799,    17674763,    17602325,    17530479,    17459217,    17388532,
    /* 248 */   17318417,    17248865,    17179870,    17111424,    17043522,    16976156,    16909321,    16843010,
};
130
b0368839
MN
/* Input coefficient permutation expected by simple_idct_mmx.
 * The table is a permutation of the indices 0x00..0x3F. */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
142
/**
 * Sum all pixel values of a 16x16 block.
 *
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size distance in bytes between the starts of consecutive rows
 * @return the sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        const uint8_t *line = pix + row * line_size;

        for (col = 0; col < 16; col++)
            total += line[col];
    }
    return total;
}
164
0c1a9eda 165static int pix_norm1_c(uint8_t * pix, int line_size)
3aa102be
MN
166{
167 int s, i, j;
1d503957 168 uint32_t *sq = ff_squareTbl + 256;
3aa102be
MN
169
170 s = 0;
171 for (i = 0; i < 16; i++) {
bb270c08 172 for (j = 0; j < 16; j += 8) {
2a006cd3 173#if 0
bb270c08
DB
174 s += sq[pix[0]];
175 s += sq[pix[1]];
176 s += sq[pix[2]];
177 s += sq[pix[3]];
178 s += sq[pix[4]];
179 s += sq[pix[5]];
180 s += sq[pix[6]];
181 s += sq[pix[7]];
2a006cd3
FL
182#else
183#if LONG_MAX > 2147483647
bb270c08
DB
184 register uint64_t x=*(uint64_t*)pix;
185 s += sq[x&0xff];
186 s += sq[(x>>8)&0xff];
187 s += sq[(x>>16)&0xff];
188 s += sq[(x>>24)&0xff];
2a006cd3
FL
189 s += sq[(x>>32)&0xff];
190 s += sq[(x>>40)&0xff];
191 s += sq[(x>>48)&0xff];
192 s += sq[(x>>56)&0xff];
193#else
bb270c08
DB
194 register uint32_t x=*(uint32_t*)pix;
195 s += sq[x&0xff];
196 s += sq[(x>>8)&0xff];
197 s += sq[(x>>16)&0xff];
198 s += sq[(x>>24)&0xff];
2a006cd3
FL
199 x=*(uint32_t*)(pix+4);
200 s += sq[x&0xff];
201 s += sq[(x>>8)&0xff];
202 s += sq[(x>>16)&0xff];
203 s += sq[(x>>24)&0xff];
204#endif
205#endif
bb270c08
DB
206 pix += 8;
207 }
208 pix += line_size - 16;
3aa102be
MN
209 }
210 return s;
211}
212
3d2e8cce
MN
/**
 * Apply bswap_32() to each of the first w 32-bit words of src and store the
 * results in dst, in ascending index order.
 *
 * @param dst destination buffer, at least w words
 * @param src source buffer, at least w words
 * @param w   number of 32-bit words to process; nothing is done if w <= 0
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int n;

    for (n = 0; n < w; n++)
        dst[n] = bswap_32(src[n]);
}
3aa102be 230
26efc54e
MN
231static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
232{
233 int s, i;
1d503957 234 uint32_t *sq = ff_squareTbl + 256;
26efc54e
MN
235
236 s = 0;
237 for (i = 0; i < h; i++) {
238 s += sq[pix1[0] - pix2[0]];
239 s += sq[pix1[1] - pix2[1]];
240 s += sq[pix1[2] - pix2[2]];
241 s += sq[pix1[3] - pix2[3]];
242 pix1 += line_size;
243 pix2 += line_size;
244 }
245 return s;
246}
247
bb198e19 248static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
1457ab52
MN
249{
250 int s, i;
1d503957 251 uint32_t *sq = ff_squareTbl + 256;
1457ab52
MN
252
253 s = 0;
bb198e19 254 for (i = 0; i < h; i++) {
1457ab52
MN
255 s += sq[pix1[0] - pix2[0]];
256 s += sq[pix1[1] - pix2[1]];
257 s += sq[pix1[2] - pix2[2]];
258 s += sq[pix1[3] - pix2[3]];
259 s += sq[pix1[4] - pix2[4]];
260 s += sq[pix1[5] - pix2[5]];
261 s += sq[pix1[6] - pix2[6]];
262 s += sq[pix1[7] - pix2[7]];
263 pix1 += line_size;
264 pix2 += line_size;
265 }
266 return s;
267}
268
bb198e19 269static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
9c76bd48 270{
6b026927 271 int s, i;
1d503957 272 uint32_t *sq = ff_squareTbl + 256;
9c76bd48
BF
273
274 s = 0;
bb198e19 275 for (i = 0; i < h; i++) {
6b026927
FH
276 s += sq[pix1[ 0] - pix2[ 0]];
277 s += sq[pix1[ 1] - pix2[ 1]];
278 s += sq[pix1[ 2] - pix2[ 2]];
279 s += sq[pix1[ 3] - pix2[ 3]];
280 s += sq[pix1[ 4] - pix2[ 4]];
281 s += sq[pix1[ 5] - pix2[ 5]];
282 s += sq[pix1[ 6] - pix2[ 6]];
283 s += sq[pix1[ 7] - pix2[ 7]];
284 s += sq[pix1[ 8] - pix2[ 8]];
285 s += sq[pix1[ 9] - pix2[ 9]];
286 s += sq[pix1[10] - pix2[10]];
287 s += sq[pix1[11] - pix2[11]];
288 s += sq[pix1[12] - pix2[12]];
289 s += sq[pix1[13] - pix2[13]];
290 s += sq[pix1[14] - pix2[14]];
291 s += sq[pix1[15] - pix2[15]];
2a006cd3 292
6b026927
FH
293 pix1 += line_size;
294 pix2 += line_size;
9c76bd48
BF
295 }
296 return s;
297}
298
26efc54e 299
871371a7 300#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/**
 * Wavelet-domain distortion metric between two blocks.
 *
 * The pixel differences, scaled by 16, are written into a scratch buffer
 * with a fixed row stride of 32 and transformed in place by
 * ff_spatial_dwt(). The absolute values of the resulting coefficients are
 * then weighted per decomposition level and orientation by the scale table
 * and summed.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size row stride in bytes, shared by both blocks
 * @param w         block width; 8 selects 3 decomposition levels, any other
 *                  value selects 4 (the wrappers in this file pass 8, 16
 *                  or 32)
 * @param h         block height; asserted to equal w
 * @param type      first index into the scale table: 0 = 9/7, 1 = 5/3;
 *                  also forwarded to ff_spatial_dwt()
 * @return the weighted coefficient sum shifted right by 9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            /* The difference may be negative: multiply rather than
             * left-shift, since shifting a negative int is undefined. */
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0]) * 16;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1]) * 16;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2]) * 16;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3]) * 16;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        /* ori 0 is only visited at level 0. */
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    /* named 'weighted' so it no longer shadows parameter v */
                    int weighted= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(weighted);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}
369
/* w_c() for a block width of 8 with type 1 (the 5/3 rows of its scale table). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int width = 8;
    const int type  = 1;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
373
/* w_c() for a block width of 8 with type 0 (the 9/7 rows of its scale table). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int width = 8;
    const int type  = 0;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
377
/* w_c() for a block width of 16 with type 1 (the 5/3 rows of its scale table). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int width = 16;
    const int type  = 1;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
381
/* w_c() for a block width of 16 with type 0 (the 9/7 rows of its scale table). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int width = 16;
    const int type  = 0;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
385
/* w_c() for a block width of 32 with type 1 (the 5/3 rows of its scale table).
 * Unlike the 8- and 16-wide variants this function has external linkage. */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int width = 32;
    const int type  = 1;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
389
/* w_c() for a block width of 32 with type 0 (the 9/7 rows of its scale table).
 * Unlike the 8- and 16-wide variants this function has external linkage. */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    const int width = 32;
    const int type  = 0;

    return w_c(v, pix1, pix2, line_size, width, h, type);
}
3a6fc8fa 393#endif
871371a7 394
0c1a9eda 395static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
de6d9b64 396{
de6d9b64
FB
397 int i;
398
399 /* read the pixels */
de6d9b64 400 for(i=0;i<8;i++) {
c13e1abd
FH
401 block[0] = pixels[0];
402 block[1] = pixels[1];
403 block[2] = pixels[2];
404 block[3] = pixels[3];
405 block[4] = pixels[4];
406 block[5] = pixels[5];
407 block[6] = pixels[6];
408 block[7] = pixels[7];
409 pixels += line_size;
410 block += 8;
de6d9b64
FB
411 }
412}
413
0c1a9eda 414static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
bb270c08 415 const uint8_t *s2, int stride){
9dbcbd92
MN
416 int i;
417
418 /* read the pixels */
9dbcbd92 419 for(i=0;i<8;i++) {
c13e1abd
FH
420 block[0] = s1[0] - s2[0];
421 block[1] = s1[1] - s2[1];
422 block[2] = s1[2] - s2[2];
423 block[3] = s1[3] - s2[3];
424 block[4] = s1[4] - s2[4];
425 block[5] = s1[5] - s2[5];
426 block[6] = s1[6] - s2[6];
427 block[7] = s1[7] - s2[7];
9dbcbd92
MN
428 s1 += stride;
429 s2 += stride;
c13e1abd 430 block += 8;
9dbcbd92
MN
431 }
432}
433
434
0c1a9eda 435static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 436 int line_size)
de6d9b64 437{
de6d9b64 438 int i;
55fde95e 439 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 440
de6d9b64 441 /* read the pixels */
de6d9b64 442 for(i=0;i<8;i++) {
c13e1abd
FH
443 pixels[0] = cm[block[0]];
444 pixels[1] = cm[block[1]];
445 pixels[2] = cm[block[2]];
446 pixels[3] = cm[block[3]];
447 pixels[4] = cm[block[4]];
448 pixels[5] = cm[block[5]];
449 pixels[6] = cm[block[6]];
450 pixels[7] = cm[block[7]];
451
452 pixels += line_size;
453 block += 8;
de6d9b64
FB
454 }
455}
456
178fcca8 457static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 458 int line_size)
178fcca8
MN
459{
460 int i;
55fde95e 461 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 462
178fcca8
MN
463 /* read the pixels */
464 for(i=0;i<4;i++) {
465 pixels[0] = cm[block[0]];
466 pixels[1] = cm[block[1]];
467 pixels[2] = cm[block[2]];
468 pixels[3] = cm[block[3]];
469
470 pixels += line_size;
471 block += 8;
472 }
473}
474
9ca358b9 475static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
bb270c08 476 int line_size)
9ca358b9
MN
477{
478 int i;
55fde95e 479 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 480
9ca358b9
MN
481 /* read the pixels */
482 for(i=0;i<2;i++) {
483 pixels[0] = cm[block[0]];
484 pixels[1] = cm[block[1]];
485
486 pixels += line_size;
487 block += 8;
488 }
489}
490
115329f1 491static void put_signed_pixels_clamped_c(const DCTELEM *block,
f9ed9d85
MM
492 uint8_t *restrict pixels,
493 int line_size)
494{
495 int i, j;
496
497 for (i = 0; i < 8; i++) {
498 for (j = 0; j < 8; j++) {
499 if (*block < -128)
500 *pixels = 0;
501 else if (*block > 127)
502 *pixels = 255;
503 else
504 *pixels = (uint8_t)(*block + 128);
505 block++;
506 pixels++;
507 }
508 pixels += (line_size - 8);
509 }
510}
511
0c1a9eda 512static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
c13e1abd 513 int line_size)
de6d9b64 514{
de6d9b64 515 int i;
55fde95e 516 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 517
de6d9b64 518 /* read the pixels */
de6d9b64 519 for(i=0;i<8;i++) {
c13e1abd
FH
520 pixels[0] = cm[pixels[0] + block[0]];
521 pixels[1] = cm[pixels[1] + block[1]];
522 pixels[2] = cm[pixels[2] + block[2]];
523 pixels[3] = cm[pixels[3] + block[3]];
524 pixels[4] = cm[pixels[4] + block[4]];
525 pixels[5] = cm[pixels[5] + block[5]];
526 pixels[6] = cm[pixels[6] + block[6]];
527 pixels[7] = cm[pixels[7] + block[7]];
528 pixels += line_size;
529 block += 8;
de6d9b64
FB
530 }
531}
178fcca8
MN
532
533static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
534 int line_size)
535{
536 int i;
55fde95e 537 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 538
178fcca8
MN
539 /* read the pixels */
540 for(i=0;i<4;i++) {
541 pixels[0] = cm[pixels[0] + block[0]];
542 pixels[1] = cm[pixels[1] + block[1]];
543 pixels[2] = cm[pixels[2] + block[2]];
544 pixels[3] = cm[pixels[3] + block[3]];
545 pixels += line_size;
546 block += 8;
547 }
548}
9ca358b9
MN
549
550static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
551 int line_size)
552{
553 int i;
55fde95e 554 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
115329f1 555
9ca358b9
MN
556 /* read the pixels */
557 for(i=0;i<2;i++) {
558 pixels[0] = cm[pixels[0] + block[0]];
559 pixels[1] = cm[pixels[1] + block[1]];
560 pixels += line_size;
561 block += 8;
562 }
563}
36940eca
LM
564
565static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
566{
567 int i;
568 for(i=0;i<8;i++) {
569 pixels[0] += block[0];
570 pixels[1] += block[1];
571 pixels[2] += block[2];
572 pixels[3] += block[3];
573 pixels[4] += block[4];
574 pixels[5] += block[5];
575 pixels[6] += block[6];
576 pixels[7] += block[7];
577 pixels += line_size;
578 block += 8;
579 }
580}
581
582static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
583{
584 int i;
585 for(i=0;i<4;i++) {
586 pixels[0] += block[0];
587 pixels[1] += block[1];
588 pixels[2] += block[2];
589 pixels[3] += block[3];
590 pixels += line_size;
591 block += 4;
592 }
593}
594
59fe111e
MN
595#if 0
596
597#define PIXOP2(OPNAME, OP) \
b3184779 598static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
599{\
600 int i;\
601 for(i=0; i<h; i++){\
602 OP(*((uint64_t*)block), LD64(pixels));\
603 pixels+=line_size;\
604 block +=line_size;\
605 }\
606}\
607\
45553457 608static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
609{\
610 int i;\
611 for(i=0; i<h; i++){\
612 const uint64_t a= LD64(pixels );\
613 const uint64_t b= LD64(pixels+1);\
614 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
615 pixels+=line_size;\
616 block +=line_size;\
617 }\
618}\
619\
45553457 620static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
621{\
622 int i;\
623 for(i=0; i<h; i++){\
624 const uint64_t a= LD64(pixels );\
625 const uint64_t b= LD64(pixels+1);\
626 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
627 pixels+=line_size;\
628 block +=line_size;\
629 }\
630}\
631\
45553457 632static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
633{\
634 int i;\
635 for(i=0; i<h; i++){\
636 const uint64_t a= LD64(pixels );\
637 const uint64_t b= LD64(pixels+line_size);\
638 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
639 pixels+=line_size;\
640 block +=line_size;\
641 }\
642}\
643\
45553457 644static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
645{\
646 int i;\
647 for(i=0; i<h; i++){\
648 const uint64_t a= LD64(pixels );\
649 const uint64_t b= LD64(pixels+line_size);\
650 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
651 pixels+=line_size;\
652 block +=line_size;\
653 }\
654}\
655\
45553457 656static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
657{\
658 int i;\
659 const uint64_t a= LD64(pixels );\
660 const uint64_t b= LD64(pixels+1);\
661 uint64_t l0= (a&0x0303030303030303ULL)\
662 + (b&0x0303030303030303ULL)\
663 + 0x0202020202020202ULL;\
664 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
665 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
666 uint64_t l1,h1;\
667\
668 pixels+=line_size;\
669 for(i=0; i<h; i+=2){\
670 uint64_t a= LD64(pixels );\
671 uint64_t b= LD64(pixels+1);\
672 l1= (a&0x0303030303030303ULL)\
673 + (b&0x0303030303030303ULL);\
674 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
675 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
676 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
677 pixels+=line_size;\
678 block +=line_size;\
679 a= LD64(pixels );\
680 b= LD64(pixels+1);\
681 l0= (a&0x0303030303030303ULL)\
682 + (b&0x0303030303030303ULL)\
683 + 0x0202020202020202ULL;\
684 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
685 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
686 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
687 pixels+=line_size;\
688 block +=line_size;\
689 }\
690}\
691\
45553457 692static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
693{\
694 int i;\
695 const uint64_t a= LD64(pixels );\
696 const uint64_t b= LD64(pixels+1);\
697 uint64_t l0= (a&0x0303030303030303ULL)\
698 + (b&0x0303030303030303ULL)\
699 + 0x0101010101010101ULL;\
700 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
701 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
702 uint64_t l1,h1;\
703\
704 pixels+=line_size;\
705 for(i=0; i<h; i+=2){\
706 uint64_t a= LD64(pixels );\
707 uint64_t b= LD64(pixels+1);\
708 l1= (a&0x0303030303030303ULL)\
709 + (b&0x0303030303030303ULL);\
710 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
711 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
712 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
713 pixels+=line_size;\
714 block +=line_size;\
715 a= LD64(pixels );\
716 b= LD64(pixels+1);\
717 l0= (a&0x0303030303030303ULL)\
718 + (b&0x0303030303030303ULL)\
719 + 0x0101010101010101ULL;\
720 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
721 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
722 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
723 pixels+=line_size;\
724 block +=line_size;\
725 }\
726}\
727\
45553457
ZK
728CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
729CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
730CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
731CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
732CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
733CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
734CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
59fe111e
MN
735
736#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
737#else // 64 bit variant
738
739#define PIXOP2(OPNAME, OP) \
669ac79c
MN
740static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
741 int i;\
742 for(i=0; i<h; i++){\
743 OP(*((uint16_t*)(block )), LD16(pixels ));\
744 pixels+=line_size;\
745 block +=line_size;\
746 }\
747}\
0da71265
MN
748static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
749 int i;\
750 for(i=0; i<h; i++){\
751 OP(*((uint32_t*)(block )), LD32(pixels ));\
752 pixels+=line_size;\
753 block +=line_size;\
754 }\
755}\
45553457 756static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
59fe111e
MN
757 int i;\
758 for(i=0; i<h; i++){\
759 OP(*((uint32_t*)(block )), LD32(pixels ));\
760 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
761 pixels+=line_size;\
762 block +=line_size;\
763 }\
764}\
45553457
ZK
765static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
766 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
b3184779 767}\
59fe111e 768\
b3184779
MN
769static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
770 int src_stride1, int src_stride2, int h){\
59fe111e
MN
771 int i;\
772 for(i=0; i<h; i++){\
b3184779
MN
773 uint32_t a,b;\
774 a= LD32(&src1[i*src_stride1 ]);\
775 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 776 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
b3184779
MN
777 a= LD32(&src1[i*src_stride1+4]);\
778 b= LD32(&src2[i*src_stride2+4]);\
d8085ea7 779 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
59fe111e
MN
780 }\
781}\
782\
b3184779
MN
783static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
784 int src_stride1, int src_stride2, int h){\
59fe111e
MN
785 int i;\
786 for(i=0; i<h; i++){\
b3184779
MN
787 uint32_t a,b;\
788 a= LD32(&src1[i*src_stride1 ]);\
789 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 790 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
b3184779
MN
791 a= LD32(&src1[i*src_stride1+4]);\
792 b= LD32(&src2[i*src_stride2+4]);\
d8085ea7 793 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
59fe111e
MN
794 }\
795}\
796\
0da71265
MN
797static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
798 int src_stride1, int src_stride2, int h){\
799 int i;\
800 for(i=0; i<h; i++){\
801 uint32_t a,b;\
802 a= LD32(&src1[i*src_stride1 ]);\
803 b= LD32(&src2[i*src_stride2 ]);\
d8085ea7 804 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
0da71265
MN
805 }\
806}\
807\
669ac79c
MN
808static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
809 int src_stride1, int src_stride2, int h){\
810 int i;\
811 for(i=0; i<h; i++){\
812 uint32_t a,b;\
813 a= LD16(&src1[i*src_stride1 ]);\
814 b= LD16(&src2[i*src_stride2 ]);\
815 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
816 }\
817}\
818\
b3184779
MN
819static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
820 int src_stride1, int src_stride2, int h){\
821 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
822 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
823}\
824\
825static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
826 int src_stride1, int src_stride2, int h){\
827 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
828 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
829}\
830\
45553457 831static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
832 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
833}\
834\
45553457 835static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
836 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
837}\
838\
45553457 839static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
840 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
841}\
842\
45553457 843static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
b3184779
MN
844 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
845}\
846\
847static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
848 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
849 int i;\
850 for(i=0; i<h; i++){\
b3184779
MN
851 uint32_t a, b, c, d, l0, l1, h0, h1;\
852 a= LD32(&src1[i*src_stride1]);\
853 b= LD32(&src2[i*src_stride2]);\
854 c= LD32(&src3[i*src_stride3]);\
855 d= LD32(&src4[i*src_stride4]);\
856 l0= (a&0x03030303UL)\
857 + (b&0x03030303UL)\
858 + 0x02020202UL;\
859 h0= ((a&0xFCFCFCFCUL)>>2)\
860 + ((b&0xFCFCFCFCUL)>>2);\
861 l1= (c&0x03030303UL)\
862 + (d&0x03030303UL);\
863 h1= ((c&0xFCFCFCFCUL)>>2)\
864 + ((d&0xFCFCFCFCUL)>>2);\
865 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
866 a= LD32(&src1[i*src_stride1+4]);\
867 b= LD32(&src2[i*src_stride2+4]);\
868 c= LD32(&src3[i*src_stride3+4]);\
869 d= LD32(&src4[i*src_stride4+4]);\
870 l0= (a&0x03030303UL)\
871 + (b&0x03030303UL)\
872 + 0x02020202UL;\
873 h0= ((a&0xFCFCFCFCUL)>>2)\
874 + ((b&0xFCFCFCFCUL)>>2);\
875 l1= (c&0x03030303UL)\
876 + (d&0x03030303UL);\
877 h1= ((c&0xFCFCFCFCUL)>>2)\
878 + ((d&0xFCFCFCFCUL)>>2);\
879 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
880 }\
881}\
669ac79c
MN
882\
883static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
884 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
885}\
886\
887static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
888 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
889}\
890\
891static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
892 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
893}\
894\
895static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
896 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
897}\
898\
b3184779
MN
899static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
900 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
59fe111e
MN
901 int i;\
902 for(i=0; i<h; i++){\
b3184779
MN
903 uint32_t a, b, c, d, l0, l1, h0, h1;\
904 a= LD32(&src1[i*src_stride1]);\
905 b= LD32(&src2[i*src_stride2]);\
906 c= LD32(&src3[i*src_stride3]);\
907 d= LD32(&src4[i*src_stride4]);\
908 l0= (a&0x03030303UL)\
909 + (b&0x03030303UL)\
910 + 0x01010101UL;\
911 h0= ((a&0xFCFCFCFCUL)>>2)\
912 + ((b&0xFCFCFCFCUL)>>2);\
913 l1= (c&0x03030303UL)\
914 + (d&0x03030303UL);\
915 h1= ((c&0xFCFCFCFCUL)>>2)\
916 + ((d&0xFCFCFCFCUL)>>2);\
917 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
918 a= LD32(&src1[i*src_stride1+4]);\
919 b= LD32(&src2[i*src_stride2+4]);\
920 c= LD32(&src3[i*src_stride3+4]);\
921 d= LD32(&src4[i*src_stride4+4]);\
922 l0= (a&0x03030303UL)\
923 + (b&0x03030303UL)\
924 + 0x01010101UL;\
925 h0= ((a&0xFCFCFCFCUL)>>2)\
926 + ((b&0xFCFCFCFCUL)>>2);\
927 l1= (c&0x03030303UL)\
928 + (d&0x03030303UL);\
929 h1= ((c&0xFCFCFCFCUL)>>2)\
930 + ((d&0xFCFCFCFCUL)>>2);\
931 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
59fe111e
MN
932 }\
933}\
b3184779
MN
934static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
935 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
936 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
937 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
938}\
939static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
940 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
941 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
942 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
943}\
59fe111e 944\
669ac79c
MN
945static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
946{\
947 int i, a0, b0, a1, b1;\
948 a0= pixels[0];\
949 b0= pixels[1] + 2;\
950 a0 += b0;\
951 b0 += pixels[2];\
952\
953 pixels+=line_size;\
954 for(i=0; i<h; i+=2){\
955 a1= pixels[0];\
956 b1= pixels[1];\
957 a1 += b1;\
958 b1 += pixels[2];\
959\
960 block[0]= (a1+a0)>>2; /* FIXME non put */\
961 block[1]= (b1+b0)>>2;\
962\
963 pixels+=line_size;\
964 block +=line_size;\
965\
966 a0= pixels[0];\
967 b0= pixels[1] + 2;\
968 a0 += b0;\
969 b0 += pixels[2];\
970\
971 block[0]= (a1+a0)>>2;\
972 block[1]= (b1+b0)>>2;\
973 pixels+=line_size;\
974 block +=line_size;\
975 }\
976}\
977\
978static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
979{\
980 int i;\
981 const uint32_t a= LD32(pixels );\
982 const uint32_t b= LD32(pixels+1);\
983 uint32_t l0= (a&0x03030303UL)\
984 + (b&0x03030303UL)\
985 + 0x02020202UL;\
986 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
987 + ((b&0xFCFCFCFCUL)>>2);\
988 uint32_t l1,h1;\
989\
990 pixels+=line_size;\
991 for(i=0; i<h; i+=2){\
992 uint32_t a= LD32(pixels );\
993 uint32_t b= LD32(pixels+1);\
994 l1= (a&0x03030303UL)\
995 + (b&0x03030303UL);\
996 h1= ((a&0xFCFCFCFCUL)>>2)\
997 + ((b&0xFCFCFCFCUL)>>2);\
998 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
999 pixels+=line_size;\
1000 block +=line_size;\
1001 a= LD32(pixels );\
1002 b= LD32(pixels+1);\
1003 l0= (a&0x03030303UL)\
1004 + (b&0x03030303UL)\
1005 + 0x02020202UL;\
1006 h0= ((a&0xFCFCFCFCUL)>>2)\
1007 + ((b&0xFCFCFCFCUL)>>2);\
1008 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1009 pixels+=line_size;\
1010 block +=line_size;\
1011 }\
1012}\
1013\
45553457 1014static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
1015{\
1016 int j;\
1017 for(j=0; j<2; j++){\
1018 int i;\
1019 const uint32_t a= LD32(pixels );\
1020 const uint32_t b= LD32(pixels+1);\
1021 uint32_t l0= (a&0x03030303UL)\
1022 + (b&0x03030303UL)\
1023 + 0x02020202UL;\
1024 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1025 + ((b&0xFCFCFCFCUL)>>2);\
1026 uint32_t l1,h1;\
1027\
1028 pixels+=line_size;\
1029 for(i=0; i<h; i+=2){\
1030 uint32_t a= LD32(pixels );\
1031 uint32_t b= LD32(pixels+1);\
1032 l1= (a&0x03030303UL)\
1033 + (b&0x03030303UL);\
1034 h1= ((a&0xFCFCFCFCUL)>>2)\
1035 + ((b&0xFCFCFCFCUL)>>2);\
1036 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1037 pixels+=line_size;\
1038 block +=line_size;\
1039 a= LD32(pixels );\
1040 b= LD32(pixels+1);\
1041 l0= (a&0x03030303UL)\
1042 + (b&0x03030303UL)\
1043 + 0x02020202UL;\
1044 h0= ((a&0xFCFCFCFCUL)>>2)\
1045 + ((b&0xFCFCFCFCUL)>>2);\
1046 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1047 pixels+=line_size;\
1048 block +=line_size;\
1049 }\
1050 pixels+=4-line_size*(h+1);\
1051 block +=4-line_size*h;\
1052 }\
1053}\
1054\
45553457 1055static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
59fe111e
MN
1056{\
1057 int j;\
1058 for(j=0; j<2; j++){\
1059 int i;\
1060 const uint32_t a= LD32(pixels );\
1061 const uint32_t b= LD32(pixels+1);\
1062 uint32_t l0= (a&0x03030303UL)\
1063 + (b&0x03030303UL)\
1064 + 0x01010101UL;\
1065 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1066 + ((b&0xFCFCFCFCUL)>>2);\
1067 uint32_t l1,h1;\
1068\
1069 pixels+=line_size;\
1070 for(i=0; i<h; i+=2){\
1071 uint32_t a= LD32(pixels );\
1072 uint32_t b= LD32(pixels+1);\
1073 l1= (a&0x03030303UL)\
1074 + (b&0x03030303UL);\
1075 h1= ((a&0xFCFCFCFCUL)>>2)\
1076 + ((b&0xFCFCFCFCUL)>>2);\
1077 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1078 pixels+=line_size;\
1079 block +=line_size;\
1080 a= LD32(pixels );\
1081 b= LD32(pixels+1);\
1082 l0= (a&0x03030303UL)\
1083 + (b&0x03030303UL)\
1084 + 0x01010101UL;\
1085 h0= ((a&0xFCFCFCFCUL)>>2)\
1086 + ((b&0xFCFCFCFCUL)>>2);\
1087 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1088 pixels+=line_size;\
1089 block +=line_size;\
1090 }\
1091 pixels+=4-line_size*(h+1);\
1092 block +=4-line_size*h;\
1093 }\
1094}\
1095\
45553457
ZK
1096CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1097CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1098CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1099CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1100CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1101CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1102CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1103CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
b3184779 1104
d8085ea7 1105#define op_avg(a, b) a = rnd_avg32(a, b)
59fe111e 1106#endif
59fe111e
MN
1107#define op_put(a, b) a = b
1108
1109PIXOP2(avg, op_avg)
1110PIXOP2(put, op_put)
1111#undef op_avg
1112#undef op_put
1113
de6d9b64
FB
/* Rounded integer averages of two / four values (round half up).
 * Every argument is parenthesized so that expression arguments such as
 * avg2(x&3, y) are not re-associated by operator precedence. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1116
c0a0170c
MN
/**
 * Single-stride wrapper around put_no_rnd_pixels16_l2(): the destination and
 * both sources are walked with the same line stride.
 *
 * @param dst    destination block
 * @param a      first source block
 * @param b      second source block
 * @param stride line stride shared by dst, a and b
 * @param h      number of lines, forwarded unchanged
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
1120
/**
 * Single-stride wrapper around put_no_rnd_pixels8_l2(): the destination and
 * both sources are walked with the same line stride.
 *
 * @param dst    destination block
 * @param a      first source block
 * @param b      second source block
 * @param stride line stride shared by dst, a and b
 * @param h      number of lines, forwarded unchanged
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
073b013d 1124
/**
 * Bilinear interpolation of an 8-pixel-wide block with one fixed
 * fractional offset given in 1/16 sample units.
 *
 * The four weights A..D always sum to 256, hence the final >>8.
 *
 * @param dst     destination, 8 bytes written per line
 * @param src     source; columns 0..8 of lines 0..h are read
 * @param stride  line stride used for both dst and src
 * @param h       number of lines to produce
 * @param x16     horizontal fraction (weights computed as 16-x16 / x16)
 * @param y16     vertical fraction (weights computed as 16-y16 / y16)
 * @param rounder value added before the >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i, j;

    for(i=0; i<h; i++){
        for(j=0; j<8; j++)
            dst[j]= (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
1147
/**
 * Per-pixel warped fetch of an 8-pixel-wide block with bilinear filtering
 * and edge clamping.
 *
 * For every output pixel a position (vx,vy) is tracked; vx>>16 and vy>>16
 * give the position in units of 1/(1<<shift) sample. Positions whose
 * integer part falls outside [0,width-1) / [0,height-1) are clamped with
 * clip() and filtered only along the axis that is still inside (or simply
 * copied when both axes are outside).
 *
 * @param dst     destination, 8 bytes written per line
 * @param src     source plane
 * @param stride  line stride used for both dst and src
 * @param h       number of output lines
 * @param ox,oy   start position of the first output pixel
 * @param dxx,dyx increment of (vx,vy) per output column
 * @param dxy,dyy increment of (ox,oy) per output line
 * @param shift   log2 of the sub-sample resolution
 * @param r       value added before the >>(2*shift)
 * @param width   source width; decremented locally before use as bound
 * @param height  source height; decremented locally before use as bound
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* from here on width/height are the last valid column/row index */
    width--;
    height--;

    for(y=0; y<h; y++){
        uint8_t *out= dst + y*stride;
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* the unsigned compare rejects negative coordinates as well */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2D bilinear */
                    index= src_x + src_y*stride;
                    out[x]= (  (  src[index         ]*(s-frac_x)
                                + src[index       +1]*   frac_x )*(s-frac_y)
                             + (  src[index+stride  ]*(s-frac_x)
                                + src[index+stride+1]*   frac_x )*   frac_y
                             + r)>>(shift*2);
                }else{
                    /* row clamped: horizontal filter only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    out[x]= ( (  src[index  ]*(s-frac_x)
                               + src[index+1]*   frac_x )*s
                             + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* column clamped: vertical filter only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    out[x]= ( (  src[index       ]*(s-frac_y)
                               + src[index+stride]*   frac_y )*s
                             + r)>>(shift*2);
                }else{
                    /* both clamped: plain copy of the nearest edge sample */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    out[x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
669ac79c
MN
1205
/**
 * Full-sample (no interpolation) tpel case: dispatches on the block width
 * to the matching put_pixelsN_c() copy routine.
 * Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    default: break; /* unsupported width: leave dst untouched */
    }
}
1214
/**
 * Horizontal 2:1 blend of each sample with its right neighbour:
 * dst = (2*cur + right + 1) / 3, the division done as *683 >> 11
 * (683/2048 ~= 1/3).
 * Reads width+1 columns per line; dst and src share the same stride.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1225
/**
 * Horizontal 1:2 blend of each sample with its right neighbour:
 * dst = (cur + 2*right + 1) / 3, the division done as *683 >> 11
 * (683/2048 ~= 1/3).
 * Reads width+1 columns per line; dst and src share the same stride.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
115329f1 1236
669ac79c
MN
/**
 * Vertical 2:1 blend of each sample with the one below it:
 * dst = (2*cur + below + 1) / 3, the division done as *683 >> 11
 * (683/2048 ~= 1/3).
 * Reads height+1 lines; dst and src share the same stride.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
115329f1 1247
669ac79c
MN
/**
 * 2x2 weighted blend with weights 4 (cur), 3 (right), 3 (below),
 * 2 (below-right). The weights sum to 12; the division by 12 is done as
 * *2731 >> 15 (2731/32768 ~= 1/12) after adding 6 for rounding.
 * Reads width+1 columns of height+1 lines; dst and src share the stride.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1258
/**
 * 2x2 weighted blend with weights 3 (cur), 2 (right), 4 (below),
 * 3 (below-right). The weights sum to 12; the division by 12 is done as
 * *2731 >> 15 (2731/32768 ~= 1/12) after adding 6 for rounding.
 * Reads width+1 columns of height+1 lines; dst and src share the stride.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1269
/**
 * Vertical 1:2 blend of each sample with the one below it:
 * dst = (cur + 2*below + 1) / 3, the division done as *683 >> 11
 * (683/2048 ~= 1/3).
 * Reads height+1 lines; dst and src share the same stride.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1280
/**
 * 2x2 weighted blend with weights 3 (cur), 4 (right), 2 (below),
 * 3 (below-right). The weights sum to 12; the division by 12 is done as
 * *2731 >> 15 (2731/32768 ~= 1/12) after adding 6 for rounding.
 * Reads width+1 columns of height+1 lines; dst and src share the stride.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1291
/**
 * 2x2 weighted blend with weights 2 (cur), 3 (right), 3 (below),
 * 4 (below-right). The weights sum to 12; the division by 12 is done as
 * *2731 >> 15 (2731/32768 ~= 1/12) after adding 6 for rounding.
 * Reads width+1 columns of height+1 lines; dst and src share the stride.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
da3b9756
MM
1302
/**
 * Full-sample (no interpolation) averaging tpel case: dispatches on the
 * block width to the matching avg_pixelsN_c() routine.
 * Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    default: break; /* unsupported width: leave dst untouched */
    }
}
1311
/**
 * Averaging variant of the horizontal 2:1 blend: the interpolated value
 * (2*cur + right + 1)*683 >> 11 is averaged, rounding up, with the value
 * already in dst.
 * Reads width+1 columns per line; dst and src share the same stride.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1322
/**
 * Averaging variant of the horizontal 1:2 blend: the interpolated value
 * (cur + 2*right + 1)*683 >> 11 is averaged, rounding up, with the value
 * already in dst.
 * Reads width+1 columns per line; dst and src share the same stride.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
115329f1 1333
da3b9756
MM
/**
 * Averaging variant of the vertical 2:1 blend: the interpolated value
 * (2*cur + below + 1)*683 >> 11 is averaged, rounding up, with the value
 * already in dst.
 * Reads height+1 lines; dst and src share the same stride.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
115329f1 1344
da3b9756
MM
/**
 * Averaging variant of the 4/3/3/2 weighted 2x2 blend (cur, right, below,
 * below-right; sum 12, divided via *2731 >> 15): the interpolated value is
 * averaged, rounding up, with the value already in dst.
 * Reads width+1 columns of height+1 lines; dst and src share the stride.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1355
/**
 * Averaging variant of the 3/2/4/3 weighted 2x2 blend (cur, right, below,
 * below-right; sum 12, divided via *2731 >> 15): the interpolated value is
 * averaged, rounding up, with the value already in dst.
 * Reads width+1 columns of height+1 lines; dst and src share the stride.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1366
/**
 * Averaging variant of the vertical 1:2 blend: the interpolated value
 * (cur + 2*below + 1)*683 >> 11 is averaged, rounding up, with the value
 * already in dst.
 * Reads height+1 lines; dst and src share the same stride.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1377
/**
 * Averaging variant of the 3/4/2/3 weighted 2x2 blend (cur, right, below,
 * below-right; sum 12, divided via *2731 >> 15): the interpolated value is
 * averaged, rounding up, with the value already in dst.
 * Reads width+1 columns of height+1 lines; dst and src share the stride.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1388
/**
 * Averaging variant of the 2/3/3/4 weighted 2x2 blend (cur, right, below,
 * below-right; sum 12, divided via *2731 >> 15): the interpolated value is
 * averaged, rounding up, with the value already in dst.
 * Reads width+1 columns of height+1 lines; dst and src share the stride.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
669ac79c
MN
#if 0
/* Disabled. TPEL_WIDTH(width) would generate fixed-width wrappers
 * put_tpel_pixels<width>_mcXY_c(dst, src, stride, height) that forward to
 * the generic put_tpel_pixels_mcXY_c() with the width filled in.
 * The forwarding statements are plain calls (the earlier text had a stray
 * "void" in front of each call, which is not valid C). */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1420
0da71265
MN
/**
 * Generates OPNAME##h264_chroma_mc{2,4,8}_c(): bilinear interpolation of a
 * 2-, 4- or 8-pixel-wide block with a fractional offset (x,y) in 1/8 sample
 * units, 0 <= x,y < 8 (checked with assert).
 *
 * The four weights A..D sum to 64. OP(dst, v) receives the unnormalized
 * weighted sum v and is responsible for rounding and the >>6 (see op_put /
 * op_avg below). Each function reads one column more than it writes and one
 * line more than h; dst and src share the same stride.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++){\
        for(j=0; j<2; j++){\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++){\
        for(j=0; j<4; j++){\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++){\
        for(j=0; j<8; j++){\
            OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}

/* op_put stores the rounded, normalized sum; op_avg additionally averages it
 * (rounding up) with the value already in the destination. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1491
e34350a3
KS
/**
 * 8-pixel-wide bilinear chroma interpolation with a fractional offset (x,y)
 * in 1/8 sample units, 0 <= x,y < 8 (checked with assert), using a reduced
 * rounding bias: 32-4 = 28 is added before the >>6 instead of 32.
 *
 * Reads columns 0..8 of lines 0..h; dst and src share the same stride.
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i, j;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++){
        for(j=0; j<8; j++)
            dst[j] = (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
1515
b3184779 1516#define QPEL_MC(r, OPNAME, RND, OP) \
0c1a9eda 1517static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
55fde95e 1518 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
b3184779
MN
1519 int i;\
1520 for(i=0; i<h; i++)\
1521 {\
1522 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1523 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1524 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1525 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1526 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1527 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1528 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1529 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1530 dst+=dstStride;\
1531 src+=srcStride;\
1532 }\
44eb4951
MN
1533}\
1534\
0c1a9eda 1535static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
db794953 1536 const int w=8;\
55fde95e 1537 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
b3184779
MN
1538 int i;\
1539 for(i=0; i<w; i++)\
1540 {\
1541 const int src0= src[0*srcStride];\
1542 const int src1= src[1*srcStride];\
1543 const int src2= src[2*srcStride];\
1544 const int src3= src[3*srcStride];\
1545 const int src4= src[4*srcStride];\
1546 const int src5= src[5*srcStride];\
1547 const int src6= src[6*srcStride];\
1548 const int src7= src[7*srcStride];\
1549 const int src8= src[8*srcStride];\
1550 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1551 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1552 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1553 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1554 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1555 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1556 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1557 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1558 dst++;\
1559 src++;\
1560 }\
1561}\
1562\
0c1a9eda 1563static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
55fde95e 1564 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
b3184779 1565 int i;\
826f429a 1566 \
b3184779
MN
1567 for(i=0; i<h; i++)\
1568 {\
1569 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1570 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1571 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1572 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1573 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1574 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1575 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1576 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1577 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1578 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1579 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1580 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1581 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1582 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1583 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1584 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1585 dst+=dstStride;\
1586 src+=srcStride;\
1587 }\
1588}\
1589\
0c1a9eda 1590static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
55fde95e 1591 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
b3184779 1592 int i;\
826f429a 1593 const int w=16;\
b3184779
MN
1594 for(i=0; i<w; i++)\
1595 {\
1596 const int src0= src[0*srcStride];\
1597 const int src1= src[1*srcStride];\
1598 const int src2= src[2*srcStride];\
1599 const int src3= src[3*srcStride];\
1600 const int src4= src[4*srcStride];\
1601 const int src5= src[5*srcStride];\
1602 const int src6= src[6*srcStride];\
1603 const int src7= src[7*srcStride];\
1604 const int src8= src[8*srcStride];\
1605 const int src9= src[9*srcStride];\
1606 const int src10= src[10*srcStride];\
1607 const int src11= src[11*srcStride];\
1608 const int src12= src[12*srcStride];\
1609 const int src13= src[13*srcStride];\
1610 const int src14= src[14*srcStride];\
1611 const int src15= src[15*srcStride];\
1612 const int src16= src[16*srcStride];\
1613 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1614 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1615 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1616 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1617 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1618 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1619 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1620 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1621 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1622 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1623 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1624 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1625 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1626 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1627 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1628 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1629 dst++;\
1630 src++;\
1631 }\
1632}\
1633\
0c1a9eda 1634static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1635 OPNAME ## pixels8_c(dst, src, stride, 8);\
b3184779
MN
1636}\
1637\
0c1a9eda
ZK
1638static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1639 uint8_t half[64];\
b3184779
MN
1640 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1641 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
44eb4951
MN
1642}\
1643\
0c1a9eda 1644static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1645 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
44eb4951
MN
1646}\
1647\
0c1a9eda
ZK
1648static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1649 uint8_t half[64];\
b3184779
MN
1650 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1651 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
44eb4951
MN
1652}\
1653\
0c1a9eda
ZK
1654static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1655 uint8_t full[16*9];\
1656 uint8_t half[64];\
b3184779 1657 copy_block9(full, src, 16, stride, 9);\
db794953 1658 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1659 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
44eb4951
MN
1660}\
1661\
0c1a9eda
ZK
1662static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1663 uint8_t full[16*9];\
b3184779 1664 copy_block9(full, src, 16, stride, 9);\
db794953 1665 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
44eb4951
MN
1666}\
1667\
0c1a9eda
ZK
1668static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1669 uint8_t full[16*9];\
1670 uint8_t half[64];\
b3184779 1671 copy_block9(full, src, 16, stride, 9);\
db794953 1672 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
b3184779 1673 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
44eb4951 1674}\
0c1a9eda
ZK
1675void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1676 uint8_t full[16*9];\
1677 uint8_t halfH[72];\
1678 uint8_t halfV[64];\
1679 uint8_t halfHV[64];\
b3184779
MN
1680 copy_block9(full, src, 16, stride, 9);\
1681 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1682 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1683 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1684 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1685}\
0c1a9eda
ZK
1686static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1687 uint8_t full[16*9];\
1688 uint8_t halfH[72];\
1689 uint8_t halfHV[64];\
db794953
MN
1690 copy_block9(full, src, 16, stride, 9);\
1691 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1692 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1693 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1694 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1695}\
0c1a9eda
ZK
1696void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1697 uint8_t full[16*9];\
1698 uint8_t halfH[72];\
1699 uint8_t halfV[64];\
1700 uint8_t halfHV[64];\
b3184779
MN
1701 copy_block9(full, src, 16, stride, 9);\
1702 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1703 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1704 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1705 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1706}\
0c1a9eda
ZK
1707static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1708 uint8_t full[16*9];\
1709 uint8_t halfH[72];\
1710 uint8_t halfHV[64];\
db794953
MN
1711 copy_block9(full, src, 16, stride, 9);\
1712 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1713 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1714 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1715 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1716}\
0c1a9eda
ZK
1717void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1718 uint8_t full[16*9];\
1719 uint8_t halfH[72];\
1720 uint8_t halfV[64];\
1721 uint8_t halfHV[64];\
b3184779
MN
1722 copy_block9(full, src, 16, stride, 9);\
1723 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1724 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1725 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1726 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1727}\
0c1a9eda
ZK
1728static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1729 uint8_t full[16*9];\
1730 uint8_t halfH[72];\
1731 uint8_t halfHV[64];\
db794953
MN
1732 copy_block9(full, src, 16, stride, 9);\
1733 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1734 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1735 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1736 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1737}\
0c1a9eda
ZK
1738void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1739 uint8_t full[16*9];\
1740 uint8_t halfH[72];\
1741 uint8_t halfV[64];\
1742 uint8_t halfHV[64];\
b3184779
MN
1743 copy_block9(full, src, 16, stride, 9);\
1744 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
db794953
MN
1745 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1746 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1747 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
44eb4951 1748}\
0c1a9eda
ZK
1749static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1750 uint8_t full[16*9];\
1751 uint8_t halfH[72];\
1752 uint8_t halfHV[64];\
db794953
MN
1753 copy_block9(full, src, 16, stride, 9);\
1754 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1755 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1756 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1757 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1758}\
0c1a9eda
ZK
1759static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1760 uint8_t halfH[72];\
1761 uint8_t halfHV[64];\
b3184779 1762 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1763 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1764 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
44eb4951 1765}\
0c1a9eda
ZK
1766static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1767 uint8_t halfH[72];\
1768 uint8_t halfHV[64];\
b3184779 1769 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1770 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1771 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
44eb4951 1772}\
0c1a9eda
ZK
1773void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1774 uint8_t full[16*9];\
1775 uint8_t halfH[72];\
1776 uint8_t halfV[64];\
1777 uint8_t halfHV[64];\
b3184779
MN
1778 copy_block9(full, src, 16, stride, 9);\
1779 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1780 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1781 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1782 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1783}\
0c1a9eda
ZK
1784static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1785 uint8_t full[16*9];\
1786 uint8_t halfH[72];\
db794953
MN
1787 copy_block9(full, src, 16, stride, 9);\
1788 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1789 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1790 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1791}\
0c1a9eda
ZK
1792void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1793 uint8_t full[16*9];\
1794 uint8_t halfH[72];\
1795 uint8_t halfV[64];\
1796 uint8_t halfHV[64];\
b3184779
MN
1797 copy_block9(full, src, 16, stride, 9);\
1798 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
db794953
MN
1799 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1800 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
b3184779 1801 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
44eb4951 1802}\
0c1a9eda
ZK
1803static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1804 uint8_t full[16*9];\
1805 uint8_t halfH[72];\
db794953
MN
1806 copy_block9(full, src, 16, stride, 9);\
1807 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1809 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1810}\
0c1a9eda
ZK
1811static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1812 uint8_t halfH[72];\
b3184779 1813 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
db794953 1814 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
b3184779 1815}\
0c1a9eda 1816static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
45553457 1817 OPNAME ## pixels16_c(dst, src, stride, 16);\
b3184779
MN
1818}\
1819\
0c1a9eda
ZK
1820static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1821 uint8_t half[256];\
b3184779
MN
1822 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1823 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1824}\
1825\
0c1a9eda 1826static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
b3184779 1827 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
44eb4951 1828}\
b3184779 1829\
0c1a9eda
ZK
1830static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1831 uint8_t half[256];\
b3184779
MN
1832 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1833 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1834}\
1835\
0c1a9eda
ZK
1836static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1837 uint8_t full[24*17];\
1838 uint8_t half[256];\
b3184779 1839 copy_block17(full, src, 24, stride, 17);\
826f429a 1840 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1841 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1842}\
1843\
0c1a9eda
ZK
1844static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1845 uint8_t full[24*17];\
b3184779 1846 copy_block17(full, src, 24, stride, 17);\
826f429a 1847 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
b3184779
MN
1848}\
1849\
0c1a9eda
ZK
1850static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1851 uint8_t full[24*17];\
1852 uint8_t half[256];\
b3184779 1853 copy_block17(full, src, 24, stride, 17);\
826f429a 1854 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
b3184779
MN
1855 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1856}\
0c1a9eda
ZK
1857void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1858 uint8_t full[24*17];\
1859 uint8_t halfH[272];\
1860 uint8_t halfV[256];\
1861 uint8_t halfHV[256];\
b3184779
MN
1862 copy_block17(full, src, 24, stride, 17);\
1863 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1864 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1865 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1866 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1867}\
0c1a9eda
ZK
1868static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1869 uint8_t full[24*17];\
1870 uint8_t halfH[272];\
1871 uint8_t halfHV[256];\
db794953
MN
1872 copy_block17(full, src, 24, stride, 17);\
1873 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1874 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1875 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1876 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1877}\
0c1a9eda
ZK
1878void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1879 uint8_t full[24*17];\
1880 uint8_t halfH[272];\
1881 uint8_t halfV[256];\
1882 uint8_t halfHV[256];\
b3184779
MN
1883 copy_block17(full, src, 24, stride, 17);\
1884 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1885 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1886 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1887 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1888}\
0c1a9eda
ZK
1889static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1890 uint8_t full[24*17];\
1891 uint8_t halfH[272];\
1892 uint8_t halfHV[256];\
db794953
MN
1893 copy_block17(full, src, 24, stride, 17);\
1894 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1895 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1896 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1897 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1898}\
0c1a9eda
ZK
1899void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1900 uint8_t full[24*17];\
1901 uint8_t halfH[272];\
1902 uint8_t halfV[256];\
1903 uint8_t halfHV[256];\
b3184779
MN
1904 copy_block17(full, src, 24, stride, 17);\
1905 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1906 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1907 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1908 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1909}\
0c1a9eda
ZK
1910static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1911 uint8_t full[24*17];\
1912 uint8_t halfH[272];\
1913 uint8_t halfHV[256];\
db794953
MN
1914 copy_block17(full, src, 24, stride, 17);\
1915 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1916 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1917 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1918 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1919}\
0c1a9eda
ZK
1920void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1921 uint8_t full[24*17];\
1922 uint8_t halfH[272];\
1923 uint8_t halfV[256];\
1924 uint8_t halfHV[256];\
b3184779
MN
1925 copy_block17(full, src, 24, stride, 17);\
1926 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
826f429a
MN
1927 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1928 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1929 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1930}\
0c1a9eda
ZK
1931static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1932 uint8_t full[24*17];\
1933 uint8_t halfH[272];\
1934 uint8_t halfHV[256];\
db794953
MN
1935 copy_block17(full, src, 24, stride, 17);\
1936 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1937 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1938 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1939 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1940}\
0c1a9eda
ZK
1941static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1942 uint8_t halfH[272];\
1943 uint8_t halfHV[256];\
b3184779 1944 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1945 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1946 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1947}\
0c1a9eda
ZK
1948static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1949 uint8_t halfH[272];\
1950 uint8_t halfHV[256];\
b3184779 1951 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1952 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1953 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1954}\
0c1a9eda
ZK
1955void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1956 uint8_t full[24*17];\
1957 uint8_t halfH[272];\
1958 uint8_t halfV[256];\
1959 uint8_t halfHV[256];\
b3184779
MN
1960 copy_block17(full, src, 24, stride, 17);\
1961 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1962 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1963 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1964 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1965}\
0c1a9eda
ZK
1966static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1967 uint8_t full[24*17];\
1968 uint8_t halfH[272];\
db794953
MN
1969 copy_block17(full, src, 24, stride, 17);\
1970 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1971 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1972 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1973}\
0c1a9eda
ZK
1974void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1975 uint8_t full[24*17];\
1976 uint8_t halfH[272];\
1977 uint8_t halfV[256];\
1978 uint8_t halfHV[256];\
b3184779
MN
1979 copy_block17(full, src, 24, stride, 17);\
1980 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
826f429a
MN
1981 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1982 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
b3184779
MN
1983 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1984}\
0c1a9eda
ZK
1985static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1986 uint8_t full[24*17];\
1987 uint8_t halfH[272];\
db794953
MN
1988 copy_block17(full, src, 24, stride, 17);\
1989 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1990 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1991 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1992}\
0c1a9eda
ZK
1993static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1994 uint8_t halfH[272];\
b3184779 1995 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
826f429a 1996 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
45553457 1997}
44eb4951 1998
b3184779
MN
/*
 * Store operators handed to QPEL_MC() as its last argument.
 *   a - destination lvalue (assigned in place)
 *   b - filter sum; it is biased, shifted right by 5 (i.e. divided by 32)
 *       and used as an index into `cm`
 * `cm` is not a parameter: it must be a table pointer in scope wherever the
 * operator is expanded (presumably the crop/clamp table - confirm in the
 * lowpass functions generated by QPEL_MC).
 *
 *   op_put         a = cm[(b + 16) >> 5]                  (bias 16)
 *   op_put_no_rnd  a = cm[(b + 15) >> 5]                  (bias 15)
 *   op_avg         a = (a + cm[(b + 16) >> 5] + 1) >> 1   (average, +1 bias)
 *   op_avg_no_rnd  a = (a + cm[(b + 15) >> 5]) >> 1       (average, no bias)
 *
 * The macro arguments are not parenthesised as a whole and `a` is evaluated
 * more than once in the avg forms, so arguments must be side-effect free.
 */
1999#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2000#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2001#define op_put(a, b) a = cm[((b) + 16)>>5]
2002#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2003
/*
 * Instantiate the QPEL_MC function families: put_, put_no_rnd_ and avg_.
 * The avg_no_rnd instantiation below is disabled, so op_avg_no_rnd is defined
 * but not used by any active instantiation here (note the disabled line also
 * names op_avg rather than op_avg_no_rnd).
 */
2004QPEL_MC(0, put_ , _ , op_put)
2005QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2006QPEL_MC(0, avg_ , _ , op_avg)
2007//QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* The operators are only meant for the instantiations above; drop them. */
2008#undef op_avg
2009#undef op_avg_no_rnd
2010#undef op_put
2011#undef op_put_no_rnd
44eb4951 2012
0da71265
MN
2013#if 1
/**
 * H264_LOWPASS(OPNAME, OP, OP2)
 *
 * Expands to the static functions
 *     OPNAME h264_qpel{2,4,8,16}_{h,v,hv}_lowpass
 * which apply the 6-tap filter (1, -5, 20, 20, -5, 1) to a square block of
 * 2x2, 4x4, 8x8 or 16x16 uint8_t samples.
 *
 * Macro parameters:
 *   OPNAME - identifier prefix pasted in front of every generated function.
 *   OP     - statement macro OP(dst_lvalue, value) used by the single-pass
 *            (h and v) filters. The taps sum to 32, so `value` is the
 *            filtered sample scaled by 32; normalisation is left to OP.
 *   OP2    - statement macro OP2(dst_lvalue, value) used by the two-pass (hv)
 *            filters, where `value` is scaled by 32*32 = 1024.
 *
 * Every 2/4/8 function declares a local `cm = ff_cropTbl + MAX_NEG_CROP` that
 * the bodies never reference directly; presumably the OP/OP2 macros supplied
 * at instantiation index it to clamp the result - confirm at the
 * instantiation site.
 *
 * Memory touched, for an NxN block:
 *   _h_lowpass  reads src[-2 .. N+2] on each of N rows.
 *   _v_lowpass  reads rows -2 .. N+2 of each of N columns.
 *   _hv_lowpass reads src[-2 .. N+2] on rows -2 .. N+2, and writes N+5 rows of
 *               N int16_t intermediates into tmp (row pitch tmpStride).
 * With 8-bit input an intermediate lies in [-2550, 10710], so it fits int16_t.
 *
 * dstStride, srcStride and tmpStride are expressed in elements of the
 * respective buffer.
 */
2014#define H264_LOWPASS(OPNAME, OP, OP2) \
80e44bc3
MN
/* 2x2 block, horizontal filter: one output row per loop iteration. */\
2015static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2016 const int h=2;\
55fde95e 2017 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
80e44bc3
MN
2018 int i;\
2019 for(i=0; i<h; i++)\
2020 {\
2021 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2022 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2023 dst+=dstStride;\
2024 src+=srcStride;\
2025 }\
2026}\
2027\
/* 2x2 block, vertical filter: one output column per loop iteration. */\
2028static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2029 const int w=2;\
55fde95e 2030 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
80e44bc3
MN
2031 int i;\
2032 for(i=0; i<w; i++)\
2033 {\
2034 const int srcB= src[-2*srcStride];\
2035 const int srcA= src[-1*srcStride];\
2036 const int src0= src[0 *srcStride];\
2037 const int src1= src[1 *srcStride];\
2038 const int src2= src[2 *srcStride];\
2039 const int src3= src[3 *srcStride];\
2040 const int src4= src[4 *srcStride];\
2041 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2042 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2043 dst++;\
2044 src++;\
2045 }\
2046}\
2047\
/* 2x2 block, horizontal then vertical filter through the int16_t scratch buffer tmp. */\
2048static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2049 const int h=2;\
2050 const int w=2;\
55fde95e 2051 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
80e44bc3
MN
2052 int i;\
/* Pass 1: start two rows above the block and filter h+5 rows horizontally, unnormalised. */\
2053 src -= 2*srcStride;\
2054 for(i=0; i<h+5; i++)\
2055 {\
2056 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2057 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2058 tmp+=tmpStride;\
2059 src+=srcStride;\
2060 }\
/* tmp advanced h+5 rows; step back h+3 so it points at scratch row 2, i.e. block row 0. */\
2061 tmp -= tmpStride*(h+5-2);\
/* Pass 2: filter the intermediates vertically, one column per iteration. */\
2062 for(i=0; i<w; i++)\
2063 {\
2064 const int tmpB= tmp[-2*tmpStride];\
2065 const int tmpA= tmp[-1*tmpStride];\
2066 const int tmp0= tmp[0 *tmpStride];\
2067 const int tmp1= tmp[1 *tmpStride];\
2068 const int tmp2= tmp[2 *tmpStride];\
2069 const int tmp3= tmp[3 *tmpStride];\
2070 const int tmp4= tmp[4 *tmpStride];\
2071 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2072 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2073 dst++;\
2074 tmp++;\
2075 }\
2076}\
0da71265
MN
/* 4x4 block, horizontal filter. */\
2077static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2078 const int h=4;\
55fde95e 2079 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
0da71265
MN
2080 int i;\
2081 for(i=0; i<h; i++)\
2082 {\
2083 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2084 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2085 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2086 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2087 dst+=dstStride;\
2088 src+=srcStride;\
2089 }\
2090}\
2091\
/* 4x4 block, vertical filter. */\
2092static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2093 const int w=4;\
55fde95e 2094 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
0da71265
MN
2095 int i;\
2096 for(i=0; i<w; i++)\
2097 {\
2098 const int srcB= src[-2*srcStride];\
2099 const int srcA= src[-1*srcStride];\
2100 const int src0= src[0 *srcStride];\
2101 const int src1= src[1 *srcStride];\
2102 const int src2= src[2 *srcStride];\
2103 const int src3= src[3 *srcStride];\
2104 const int src4= src[4 *srcStride];\
2105 const int src5= src[5 *srcStride];\
2106 const int src6= src[6 *srcStride];\
2107 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2108 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2109 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2110 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2111 dst++;\
2112 src++;\
2113 }\
2114}\
2115\
/* 4x4 block, horizontal then vertical filter; same two-pass scheme as the 2x2 hv variant. */\
2116static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2117 const int h=4;\
2118 const int w=4;\
55fde95e 2119 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
0da71265
MN
2120 int i;\
2121 src -= 2*srcStride;\
2122 for(i=0; i<h+5; i++)\
2123 {\
2124 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2125 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2126 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2127 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2128 tmp+=tmpStride;\
2129 src+=srcStride;\
2130 }\
/* Rewind to scratch row 2, which holds the intermediates for block row 0. */\
2131 tmp -= tmpStride*(h+5-2);\
2132 for(i=0; i<w; i++)\
2133 {\
2134 const int tmpB= tmp[-2*tmpStride];\
2135 const int tmpA= tmp[-1*tmpStride];\
2136 const int tmp0= tmp[0 *tmpStride];\
2137 const int tmp1= tmp[1 *tmpStride];\
2138 const int tmp2= tmp[2 *tmpStride];\
2139 const int tmp3= tmp[3 *tmpStride];\
2140 const int tmp4= tmp[4 *tmpStride];\
2141 const int tmp5= tmp[5 *tmpStride];\
2142 const int tmp6= tmp[6 *tmpStride];\
2143 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2144 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2145 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2146 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2147 dst++;\
2148 tmp++;\
2149 }\
2150}\
2151\
/* 8x8 block, horizontal filter. */\
2152static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2153 const int h=8;\
55fde95e 2154 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
0da71265
MN
2155 int i;\
2156 for(i=0; i<h; i++)\
2157 {\
2158 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2159 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2160 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2161 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2162 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2163 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2164 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2165 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2166 dst+=dstStride;\
2167 src+=srcStride;\
2168 }\
2169}\
2170\
/* 8x8 block, vertical filter. */\
2171static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2172 const int w=8;\
55fde95e 2173 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
0da71265
MN
2174 int i;\
2175 for(i=0; i<w; i++)\
2176 {\
2177 const int srcB= src[-2*srcStride];\
2178 const int srcA= src[-1*srcStride];\
2179 const int src0= src[0 *srcStride];\
2180 const int src1= src[1 *srcStride];\
2181 const int src2= src[2 *srcStride];\
2182 const int src3= src[3 *srcStride];\
2183 const int src4= src[4 *srcStride];\
2184 const int src5= src[5 *srcStride];\
2185 const int src6= src[6 *srcStride];\
2186 const int src7= src[7 *srcStride];\
2187 const int src8= src[8 *srcStride];\
2188 const int src9= src[9 *srcStride];\
2189 const int src10=src[10*srcStride];\
2190 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2191 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2192 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2193 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2194 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2195 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2196 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2197 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2198 dst++;\
2199 src++;\
2200 }\
2201}\
2202\
/* 8x8 block, horizontal then vertical filter; same two-pass scheme as the 2x2 hv variant. */\
2203static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2204 const int h=8;\
2205 const int w=8;\
55fde95e 2206 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
0da71265
MN
2207 int i;\
2208 src -= 2*srcStride;\
2209 for(i=0; i<h+5; i++)\
2210 {\
2211 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2212 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2213 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2214 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2215 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2216 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2217 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2218 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2219 tmp+=tmpStride;\
2220 src+=srcStride;\
2221 }\
/* Rewind to scratch row 2, which holds the intermediates for block row 0. */\
2222 tmp -= tmpStride*(h+5-2);\
2223 for(i=0; i<w; i++)\
2224 {\
2225 const int tmpB= tmp[-2*tmpStride];\
2226 const int tmpA= tmp[-1*tmpStride];\
2227 const int tmp0= tmp[0 *tmpStride];\
2228 const int tmp1= tmp[1 *tmpStride];\
2229 const int tmp2= tmp[2 *tmpStride];\
2230 const int tmp3= tmp[3 *tmpStride];\
2231 const int tmp4= tmp[4 *tmpStride];\
2232 const int tmp5= tmp[5 *tmpStride];\
2233 const int tmp6= tmp[6 *tmpStride];\
2234 const int tmp7= tmp[7 *tmpStride];\
2235 const int tmp8= tmp[8 *tmpStride];\
2236 const int tmp9= tmp[9 *tmpStride];\
2237 const int tmp10=tmp[10*tmpStride];\
2238 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2239 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2240 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2241 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2242 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2243 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2244 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2245 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2246 dst++;\
2247 tmp++;\
2248 }\
2249}\
2250\
/* 16x16 vertical filter built from four 8x8 calls: top-left, top-right, then the same 8 rows lower. */\
2251static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2252 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2253 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2254 src += 8*srcStride;\
2255 dst += 8*dstStride;\
2256 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2257 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2258}\
2259\
/* 16x16 horizontal filter built from four 8x8 calls, in the same quadrant order. */\
2260static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2261 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2262 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2263 src += 8*srcStride;\
2264 dst += 8*dstStride;\
2265 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2266 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2267}\
2268\
/* 16x16 two-pass filter built from four 8x8 calls. tmp is not advanced for the lower half, */\
/* so the lower quadrants overwrite the scratch rows used by the upper ones. */\
2269static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2270 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2271 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2272 src += 8*srcStride;\
0da71265
MN
2273 dst += 8*dstStride;\
2274 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2275 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2276}\
2277
2278#define H264_MC(OPNAME, SIZE) \
2279static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2280 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2281}\
2282\
2283static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2284 uint8_t half[SIZE*SIZE];\
2285 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2286 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2287}\
2288\
2289static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2290 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2291}\
2292\
2293static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2294 uint8_t half[SIZE*SIZE];\
2295 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2296 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2297}\
2298\
2299static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2300 uint8_t full[SIZE*(SIZE+5)];\
2301 uint8_t * const full_mid= full + SIZE*2;\
2302 uint8_t half[SIZE*SIZE];\
2303 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2304 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2305 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2306}\
2307\
2308static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2309 uint8_t full[SIZE*(SIZE+5)];\
2310 uint8_t * const full_mid= full + SIZE*2;\
2311 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2312 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2313}\
2314\
2315static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2316 uint8_t full[SIZE*(SIZE+5)];\
2317 uint8_t * const full_mid= full + SIZE*2;\
2318 uint8_t half[SIZE*SIZE];\
2319 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2320 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2321 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2322}\
2323\
2324static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2325 uint8_t full[SIZE*(SIZE+5)];\
2326 uint8_t * const full_mid= full + SIZE*2;\
2327 uint8_t halfH[SIZE*SIZE];\
2328 uint8_t halfV[SIZE*SIZE];\
2329 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2330 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2331 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2332 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2333}\
2334\
2335static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2336 uint8_t full[SIZE*(SIZE+5)];\
2337 uint8_t * const full_mid= full + SIZE*2;\
2338 uint8_t halfH[SIZE*SIZE];\
2339 uint8_t halfV[SIZE*SIZE];\
2340 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2341 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2342 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2343 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2344}\
2345\
2346static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2347 uint8_t full[SIZE*(SIZE+5)];\
2348 uint8_t * const full_mid= full + SIZE*2;\
2349 uint8_t halfH[SIZE*SIZE];\
2350 uint8_t halfV[SIZE*SIZE];\
2351 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2352 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2353 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2354 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2355}\
2356\
2357static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2358 uint8_t full[SIZE*(SIZE+5)];\
2359 uint8_t * const full_mid= full + SIZE*2;\
2360 uint8_t halfH[SIZE*SIZE];\
2361 uint8_t halfV[SIZE*SIZE];\
2362 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2363 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2364 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2365 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2366}\
2367\
2368static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2369 int16_t tmp[SIZE*(SIZE+5)];\
2370 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2371}\
2372\
2373static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2374 int16_t tmp[SIZE*(SIZE+5)];\
2375 uint8_t halfH[SIZE*SIZE];\
2376 uint8_t halfHV[SIZE*SIZE];\
2377 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2378 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2379 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2380}\
2381\
2382static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2383 int16_t tmp[SIZE*(SIZE+5)];\
2384 uint8_t halfH[SIZE*SIZE];\
2385 uint8_t halfHV[SIZE*SIZE];\
2386 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2387 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2388 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2389}\
2390\
2391static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2392 uint8_t full[SIZE*(SIZE+5)];\
2393 uint8_t * const full_mid= full + SIZE*2;\
2394 int16_t tmp[SIZE*(SIZE+5)];\
2395 uint8_t halfV[SIZE*SIZE];\
2396 uint8_t halfHV[SIZE*SIZE];\
2397 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2398 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2399 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2400 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2401}\
2402\
2403static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2404 uint8_t full[SIZE*(SIZE+5)];\
2405 uint8_t * const full_mid= full + SIZE*2;\
2406 int16_t tmp[SIZE*(SIZE+5)];\
2407 uint8_t halfV[SIZE*SIZE];\
2408 uint8_t halfHV[SIZE*SIZE];\
2409 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2410 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2411 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2412 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2413}\
2414
2415#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2416//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2417#define op_put(a, b) a = cm[((b) + 16)>>5]
2418#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2419#define op2_put(a, b) a = cm[((b) + 512)>>10]
2420
2421H264_LOWPASS(put_ , op_put, op2_put)
2422H264_LOWPASS(avg_ , op_avg, op2_avg)
80e44bc3 2423H264_MC(put_, 2)
0da71265
MN
2424H264_MC(put_, 4)
2425H264_MC(put_, 8)
2426H264_MC(put_, 16)
2427H264_MC(avg_, 4)
2428H264_MC(avg_, 8)
2429H264_MC(avg_, 16)
2430
2431#undef op_avg
2432#undef op_put
2433#undef op2_avg
2434#undef op2_put
2435#endif
2436
91c56db6
MN
/*
 * H.264 weighted prediction, plain C versions.
 *
 * op_scale1(x): scales pixel x of 'block' in place by 'weight', adds the
 *               pre-scaled 'offset', shifts down by log2_denom and stores
 *               the result through clip_uint8().
 * op_scale2(x): combines src[x]*weights and dst[x]*weightd, adds 'offset',
 *               shifts down by log2_denom+1 and stores into dst[x] through
 *               clip_uint8().
 *
 * Both helpers rely on the local variables of the functions generated by
 * H264_WEIGHT() and are #undef'ed again after the instantiations below.
 */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))

/*
 * H264_WEIGHT(W,H) emits two functions for a W x H block (W is 2, 4, 8 or 16):
 *
 *   weight_h264_pixels<W>x<H>_c(block, stride, log2_denom, weight, offset)
 *   biweight_h264_pixels<W>x<H>_c(dst, src, stride, log2_denom,
 *                                 weightd, weights, offset)
 *
 * 'stride' is the distance in bytes between two rows. The per-row work is
 * written out for 16 columns; the "if(W==n) continue;" tests compare
 * compile-time constants, so narrower blocks leave the row early.
 *
 * 'offset' may be negative. Left-shifting a negative signed value is
 * undefined behaviour in C, so the pre-scaling of 'offset' is carried out in
 * unsigned arithmetic and the result is converted back to int.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset = (unsigned)offset << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = (unsigned)((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2506
1457ab52 2507static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
55fde95e 2508 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1457ab52
MN
2509 int i;
2510
2511 for(i=0; i<h; i++){
2512 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2513 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2514 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2515 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2516 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2517 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2518 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2519 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2520 dst+=dstStride;
115329f1 2521 src+=srcStride;
1457ab52
MN
2522 }
2523}
2524
#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
/* Only declared here; the definition is not part of this section. */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * The four mc00 entry points below only forward to the generic
 * put/avg pixel routines with the block height filled in
 * (mc00 presumably being the case without sub-pel interpolation).
 */

/* 8-wide "put": forwards to put_pixels8_c() for 8 rows. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
/* 8-wide "avg": forwards to avg_pixels8_c() for 8 rows. */
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
/* 16-wide "put": forwards to put_pixels16_c() for 16 rows. */
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
/* 16-wide "avg": forwards to avg_pixels16_c() for 16 rows. */
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
b482e2d1 2542
64db55ae
KS
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
/* Only declared here; the definition is not part of this section. */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * mc00 case of the VC-1 mspel functions: forwards to put_pixels8_c()
 * for 8 rows. The rnd argument is accepted but not used here.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2551
1457ab52 2552static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
55fde95e 2553 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1457ab52
MN
2554 int i;
2555
2556 for(i=0; i<w; i++){
2557 const int src_1= src[ -srcStride];
2558 const int src0 = src[0 ];
2559 const int src1 = src[ srcStride];
2560 const int src2 = src[2*srcStride];
2561 const int src3 = src[3*srcStride];
2562 const int src4 = src[4*srcStride];
2563 const int src5 = src[5*srcStride];
2564 const int src6 = src[6*srcStride];
2565 const int src7 = src[7*srcStride];
2566 const int src8 = src[8*srcStride];
2567 const int src9 = src[9*srcStride];
2568 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2569 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2570 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2571 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2572 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2573 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2574 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2575 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2576 src++;
2577 dst++;
2578 }
2579}
2580
/*
 * WMV2 "mspel" 8x8 put functions. The two digits after "mc" presumably give
 * the horizontal and vertical sub-pel position (TODO confirm against the
 * table these functions are installed into). put_pixels8_c() and
 * put_pixels8_l2() are defined elsewhere in this file.
 */

/* mc00: forwards to put_pixels8_c() for 8 rows. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

/* mc10: horizontal lowpass into a packed 8x8 buffer, then put_pixels8_l2() of src and that buffer. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

/* mc20: horizontal lowpass written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

/* mc30: like mc10, but the lowpass result is combined with src+1 instead of src. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

/* mc02: vertical lowpass written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

/*
 * mc12: halfH holds 11 horizontally filtered rows of 8 pixels (88 bytes),
 * starting one row above src; halfH+8 is therefore the row matching src,
 * and the vertical filter may read one row above and two below the block.
 * The result is put_pixels8_l2() of the vertical-only and the
 * horizontal+vertical filtered blocks.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mc32: same as mc12, but the vertical-only filter starts at src+1. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mc22: horizontal lowpass into halfH (11 rows), then vertical lowpass of it into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2628
332f9ac4
MN
2629static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2630 int x;
2631 const int strength= ff_h263_loop_filter_strength[qscale];
115329f1 2632
332f9ac4
MN
2633 for(x=0; x<8; x++){
2634 int d1, d2, ad1;
2635 int p0= src[x-2*stride];
2636 int p1= src[x-1*stride];
2637 int p2= src[x+0*stride];
2638 int p3= src[x+1*stride];
2639 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2640
2641 if (d<-2*strength) d1= 0;
2642 else if(d<- strength) d1=-2*strength - d;
2643 else if(d< strength) d1= d;
2644 else if(d< 2*strength) d1= 2*strength - d;
2645 else d1= 0;
115329f1 2646
332f9ac4
MN
2647 p1 += d1;
2648 p2 -= d1;
2649 if(p1&256) p1= ~(p1>>31);
2650 if(p2&256) p2= ~(p2>>31);
115329f1 2651
332f9ac4
MN
2652 src[x-1*stride] = p1;
2653 src[x+0*stride] = p2;
2654
c26abfa5 2655 ad1= FFABS(d1)>>1;
115329f1 2656
332f9ac4 2657 d2= clip((p0-p3)/4, -ad1, ad1);
115329f1 2658
332f9ac4
MN
2659 src[x-2*stride] = p0 - d2;
2660 src[x+ stride] = p3 + d2;
2661 }
2662}
2663
2664static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2665 int y;
2666 const int strength= ff_h263_loop_filter_strength[qscale];
115329f1 2667
332f9ac4
MN
2668 for(y=0; y<8; y++){
2669 int d1, d2, ad1;
2670 int p0= src[y*stride-2];
2671 int p1= src[y*stride-1];
2672 int p2= src[y*stride+0];
2673 int p3= src[y*stride+1];
2674 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2675
2676 if (d<-2*strength) d1= 0;
2677 else if(d<- strength) d1=-2*strength - d;
2678 else if(d< strength) d1= d;
2679 else if(d< 2*strength) d1= 2*strength - d;
2680 else d1= 0;
115329f1 2681
332f9ac4
MN
2682 p1 += d1;
2683 p2 -= d1;
2684 if(p1&256) p1= ~(p1>>31);
2685 if(p2&256) p2= ~(p2>>31);
115329f1 2686
332f9ac4
MN
2687 src[y*stride-1] = p1;
2688 src[y*stride+0] = p2;
2689
c26abfa5 2690 ad1= FFABS(d1)>>1;
115329f1 2691
332f9ac4 2692 d2= clip((p0-p3)/4, -ad1, ad1);
115329f1 2693
332f9ac4
MN
2694 src[y*stride-2] = p0 - d2;
2695 src[y*stride+1] = p3 + d2;
2696 }
2697}
1457ab52 2698
fdbbf2e0
MN
/**
 * H.261 loop filter on one 8x8 block, in place.
 *
 * A separable (1,2,1) kernel is applied vertically, then horizontally.
 * Pixels on the block border are not filtered in the direction that would
 * need samples from outside the block: the first/last row skips the vertical
 * pass and the first/last column skips the horizontal pass.
 *
 * @param src    top-left pixel of the block
 * @param stride distance between two rows of src
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int acc[8][8];
    int row, col;

    /* vertical pass: top and bottom rows are only scaled to the same gain (4) */
    for(col=0; col<8; col++){
        acc[0][col]= 4*src[col];
        acc[7][col]= 4*src[col + 7*stride];
    }
    for(row=1; row<7; row++){
        const uint8_t *line= src + row*stride;
        for(col=0; col<8; col++)
            acc[row][col]= line[col - stride] + 2*line[col] + line[col + stride];
    }

    /* horizontal pass: outer columns only undo the vertical gain */
    for(row=0; row<8; row++){
        uint8_t *line= src + row*stride;
        line[0]= (acc[row][0] + 2)>>2;
        line[7]= (acc[row][7] + 2)>>2;
        for(col=1; col<7; col++)
            line[col]= (acc[row][col-1] + 2*acc[row][col] + acc[row][col+1] + 8)>>4;
    }
}
2725
/**
 * H.264 luma deblocking of one 16-sample edge (non-intra strength).
 *
 * The edge is handled as 4 segments of 4 positions; tc0[seg] is the clipping
 * bound for a segment, and a negative value skips that segment.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step between samples across the edge
 * @param ystride step between positions along the edge
 * @param alpha   threshold for |p0-q0|
 * @param beta    threshold for |p1-p0|, |q1-q0|, |p2-p0| and |q2-q0|
 * @param tc0     4 per-segment clipping bounds
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg, n;

    for( seg = 0; seg < 4; seg++, pix += 4*ystride ) {
        const int tc_base = tc0[seg];

        if( tc_base < 0 )
            continue;

        for( n = 0; n < 4; n++ ) {
            uint8_t * const px = pix + n*ystride;
            const int p0 = px[-1*xstride];
            const int p1 = px[-2*xstride];
            const int p2 = px[-3*xstride];
            const int q0 = px[0];
            const int q1 = px[1*xstride];
            const int q2 = px[2*xstride];
            int tc, i_delta;

            if( FFABS( p0 - q0 ) >= alpha ||
                FFABS( p1 - p0 ) >= beta  ||
                FFABS( q1 - q0 ) >= beta )
                continue;

            /* each side that also gets p1/q1 corrected widens the p0/q0 bound by one */
            tc = tc_base;
            if( FFABS( p2 - p0 ) < beta ) {
                px[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc_base, tc_base );
                tc++;
            }
            if( FFABS( q2 - q0 ) < beta ) {
                px[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc_base, tc_base );
                tc++;
            }

            i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
            px[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
            px[0]        = clip_uint8( q0 - i_delta );    /* q0' */
        }
    }
}
/** Luma filter with samples across the edge 'stride' apart and positions 1 apart. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/** Luma filter with samples across the edge 1 apart and positions 'stride' apart. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2774
/**
 * H.264 chroma deblocking of one 8-sample edge (non-intra strength).
 *
 * The edge is handled as 4 segments of 2 positions; tc0[seg] is the clipping
 * bound for a segment, and a value <= 0 skips that segment.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step between samples across the edge
 * @param ystride step between positions along the edge
 * @param alpha   threshold for |p0-q0|
 * @param beta    threshold for |p1-p0| and |q1-q0|
 * @param tc0     4 per-segment clipping bounds
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg, n;

    for( seg = 0; seg < 4; seg++, pix += 2*ystride ) {
        const int tc = tc0[seg];

        if( tc <= 0 )
            continue;

        for( n = 0; n < 2; n++ ) {
            uint8_t * const px = pix + n*ystride;
            const int p0 = px[-1*xstride];
            const int p1 = px[-2*xstride];
            const int q0 = px[0];
            const int q1 = px[1*xstride];
            int delta;

            if( FFABS( p0 - q0 ) >= alpha ||
                FFABS( p1 - p0 ) >= beta  ||
                FFABS( q1 - q0 ) >= beta )
                continue;

            delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

            px[-xstride] = clip_uint8( p0 + delta );    /* p0' */
            px[0]        = clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/** Chroma filter with samples across the edge 'stride' apart and positions 1 apart. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/** Chroma filter with samples across the edge 1 apart and positions 'stride' apart. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2811
5cf08f23
LM
/**
 * H.264 chroma deblocking of one 8-sample edge, intra variant: no clipping
 * table; p0 and q0 are replaced by fixed-weight averages when the alpha/beta
 * tests pass.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step between samples across the edge
 * @param ystride step between positions along the edge
 * @param alpha   threshold for |p0-q0|
 * @param beta    threshold for |p1-p0| and |q1-q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int n;

    for( n = 0; n < 8; n++, pix += ystride ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) >= alpha ||
            FFABS( p1 - p0 ) >= beta  ||
            FFABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/** Intra chroma filter with samples across the edge 'stride' apart and positions 1 apart. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/** Intra chroma filter with samples across the edge 1 apart and positions 'stride' apart. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
2839
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 16*h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2867
/**
 * Sum of absolute differences between a 16-wide block and the avg2() of each
 * pix2 pixel with its right neighbour. Reads pix2[0..16] on every row.
 *
 * @param v         unused
 * @param pix1      block compared against
 * @param pix2      block that is interpolated
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2895
/**
 * Sum of absolute differences between a 16-wide block and the avg2() of each
 * pix2 pixel with the pixel one row below it. Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block compared against
 * @param pix2      block that is interpolated
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2925
/**
 * Sum of absolute differences between a 16-wide block and the avg4() of each
 * 2x2 neighbourhood of pix2. Reads pix2[0..16] on h+1 rows.
 *
 * @param v         unused
 * @param pix1      block compared against
 * @param pix2      block that is interpolated
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2955
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 * @return the accumulated |pix1 - pix2| over 8*h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2975
/**
 * Sum of absolute differences between an 8-wide block and the avg2() of each
 * pix2 pixel with its right neighbour. Reads pix2[0..8] on every row.
 *
 * @param v         unused
 * @param pix1      block compared against
 * @param pix2      block that is interpolated
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2995
/**
 * Sum of absolute differences between an 8-wide block and the avg2() of each
 * pix2 pixel with the pixel one row below it. Reads h+1 rows of pix2.
 *
 * @param v         unused
 * @param pix1      block compared against
 * @param pix2      block that is interpolated
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3017
/**
 * Sum of absolute differences between an 8-wide block and the avg4() of each
 * 2x2 neighbourhood of pix2. Reads pix2[0..8] on h+1 rows.
 *
 * @param v         unused
 * @param pix1      block compared against
 * @param pix2      block that is interpolated
 * @param line_size distance between rows, shared by both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for(row=0; row<h; row++) {
        for(col=0; col<8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col+1], below[col], below[col+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3039
bf4e3bd2
MR
3040static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3041 MpegEncContext *c = v;
e6a2ac34
MN
3042 int score1=0;
3043 int score2=0;
3044 int x,y;
d4c5d2ad 3045
e6a2ac34
MN
3046 for(y=0; y<h; y++){
3047 for(x=0; x<16; x++){
3048 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3049 }
3050 if(y+1<h){
3051 for(x=0; x<15; x++){
c26abfa5 3052 score2+= FFABS( s1[x ] - s1[x +stride]
e6a2ac34 3053 - s1[x+1] + s1[x+1+stride])
c26abfa5 3054 -FFABS( s2[x ] - s2[x +stride]
e6a2ac34
MN
3055 - s2[x+1] + s2[x+1+stride]);
3056 }
3057 }
3058 s1+= stride;
3059 s2+= stride;
3060 }
d4c5d2ad 3061
c26abfa5
DB
3062 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3063 else return score1 + FFABS(score2)*8;
e6a2ac34
MN
3064}
3065
bf4e3bd2
MR
3066static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3067 MpegEncContext *c = v;
e6a2ac34
MN
3068 int score1=0;
3069 int score2=0;
3070 int x,y;
115329f1 3071
e6a2ac34
MN
3072 for(y=0; y<h; y++){
3073 for(x=0; x<8; x++){
3074 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3075 }
3076 if(y+1<h){
3077 for(x=0; x<7; x++){
c26abfa5 3078 score2+= FFABS( s1[x ] - s1[x +stride]
e6a2ac34 3079 - s1[x+1] + s1[x+1+stride])
c26abfa5 3080 -FFABS( s2[x ] - s2[x +stride]
e6a2ac34
MN
3081 - s2[x+1] + s2[x+1+stride]);
3082 }
3083 }
3084 s1+= stride;
3085 s2+= stride;
3086 }
115329f1 3087
c26abfa5
DB
3088 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3089 else return score1 + FFABS(score2)*8;
e6a2ac34
MN
3090}
3091
364a1797
MN
3092static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3093 int i;
3094 unsigned int sum=0;
3095
3096 for(i=0; i<8*8; i++){
3097 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3098 int w= weight[i];
3099 b>>= RECON_SHIFT;
3100 assert(-512<b && b<512);
3101
3102 sum += (w*b)*(w*b)>>4;
3103 }
3104 return sum>>2;
3105}
3106
3107static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3108 int i;
3109
3110 for(i=0; i<8*8; i++){
3111 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
115329f1 3112 }
364a1797
MN
3113}
3114
a9badb51
MN
3115/**
3116 * permutes an 8x8 block.
2a5700de 3117 * @param block the block which will be permuted according to the given permutation vector
a9badb51
MN
3118 * @param permutation the permutation vector
3119 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
115329f1 3120 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2a5700de 3121 * (inverse) permutated to scantable order!
a9badb51 3122 */
0c1a9eda 3123void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
d962f6fd 3124{
7801d21d 3125 int i;
477ab036 3126 DCTELEM temp[64];
115329f1 3127
7801d21d 3128 if(last<=0) return;
9a7b310d 3129 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
d962f6fd 3130
7801d21d
MN
3131 for(i=0; i<=last; i++){
3132 const int j= scantable[i];
3133 temp[j]= block[j];
3134 block[j]=0;
3135 }
115329f1 3136
7801d21d
MN
3137 for(i=0; i<=last; i++){
3138 const int j= scantable[i];
3139 const int perm_j= permutation[j];
3140 block[perm_j]= temp[j];
3141 }
d962f6fd 3142}
e0eac44e 3143
622348f9
MN
/* Comparison function that ignores all of its arguments and always returns 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3147
3148void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3149 int i;
115329f1 3150
622348f9 3151 memset(cmp, 0, sizeof(void*)*5);
115329f1 3152
622348f9
MN
3153 for(i=0; i<5; i++){
3154 switch(type&0xFF){
3155 case FF_CMP_SAD:
3156 cmp[i]= c->sad[i];
3157 break;
3158 case FF_CMP_SATD:
3159 cmp[i]= c->hadamard8_diff[i];
3160 break;
3161 case FF_CMP_SSE:
3162 cmp[i]= c->sse[i];
3163 break;
3164 case FF_CMP_DCT:
3165 cmp[i]= c->dct_sad[i];
3166 break;
27c61ac5
MN
3167 case FF_CMP_DCT264:
3168 cmp[i]= c->dct264_sad[i];
3169 break;
0fd6aea1
MN
3170 case FF_CMP_DCTMAX:
3171 cmp[i]= c->dct_max[i];
3172 break;
622348f9
MN
3173 case FF_CMP_PSNR:
3174 cmp[i]= c->quant_psnr[i];
3175 break;
3176 case FF_CMP_BIT:
3177 cmp[i]= c->bit[i];
3178 break;
3179 case FF_CMP_RD:
3180 cmp[i]= c->rd[i];
3181 break;
3182 case FF_CMP_VSAD:
3183 cmp[i]= c->vsad[i];
3184 break;
3185 case FF_CMP_VSSE:
3186 cmp[i]= c->vsse[i];
3187 break;
3188 case FF_CMP_ZERO:
3189 cmp[i]= zero_cmp;
3190 break;
e6a2ac34
MN
3191 case FF_CMP_NSSE:
3192 cmp[i]= c->nsse[i];
3193 break;
3a6fc8fa 3194#ifdef CONFIG_SNOW_ENCODER
26efc54e
MN
3195 case FF_CMP_W53:
3196 cmp[i]= c->w53[i];
3197 break;
3198 case FF_CMP_W97:
3199 cmp[i]= c->w97[i];
3200 break;
3a6fc8fa 3201#endif
622348f9
MN
3202 default:
3203 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3204 }
3205 }
3206}
3207
2a5700de
MN
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 *
 * Zeroes 6 consecutive blocks of 64 DCTELEM coefficients starting at blocks.
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
3215
11f18faf
MN
/**
 * Adds src to dst byte by byte, in place: dst[i] += src[i] for 0 <= i < w.
 * Sums wrap around modulo 256. Nothing is done when w <= 0.
 *
 * @param dst accumulator, w bytes
 * @param src addend, w bytes
 * @param w   number of bytes
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    for(; w > 0; w--){
        *dst += *src;
        dst++;
        src++;
    }
}
3231
/**
 * Byte-wise difference: dst[i] = src1[i] - src2[i] for 0 <= i < w.
 * Differences wrap around modulo 256. Nothing is done when w <= 0.
 *
 * @param dst  output, w bytes
 * @param src1 minuend, w bytes
 * @param src2 subtrahend, w bytes
 * @param w    number of bytes
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    for(; w > 0; w--){
        *dst = *src1 - *src2;
        dst++;
        src1++;
        src2++;
    }
}
3247
84705403
MN
/**
 * Median-prediction residual of one line.
 *
 * For each position the predictor is mid_pred() of the left pixel, the pixel
 * from src1 at the same position, and (left + src1[i] - left_top) & 0xFF;
 * dst[i] receives src2[i] minus that predictor.
 *
 * @param dst      residual output, w bytes
 * @param src1     line supplying the second predictor input (and the next left_top)
 * @param src2     line being predicted (supplies the next left)
 * @param w        number of pixels
 * @param left     in: left neighbour of the first pixel; out: last pixel of src2
 * @param left_top in: top-left neighbour of the first pixel; out: last pixel of src1
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left     = *left;
    uint8_t cur_left_top = *left_top;
    int n;

    for(n=0; n<w; n++){
        const uint8_t top = src1[n];
        const int pred= mid_pred(cur_left, top, (cur_left + top - cur_left_top)&0xFF);

        cur_left_top= top;
        cur_left    = src2[n];
        dst[n]= cur_left - pred;
    }

    *left    = cur_left;
    *left_top= cur_left_top;
}
3265
1457ab52
MN
/*
 * Butterfly helpers for the Hadamard transforms below.
 *
 * BUTTERFLY2(o1,o2,i1,i2): o1 = i1+i2, o2 = i1-i2. Expands to two separate
 *     statements with no enclosing braces, so it must not be used as the
 *     unbraced body of an if/for.
 * BUTTERFLY1(x,y): in-place version, x becomes x+y and y becomes x-y,
 *     using block-local temporaries a and b.
 * BUTTERFLYA(x,y): |x+y| + |x-y|, i.e. a final butterfly stage whose two
 *     outputs are immediately summed as absolute values.
 */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
1457ab52 3280
/**
 * Sum of absolute values of the 8x8 butterfly (Hadamard-style) transform of
 * the pixel difference src - dst.
 *
 * @param s      unused here
 * @param dst    first block
 * @param src    second block
 * @param stride distance between rows, shared by both blocks
 * @param h      must be 8 (asserted)
 * @return the accumulated absolute transform coefficients
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* row pass: three butterfly stages (distance 1, 2, 4) on each row of differences */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* column pass: two in-place stages, the third is folded into the absolute sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    /* disabled debug aid: would print each new maximum of sum */
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
3332
/**
 * Sum of absolute 8x8 Hadamard transform coefficients of a single block,
 * with the |temp[0] + temp[32]| term (marked "-mean" in the original)
 * subtracted at the end.
 *
 * The pixels of src go through three butterfly stages along each row, then
 * three along each column; the last column stage is folded into the
 * absolute-value accumulation via BUTTERFLYA.
 *
 * @param s      unused
 * @param src    first pixel of the block
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      block height; must be 8
 * @return the accumulated absolute coefficients minus the mean term
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int n;

    assert(h==8);

    /* horizontal pass: three butterfly stages per row */
    for(n=0; n<8; n++){
        int * const row           = temp + 8*n;
        const uint8_t * const pix = src + stride*n;

        BUTTERFLY2(row[0], row[1], pix[0], pix[1]);
        BUTTERFLY2(row[2], row[3], pix[2], pix[3]);
        BUTTERFLY2(row[4], row[5], pix[4], pix[5]);
        BUTTERFLY2(row[6], row[7], pix[6], pix[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass: two in-place stages, third stage merged into the sum */
    for(n=0; n<8; n++){
        int * const col = temp + n;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    /* drop the first column's sum term ("-mean") */
    sum -= FFABS(temp[8*0] + temp[8*4]);

    return sum;
}
3380
bb198e19 3381static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 3382 MpegEncContext * const s= (MpegEncContext *)c;
68b51e58 3383 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
76fbb024 3384 DCTELEM * const temp= (DCTELEM*)aligned_temp;
1457ab52 3385 int sum=0, i;
115329f1 3386
bb198e19 3387 assert(h==8);
1457ab52
MN
3388
3389 s->dsp.diff_pixels(temp, src1, src2, stride);
b0368839 3390 s->dsp.fdct(temp);
1457ab52
MN
3391
3392 for(i=0; i<64; i++)
c26abfa5 3393 sum+= FFABS(temp[i]);
115329f1 3394
1457ab52
MN
3395 return sum;
3396}
3397
27c61ac5
MN
#ifdef CONFIG_GPL
/**
 * One 8-point integer transform pass built from adds, subtracts and shifts.
 * The user must define SRC(x) to read input element x (0..7) and DST(x,v)
 * to consume output element x with value v before expanding this macro.
 * All eight inputs are read before any DST() is issued.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute coefficients after applying DCT8_1D to the rows and then
 * to the columns of the 8x8 block produced by the context's diff_pixels.
 *
 * The row pass stores its results back into the block; the column pass
 * does not store anything and only accumulates |value| into the result.
 *
 * @param c      MpegEncContext, passed as void *
 * @param src1   first block
 * @param src2   second block
 * @param stride row stride handed through to diff_pixels
 * @param h      unused
 * @return sum of absolute transformed values
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* DCTELEM so the element type matches what diff_pixels writes */
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    /* dct[0] decays to a pointer to the first element; passing the 2D array
       itself would hand over a pointer-to-row of incompatible type */
    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3450
0fd6aea1
MN
3451static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3452 MpegEncContext * const s= (MpegEncContext *)c;
68b51e58 3453 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
0fd6aea1
MN
3454 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3455 int sum=0, i;
115329f1 3456
0fd6aea1
MN
3457 assert(h==8);
3458
3459 s->dsp.diff_pixels(temp, src1, src2, stride);
3460 s->dsp.fdct(temp);
3461
3462 for(i=0; i<64; i++)
c26abfa5 3463 sum= FFMAX(sum, FFABS(temp[i]));
115329f1 3464
0fd6aea1
MN
3465 return sum;
3466}
3467
bb198e19 3468static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1457ab52 3469 MpegEncContext * const s= (MpegEncContext *)c;
68b51e58 3470 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
76fbb024
MN
3471 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3472 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
1457ab52
MN
3473 int sum=0, i;
3474
bb198e19 3475 assert(h==8);
1457ab52 3476 s->mb_intra=0;
115329f1 3477
1457ab52 3478 s->dsp.diff_pixels(temp, src1, src2, stride);
115329f1 3479
1457ab52 3480 memcpy(bak, temp, 64*sizeof(DCTELEM));
115329f1 3481
67725183 3482 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
d50635cd 3483 s->dct_unquantize_inter(s, temp, 0, s->qscale);
115329f1
DB
3484 simple_idct(temp); //FIXME
3485
1457ab52
MN
3486 for(i=0; i<64; i++)
3487 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
115329f1 3488
1457ab52
MN
3489 return sum;
3490}
3491
bb198e19 3492static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 3493 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 3494 const uint8_t *scantable= s->intra_scantable.permutated;
68b51e58
SH
3495 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3496 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
76fbb024
MN
3497 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3498 uint8_t * const bak= (uint8_t*)aligned_bak;
3a87ac94
MN
3499 int i, last, run, bits, level, distoration, start_i;
3500 const int esc_length= s->ac_esc_length;
3501 uint8_t * length;
3502 uint8_t * last_length;
115329f1 3503
bb198e19
MN
3504 assert(h==8);
3505
67725183
MN
3506 for(i=0; i<8; i++){
3507 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3508 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3509 }
3a87ac94 3510
67725183
MN
3511 s->dsp.diff_pixels(temp, src1, src2, stride);
3512
3513 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3514
3515 bits=0;
115329f1 3516
3a87ac94 3517 if (s->mb_intra) {
115329f1 3518 start_i = 1;
3a87ac94
MN
3519 length = s->intra_ac_vlc_length;
3520 last_length= s->intra_ac_vlc_last_length;
67725183 3521 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3a87ac94
MN
3522 } else {
3523 start_i = 0;
3524 length = s->inter_ac_vlc_length;
3525 last_length= s->inter_ac_vlc_last_length;
3526 }
115329f1 3527
67725183 3528 if(last>=start_i){
3a87ac94
MN
3529 run=0;
3530 for(i=start_i; i<last; i++){
3531 int j= scantable[i];
3532 level= temp[j];
115329f1 3533
3a87ac94
MN
3534 if(level){
3535 level+=64;
3536 if((level&(~127)) == 0){
3537 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3538 }else
3539 bits+= esc_length;
3540 run=0;
3541 }else
3542 run++;
3543 }
3544 i= scantable[last];
115329f1 3545
3a87ac94 3546 level= temp[i] + 64;
1d0eab1d
MN
3547
3548 assert(level - 64);
115329f1 3549
3a87ac94
MN
3550 if((level&(~127)) == 0){
3551 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3552 }else
3553 bits+= esc_length;
115329f1 3554
67725183
MN
3555 }
3556
3557 if(last>=0){
d50635cd
MN
3558 if(s->mb_intra)
3559 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3560 else
3561 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3a87ac94 3562 }
115329f1 3563
b0368839 3564 s->dsp.idct_add(bak, stride, temp);
115329f1 3565
bb198e19 3566 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3a87ac94 3567
67725183 3568 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3a87ac94
MN
3569}
3570
bb198e19 3571static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3a87ac94 3572 MpegEncContext * const s= (MpegEncContext *)c;
0c1a9eda 3573 const uint8_t *scantable= s->intra_scantable.permutated;
68b51e58 3574