wmv2: move IDCT to its own DSP context.
[libav.git] / libavcodec / dsputil.h
CommitLineData
ff4ec49e
FB
1/*
2 * DSP utils
406792e7 3 * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
8f2ab833 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
ff4ec49e 5 *
2912e87a 6 * This file is part of Libav.
b78e7197 7 *
2912e87a 8 * Libav is free software; you can redistribute it and/or
ff4ec49e
FB
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
b78e7197 11 * version 2.1 of the License, or (at your option) any later version.
ff4ec49e 12 *
2912e87a 13 * Libav is distributed in the hope that it will be useful,
ff4ec49e
FB
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
2912e87a 19 * License along with Libav; if not, write to the Free Software
5509bffa 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
ff4ec49e 21 */
24641185
MN
22
23/**
ba87f080 24 * @file
983e3246 25 * DSP utils.
5755c27f
MN
26 * Note: many functions in here may use MMX, which trashes the FPU state; it is
27 * absolutely necessary to call emms_c() between DSP code and float/double code.
24641185
MN
28 */
29
98790382
SS
30#ifndef AVCODEC_DSPUTIL_H
31#define AVCODEC_DSPUTIL_H
de6d9b64 32
6a5d31ac 33#include "libavutil/intreadwrite.h"
43f1708f 34#include "avcodec.h"
de6d9b64 35
24641185 36
44eb4951 37//#define DEBUG
de6d9b64
FB
38/* dct code */
39typedef short DCTELEM;
40
3e2efacd
MS
41void ff_fdct_ifast (DCTELEM *data);
42void ff_fdct_ifast248 (DCTELEM *data);
0a72533e
MR
43void ff_jpeg_fdct_islow_8(DCTELEM *data);
44void ff_jpeg_fdct_islow_10(DCTELEM *data);
45void ff_fdct248_islow_8(DCTELEM *data);
46void ff_fdct248_islow_10(DCTELEM *data);
de6d9b64 47
c8e1b2fb 48void ff_j_rev_dct (DCTELEM *data);
de6d9b64 49
3f09f52a 50void ff_fdct_mmx(DCTELEM *block);
d8eda370 51void ff_fdct_mmxext(DCTELEM *block);
8fd19ab2 52void ff_fdct_sse2(DCTELEM *block);
de6d9b64 53
19a0729b
OA
/**
 * Declare the prototypes of the ff_h264_..._<depth>_c IDCT and DC-dequant
 * functions for one bit depth. The macro argument is pasted into every
 * function name; it is expanded below for depths 8, 9 and 10.
 */
#define H264_IDCT(depth) \
void ff_h264_idct8_add_ ## depth ## _c(uint8_t *dst, DCTELEM *block, int stride);\
void ff_h264_idct_add_ ## depth ## _c(uint8_t *dst, DCTELEM *block, int stride);\
void ff_h264_idct8_dc_add_ ## depth ## _c(uint8_t *dst, DCTELEM *block, int stride);\
void ff_h264_idct_dc_add_ ## depth ## _c(uint8_t *dst, DCTELEM *block, int stride);\
void ff_h264_idct_add16_ ## depth ## _c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\
void ff_h264_idct_add16intra_ ## depth ## _c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\
void ff_h264_idct8_add4_ ## depth ## _c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\
void ff_h264_idct_add8_422_ ## depth ## _c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\
void ff_h264_idct_add8_ ## depth ## _c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);\
void ff_h264_luma_dc_dequant_idct_ ## depth ## _c(DCTELEM *output, DCTELEM *input, int qmul);\
void ff_h264_chroma422_dc_dequant_idct_ ## depth ## _c(DCTELEM *block, int qmul);\
void ff_h264_chroma_dc_dequant_idct_ ## depth ## _c(DCTELEM *block, int qmul);

H264_IDCT( 8)
H264_IDCT( 9)
H264_IDCT(10)
71
290fabc6 72void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp);
19fb234e
JGG
73void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
74
e0eac44e 75/* encoding scans */
0c1a9eda
ZK
76extern const uint8_t ff_alternate_horizontal_scan[64];
77extern const uint8_t ff_alternate_vertical_scan[64];
78extern const uint8_t ff_zigzag_direct[64];
10acc479 79extern const uint8_t ff_zigzag248_direct[64];
5a240838 80
de6d9b64 81/* pixel operations */
f2e92ef2 82#define MAX_NEG_CROP 1024
de6d9b64
FB
83
84/* temporary */
1d503957 85extern uint32_t ff_squareTbl[512];
55fde95e 86extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP];
de6d9b64 87
19a0729b
OA
/**
 * Declare the put/avg 8x8 and 16x16 pixel functions for one bit depth.
 * The macro argument is pasted into each function name; it is expanded
 * below for depths 8, 9 and 10.
 */
#define PUTAVG_PIXELS(depth)\
void ff_put_pixels8x8_ ## depth ## _c(uint8_t *dst, uint8_t *src, int stride);\
void ff_avg_pixels8x8_ ## depth ## _c(uint8_t *dst, uint8_t *src, int stride);\
void ff_put_pixels16x16_ ## depth ## _c(uint8_t *dst, uint8_t *src, int stride);\
void ff_avg_pixels16x16_ ## depth ## _c(uint8_t *dst, uint8_t *src, int stride);

PUTAVG_PIXELS( 8)
PUTAVG_PIXELS( 9)
PUTAVG_PIXELS(10)

/* The names without a depth suffix are aliases for the 8-bit variants. */
#define ff_put_pixels8x8_c ff_put_pixels8x8_8_c
#define ff_avg_pixels8x8_c ff_avg_pixels8x8_8_c
#define ff_put_pixels16x16_c ff_put_pixels16x16_8_c
#define ff_avg_pixels16x16_c ff_avg_pixels16x16_8_c
3d1b1caa 102
d241f51e
KS
103/* RV40 functions */
104void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride);
105void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride);
106void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride);
107void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride);
108
54009d42 109/* 1/2^n downscaling functions from imgconvert.c */
54009d42
MN
110void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
111void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
112void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
703c8195
LM
113
114void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
115 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
54009d42 116
b7c27ee6 117/* minimum alignment rules ;)
7ce68923
DB
118If you notice errors in the align stuff, need more alignment for some ASM code
119for some CPU or need to use a function with less aligned data then send a mail
21de9204 120to the libav-devel mailing list, ...
7ce68923
DB
121
122!warning These alignments might not match reality, (missing attribute((align))
123stuff somewhere possible).
2cab6401 124I (Michael) did not check them, these are just the alignments which I think
7ce68923 125could be reached easily ...
de6d9b64 126
b7c27ee6
MN
127!future video codecs might need functions with less strict alignment
128*/
129
eb4b3dd3 130/*
0c1a9eda
ZK
131void get_pixels_c(DCTELEM *block, const uint8_t *pixels, int line_size);
132void diff_pixels_c(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride);
133void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
134void add_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
649c00c9 135void clear_blocks_c(DCTELEM *blocks);
eb4b3dd3 136*/
de6d9b64
FB
137
138/* add and put pixel (decoding) */
b7c27ee6 139// blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16
be73d76b 140//h for op_pixels_func is limited to {width/2, width} but never larger than 16 and never smaller than 4
0c1a9eda 141typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h);
669ac79c 142typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h);
0c1a9eda 143typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
0da71265 144typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
b3184779 145
342c7dfd
KS
146typedef void (*op_fill_func)(uint8_t *block/*align width (8 or 16)*/, uint8_t value, int line_size, int h);
147
/**
 * Declare the ff_put_, ff_put_no_rnd_ and ff_avg_ prototypes for one
 * "old" qpel function name. All three share the qpel_mc_func parameter list
 * (dst, src, stride).
 */
#define DEF_OLD_QPEL(name)\
void ff_put_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
void ff_avg_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);

/* 16-wide variants */
DEF_OLD_QPEL(qpel16_mc11_old_c)
DEF_OLD_QPEL(qpel16_mc31_old_c)
DEF_OLD_QPEL(qpel16_mc12_old_c)
DEF_OLD_QPEL(qpel16_mc32_old_c)
DEF_OLD_QPEL(qpel16_mc13_old_c)
DEF_OLD_QPEL(qpel16_mc33_old_c)
/* 8-wide variants */
DEF_OLD_QPEL(qpel8_mc11_old_c)
DEF_OLD_QPEL(qpel8_mc31_old_c)
DEF_OLD_QPEL(qpel8_mc12_old_c)
DEF_OLD_QPEL(qpel8_mc32_old_c)
DEF_OLD_QPEL(qpel8_mc13_old_c)
DEF_OLD_QPEL(qpel8_mc33_old_c)
b3184779
MN
165
/**
 * Define a static function a() with the op_pixels_func parameter list that
 * calls b() twice with the same line_size and h: once on (block, pixels) and
 * once with both pointers advanced by n bytes.
 *
 * @param a name of the function to define
 * @param b function called for each of the two parts
 * @param n byte offset of the second call relative to the first
 */
#define CALL_2X_PIXELS(a, b, n)\
static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    b(block , pixels , line_size, h);\
    b(block+n, pixels+n, line_size, h);\
}
44eb4951 171
de6d9b64 172/* motion estimation */
be73d76b 173// h is limited to {width/2, width, 2*width} but never larger than 16 and never smaller than 2
755bfeab 174// although currently h<4 is not used as functions with width <8 are neither used nor implemented
bb198e19 175typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/;
1457ab52 176
/**
 * Scantable.
 * A pointer to a scan order plus two 64-entry byte tables.
 * NOTE(review): presumably filled in by ff_init_scantable(), declared right
 * after this struct - confirm in dsputil.c.
 */
typedef struct ScanTable{
    const uint8_t *scantable;   /* pointer to the scan order; only the pointer is stored, not a copy */
    uint8_t permutated[64];     /* presumably the scan order with the IDCT permutation applied - TODO confirm */
    uint8_t raster_end[64];     /* NOTE(review): meaning is not visible in this header - confirm */
} ScanTable;
185
186void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable);
92fb52d9
RB
187void ff_init_scantable_permutation(uint8_t *idct_permutation,
188 int idct_permutation_type);
4c79b95c
AJ
189
/**
 * DSPContext.
 * Collection of function pointers (plus the IDCT permutation data) for the
 * DSP routines used by the codecs.
 * NOTE(review): presumably populated by ff_dsputil_init() and the
 * per-architecture ff_dsputil_init_*() functions declared later in this
 * header - confirm.
 */
typedef struct DSPContext {
    /**
     * Size of DCT coefficients.
     */
    int dct_bits;

    /* pixel ops : interface with DCT */
    void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size);
    void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride);
    void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
    void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
    void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
    void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size);
    void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size);
    int (*sum_abs_dctelem)(DCTELEM *block/*align 16*/);
    /**
     * translational global motion compensation.
     */
    void (*gmc1)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x16, int y16, int rounder);
    /**
     * global motion compensation.
     */
    void (*gmc )(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int ox, int oy,
                 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
    void (*clear_block)(DCTELEM *block/*align 16*/);
    void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
    int (*pix_sum)(uint8_t * pix, int line_size);
    int (*pix_norm1)(uint8_t * pix, int line_size);
// 16x16 8x8 4x4 2x2 16x8 8x4 4x2 8x16 4x8 2x4

    /* Comparison functions; every table below has 6 entries. */
    me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
    me_cmp_func sse[6];
    me_cmp_func hadamard8_diff[6];
    me_cmp_func dct_sad[6];
    me_cmp_func quant_psnr[6];
    me_cmp_func bit[6];
    me_cmp_func rd[6];
    me_cmp_func vsad[6];
    me_cmp_func vsse[6];
    me_cmp_func nsse[6];
    me_cmp_func dct_max[6];
    me_cmp_func dct264_sad[6];

    me_cmp_func me_pre_cmp[6];
    me_cmp_func me_cmp[6];
    me_cmp_func me_sub_cmp[6];
    me_cmp_func mb_cmp[6];
    me_cmp_func ildct_cmp[6]; //only width 16 used
    me_cmp_func frame_skip_cmp[6]; //only width 8 used

    int (*ssd_int8_vs_int16)(const int8_t *pix1, const int16_t *pix2,
                             int size);

    /**
     * Halfpel motion compensation with rounding (a+b+1)>>1.
     * This is an array[4][4] of motion compensation functions: the first
     * index selects the horizontal block size, the second one of the 4
     * halfpel positions<br>
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * NOTE(review): only the 16 and 8 wide entries are described here; first
     * indices 2 and 3 presumably select narrower blocks - confirm.
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    op_pixels_func put_pixels_tab[4][4];

    /**
     * Halfpel motion compensation with rounding (a+b+1)>>1.
     * This is an array[4][4] of motion compensation functions: the first
     * index selects the horizontal block size, the second one of the 4
     * halfpel positions<br>
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * NOTE(review): only the 16 and 8 wide entries are described here; first
     * indices 2 and 3 presumably select narrower blocks - confirm.
     * @param block destination into which the result is averaged (a+b+1)>>1
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    op_pixels_func avg_pixels_tab[4][4];

    /**
     * Halfpel motion compensation with no rounding (a+b)>>1.
     * This is declared as an array[4][4] of motion compensation functions;
     * the first index selects the horizontal block size, the second one of
     * the 4 halfpel positions<br>
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * NOTE(review): this comment used to say array[2][4]; whether first
     * indices 2 and 3 are populated is not visible here - confirm.
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    op_pixels_func put_no_rnd_pixels_tab[4][4];

    /**
     * Halfpel motion compensation with no rounding (a+b)>>1.
     * This is declared as an array[4][4] of motion compensation functions;
     * the first index selects the horizontal block size, the second one of
     * the 4 halfpel positions<br>
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * NOTE(review): this comment used to say array[2][4]; whether first
     * indices 2 and 3 are populated is not visible here - confirm.
     * @param block destination into which the result is averaged (a+b)>>1
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    op_pixels_func avg_no_rnd_pixels_tab[4][4];

    void (*put_no_rnd_pixels_l2)(uint8_t *block/*align 8*/, const uint8_t *a/*align 1*/, const uint8_t *b/*align 1*/, int line_size, int h);

    /**
     * Thirdpel motion compensation with rounding (a+b+1)>>1.
     * This is an array[11] of motion compensation functions for the 9 thirdpel
     * positions<br>
     * *pixels_tab[ xthirdpel + 4*ythirdpel ]
     * Assuming xthirdpel and ythirdpel each range over 0..2, the largest
     * index is 10 and indices 3 and 7 are never produced by this formula.
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param w width
     * @param h height
     */
    tpel_mc_func put_tpel_pixels_tab[11]; //FIXME individual func ptr per width?
    tpel_mc_func avg_tpel_pixels_tab[11]; //FIXME individual func ptr per width?

    qpel_mc_func put_qpel_pixels_tab[2][16];
    qpel_mc_func avg_qpel_pixels_tab[2][16];
    qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
    qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16];
    qpel_mc_func put_mspel_pixels_tab[8];

    /**
     * h264 Chroma MC
     */
    h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
    h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];

    qpel_mc_func put_h264_qpel_pixels_tab[4][16];
    qpel_mc_func avg_h264_qpel_pixels_tab[4][16];

    me_cmp_func pix_abs[2][4];

    /* huffyuv specific */
    void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w);
    void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w);
    /**
     * Subtract huffyuv's variant of median prediction.
     * Note: this might read from src1[-1], src2[-1].
     */
    void (*sub_hfyu_median_prediction)(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top);
    void (*add_hfyu_median_prediction)(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
    int (*add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src, int w, int left);
    void (*add_hfyu_left_prediction_bgr32)(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha);
    void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w);
    void (*bswap16_buf)(uint16_t *dst, const uint16_t *src, int len);

    void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
    void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);

    void (*h261_loop_filter)(uint8_t *src, int stride);

    /* assume len is a multiple of 16, and arrays are 32-byte aligned */
    void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
    /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
    void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len);
    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
    void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
    /**
     * Calculate the scalar product of two vectors of floats.
     * @param v1 first vector, 16-byte aligned
     * @param v2 second vector, 16-byte aligned
     * @param len length of vectors, multiple of 4
     */
    float (*scalarproduct_float)(const float *v1, const float *v2, int len);
    /**
     * Calculate the sum and difference of two vectors of floats.
     * @param v1 first input vector, sum output, 16-byte aligned
     * @param v2 second input vector, difference output, 16-byte aligned
     * @param len length of vectors, multiple of 4
     */
    void (*butterflies_float)(float *restrict v1, float *restrict v2, int len);

    /* (I)DCT */
    void (*fdct)(DCTELEM *block/* align 16*/);
    void (*fdct248)(DCTELEM *block/* align 16*/);

    /* IDCT really*/
    void (*idct)(DCTELEM *block/* align 16*/);

    /**
     * block -> idct -> clip to unsigned 8 bit -> dest.
     * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...)
     * @param line_size size in bytes of a horizontal line of dest
     */
    void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);

    /**
     * block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
     * @param line_size size in bytes of a horizontal line of dest
     */
    void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);

    /**
     * idct input permutation.
     * Several optimized IDCTs need a permutated input (relative to the normal
     * order of the reference IDCT).
     * This permutation must be performed before the idct_put/add. Note that
     * normally this can be merged with the zigzag/alternate scan.<br>
     * An example to avoid confusion:
     * - (->decode coeffs -> zigzag reorder -> dequant -> reference idct ->...)
     * - (x -> reference dct -> reference idct -> x)
     * - (x -> reference dct -> simple_mmx_perm = idct_permutation -> simple_idct_mmx -> x)
     * - (->decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant -> simple_idct_mmx ->...)
     */
    uint8_t idct_permutation[64];
    /* One of the FF_*_IDCT_PERM values defined right below. */
    int idct_permutation_type;
#define FF_NO_IDCT_PERM 1
#define FF_LIBMPEG2_IDCT_PERM 2
#define FF_SIMPLE_IDCT_PERM 3
#define FF_TRANSPOSE_IDCT_PERM 4
#define FF_PARTTRANS_IDCT_PERM 5
#define FF_SSE2_IDCT_PERM 6

    int (*try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale);
    void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale);
#define BASIS_SHIFT 16
#define RECON_SHIFT 6

    void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides);
#define EDGE_WIDTH 16
#define EDGE_TOP 1
#define EDGE_BOTTOM 2

    void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);

    /**
     * Calculate scalar product of two vectors.
     * @param len length of vectors, should be multiple of 16
     */
    int32_t (*scalarproduct_int16)(const int16_t *v1, const int16_t *v2/*align 16*/, int len);
    /* ape functions */
    /**
     * Calculate scalar product of v1 and v2,
     * and v1[i] += v3[i] * mul
     * @param len length of vectors, should be multiple of 16
     */
    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, const int16_t *v2, const int16_t *v3, int len, int mul);

    /**
     * Apply symmetric window in 16-bit fixed-point.
     * @param output destination array
     *               constraints: 16-byte aligned
     * @param input  source array
     *               constraints: 16-byte aligned
     * @param window window array
     *               constraints: 16-byte aligned, at least len/2 elements
     * @param len    full window length
     *               constraints: greater than zero; the required multiple is
     *               not documented here (it was written as "multiple of ?")
     *               - TODO confirm against the implementations
     */
    void (*apply_window_int16)(int16_t *output, const int16_t *input,
                               const int16_t *window, unsigned int len);

    /**
     * Clip each element in an array of int32_t to a given minimum and maximum value.
     * @param dst  destination array
     *             constraints: 16-byte aligned
     * @param src  source array
     *             constraints: 16-byte aligned
     * @param min  minimum value
     *             constraints: must be in the range [-(1 << 24), 1 << 24]
     * @param max  maximum value
     *             constraints: must be in the range [-(1 << 24), 1 << 24]
     * @param len  number of elements in the array
     *             constraints: multiple of 32 greater than zero
     */
    void (*vector_clip_int32)(int32_t *dst, const int32_t *src, int32_t min,
                              int32_t max, unsigned int len);

    op_fill_func fill_block_tab[2];
} DSPContext;
464
9cf0841e
MS
465void ff_dsputil_static_init(void);
466void ff_dsputil_init(DSPContext* p, AVCodecContext *avctx);
de6d9b64 467
6dc7d5da
MN
468int ff_check_alignment(void);
469
7801d21d 470/**
dafcbfe4
DB
471 * Return the scalar product of two vectors.
472 *
473 * @param v1 first input vector
474 * @param v2 second input vector
475 * @param len number of elements
476 *
477 * @return sum of elementwise products
478 */
479float ff_scalarproduct_float_c(const float *v1, const float *v2, int len);
480
481/**
7801d21d
MN
482 * Permute block according to permutation.
483 * @param last last non zero element in scantable order
484 */
0c1a9eda 485void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last);
e0eac44e 486
622348f9
MN
487void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type);
488
/* Replicate c into each of the four 8-bit lanes of a 32-bit word (c must fit in a lane). */
#define BYTE_VEC32(c) ((c) * 0x01010101UL)
/* Replicate c into each of the four 16-bit lanes of a 64-bit word (c must fit in a lane). */
#define BYTE_VEC64(c) ((c) * 0x0001000100010001UL)

/**
 * Average the four 8-bit lanes of a and b, rounding up:
 * every result lane is (x + y + 1) >> 1.
 */
static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
{
    /* per-lane (x ^ y) >> 1; the mask stops bits leaking into the lane below */
    const uint32_t half_diff = ((a ^ b) & ~BYTE_VEC32(0x01)) >> 1;

    return (a | b) - half_diff;
}

/**
 * Average the four 8-bit lanes of a and b, rounding down:
 * every result lane is (x + y) >> 1.
 */
static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
{
    const uint32_t half_diff = ((a ^ b) & ~BYTE_VEC32(0x01)) >> 1;

    return (a & b) + half_diff;
}

/**
 * Average the four 16-bit lanes of a and b, rounding up:
 * every result lane is (x + y + 1) >> 1.
 */
static inline uint64_t rnd_avg64(uint64_t a, uint64_t b)
{
    const uint64_t half_diff = ((a ^ b) & ~BYTE_VEC64(0x01)) >> 1;

    return (a | b) - half_diff;
}

/**
 * Average the four 16-bit lanes of a and b, rounding down:
 * every result lane is (x + y) >> 1.
 */
static inline uint64_t no_rnd_avg64(uint64_t a, uint64_t b)
{
    const uint64_t half_diff = ((a ^ b) & ~BYTE_VEC64(0x01)) >> 1;

    return (a & b) + half_diff;
}
511
26efc54e
MN
512static inline int get_penalty_factor(int lambda, int lambda2, int type){
513 switch(type&0xFF){
514 default:
515 case FF_CMP_SAD:
516 return lambda>>FF_LAMBDA_SHIFT;
517 case FF_CMP_DCT:
518 return (3*lambda)>>(FF_LAMBDA_SHIFT+1);
26efc54e 519 case FF_CMP_SATD:
27c61ac5 520 case FF_CMP_DCT264:
26efc54e
MN
521 return (2*lambda)>>FF_LAMBDA_SHIFT;
522 case FF_CMP_RD:
523 case FF_CMP_PSNR:
524 case FF_CMP_SSE:
525 case FF_CMP_NSSE:
526 return lambda2>>FF_LAMBDA_SHIFT;
527 case FF_CMP_BIT:
528 return 1;
529 }
530}
531
9cf0841e
MS
532void ff_dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
533void ff_dsputil_init_arm(DSPContext* c, AVCodecContext *avctx);
534void ff_dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx);
9cf0841e
MS
535void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx);
536void ff_dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
537void ff_dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx);
538void ff_dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
a1bee080 539
/* STRIDE_ALIGN is 16 when building for ARM with NEON, for PPC, or with MMX
 * available, and 8 in every other configuration. */
#if (ARCH_ARM && HAVE_NEON) || ARCH_PPC || HAVE_MMX
# define STRIDE_ALIGN 16
#else
# define STRIDE_ALIGN 8
#endif
545
0db2d942
LB
// Some broken preprocessors need a second expansion
// to be forced to tokenize __VA_ARGS__
#define E(x) x

/*
 * Helpers for declaring an aligned local array and a pointer v to it.
 *   a  alignment in bytes
 *   t  element type
 *   v  name of the pointer variable that gets declared
 *   s  first array subscript, e.g. [64]
 *   o  optional second subscript; empty for a one-dimensional array
 * The public macros append ",," to __VA_ARGS__ so that s and o always
 * exist, even when the caller passes only one subscript.
 */

/* Manual variant: reserve a byte buffer la_<v> that is a bytes larger than
 * the object and point v at the FFALIGN()ed address inside it.
 * NOTE(review): FFALIGN is defined elsewhere; presumably it rounds up to a
 * multiple of a - confirm. */
#define LOCAL_ALIGNED_A(a, t, v, s, o, ...) \
    uint8_t la_##v[sizeof(t s o) + (a)]; \
    t (*v) o = (void *)FFALIGN((uintptr_t)la_##v, a)

/* Direct variant: declare la_<v> through DECLARE_ALIGNED and let v refer to it. */
#define LOCAL_ALIGNED_D(a, t, v, s, o, ...) \
    DECLARE_ALIGNED(a, t, la_##v) s o; \
    t (*v) o = la_##v

#define LOCAL_ALIGNED(a, t, v, ...) E(LOCAL_ALIGNED_A(a, t, v, __VA_ARGS__,,))

/* Use the direct variant when HAVE_LOCAL_ALIGNED_8 is set, the manual one otherwise. */
#if HAVE_LOCAL_ALIGNED_8
# define LOCAL_ALIGNED_8(t, v, ...) E(LOCAL_ALIGNED_D(8, t, v, __VA_ARGS__,,))
#else
# define LOCAL_ALIGNED_8(t, v, ...) LOCAL_ALIGNED(8, t, v, __VA_ARGS__)
#endif

/* Same selection for 16-byte alignment, keyed on HAVE_LOCAL_ALIGNED_16. */
#if HAVE_LOCAL_ALIGNED_16
# define LOCAL_ALIGNED_16(t, v, ...) E(LOCAL_ALIGNED_D(16, t, v, __VA_ARGS__,,))
#else
# define LOCAL_ALIGNED_16(t, v, ...) LOCAL_ALIGNED(16, t, v, __VA_ARGS__)
#endif
571
/**
 * Define a static function name16() with the me_cmp_func parameter list that
 * builds a 16-wide score out of an 8-wide comparison function.
 *
 * name8 is always called with h = 8: first on the blocks at dst/src and at
 * dst+8/src+8, and, only when h == 16, again on the same two column offsets
 * 8 lines (8*stride bytes) further on. The function returns the sum of all
 * name8 results.
 *
 * @param name8  comparison function applied to each 8-wide block
 * @param name16 name of the wrapper function to define
 */
#define WRAPPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int score=0;\
    score +=name8(s, dst , src , stride, 8);\
    score +=name8(s, dst+8 , src+8 , stride, 8);\
    if(h==16){\
        dst += 8*stride;\
        src += 8*stride;\
        score +=name8(s, dst , src , stride, 8);\
        score +=name8(s, dst+8 , src+8 , stride, 8);\
    }\
    return score;\
}
585
49cef744 586
/**
 * Copy h rows from src to dst, one AV_COPY16U per row (a 16-bit copy, going
 * by the macro name). After each row dst advances by dstStride and src by
 * srcStride. Nothing is copied when h <= 0.
 */
static inline void copy_block2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    uint8_t       *out = dst;
    const uint8_t *in  = src;

    for (int rows_left = h; rows_left > 0; rows_left--) {
        AV_COPY16U(out, in);
        out += dstStride;
        in  += srcStride;
    }
}
597
/**
 * Copy h rows from src to dst, one AV_COPY32U per row (a 32-bit copy, going
 * by the macro name). After each row dst advances by dstStride and src by
 * srcStride. Nothing is copied when h <= 0.
 */
static inline void copy_block4(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    uint8_t       *out = dst;
    const uint8_t *in  = src;

    for (int rows_left = h; rows_left > 0; rows_left--) {
        AV_COPY32U(out, in);
        out += dstStride;
        in  += srcStride;
    }
}
608
/**
 * Copy h rows from src to dst, one AV_COPY64U per row (a 64-bit copy, going
 * by the macro name). After each row dst advances by dstStride and src by
 * srcStride. Nothing is copied when h <= 0.
 */
static inline void copy_block8(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    uint8_t       *out = dst;
    const uint8_t *in  = src;

    for (int rows_left = h; rows_left > 0; rows_left--) {
        AV_COPY64U(out, in);
        out += dstStride;
        in  += srcStride;
    }
}
619
/**
 * Copy h rows from src to dst. Each row is one AV_COPY64U (a 64-bit copy,
 * going by the macro name) followed by the single byte at index 8. After
 * each row dst advances by dstStride and src by srcStride. Nothing is
 * copied when h <= 0.
 */
static inline void copy_block9(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    uint8_t       *out = dst;
    const uint8_t *in  = src;

    for (int rows_left = h; rows_left > 0; rows_left--) {
        AV_COPY64U(out, in);
        out[8] = in[8];     /* the extra byte after the 64-bit copy */
        out += dstStride;
        in  += srcStride;
    }
}
631
/**
 * Copy h rows from src to dst, one AV_COPY128U per row (a 128-bit copy,
 * going by the macro name). After each row dst advances by dstStride and
 * src by srcStride. Nothing is copied when h <= 0.
 */
static inline void copy_block16(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    uint8_t       *out = dst;
    const uint8_t *in  = src;

    for (int rows_left = h; rows_left > 0; rows_left--) {
        AV_COPY128U(out, in);
        out += dstStride;
        in  += srcStride;
    }
}
642
/**
 * Copy h rows from src to dst. Each row is one AV_COPY128U (a 128-bit copy,
 * going by the macro name) followed by the single byte at index 16. After
 * each row dst advances by dstStride and src by srcStride. Nothing is
 * copied when h <= 0.
 */
static inline void copy_block17(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    uint8_t       *out = dst;
    const uint8_t *in  = src;

    for (int rows_left = h; rows_left > 0; rows_left--) {
        AV_COPY128U(out, in);
        out[16] = in[16];   /* the extra byte after the 128-bit copy */
        out += dstStride;
        in  += srcStride;
    }
}
654
98790382 655#endif /* AVCODEC_DSPUTIL_H */