MMX for H.264 deblocking filter
[libav.git] / libavcodec / dsputil.h
CommitLineData
ff4ec49e
FB
1/*
2 * DSP utils
3 * Copyright (c) 2000, 2001, 2002 Fabrice Bellard.
8f2ab833 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
ff4ec49e
FB
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
24641185
MN
20
21/**
22 * @file dsputil.h
983e3246 23 * DSP utils.
5755c27f
MN
24 * note, many functions in here may use MMX which trashes the FPU state, it is
25 * absolutely necessary to call emms_c() between dsp & float/double code
24641185
MN
26 */
27
de6d9b64
FB
28#ifndef DSPUTIL_H
29#define DSPUTIL_H
30
31#include "common.h"
43f1708f 32#include "avcodec.h"
de6d9b64 33
24641185 34
44eb4951 35//#define DEBUG
de6d9b64
FB
36/* dct code */
37typedef short DCTELEM;
38
03c94ede 39void fdct_ifast (DCTELEM *data);
48b1f800 40void fdct_ifast248 (DCTELEM *data);
28db7fce 41void ff_jpeg_fdct_islow (DCTELEM *data);
10acc479 42void ff_fdct248_islow (DCTELEM *data);
de6d9b64
FB
43
44void j_rev_dct (DCTELEM *data);
178fcca8 45void j_rev_dct4 (DCTELEM *data);
9ca358b9 46void j_rev_dct2 (DCTELEM *data);
1aa8c57b 47void j_rev_dct1 (DCTELEM *data);
de6d9b64 48
3f09f52a 49void ff_fdct_mmx(DCTELEM *block);
cf3bf5bb 50void ff_fdct_mmx2(DCTELEM *block);
8fd19ab2 51void ff_fdct_sse2(DCTELEM *block);
de6d9b64 52
0fa8158d
MN
53void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride);
54void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
55void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);
56
e0eac44e 57/* encoding scans */
0c1a9eda
ZK
58extern const uint8_t ff_alternate_horizontal_scan[64];
59extern const uint8_t ff_alternate_vertical_scan[64];
60extern const uint8_t ff_zigzag_direct[64];
10acc479 61extern const uint8_t ff_zigzag248_direct[64];
5a240838 62
de6d9b64 63/* pixel operations */
f2e92ef2 64#define MAX_NEG_CROP 1024
de6d9b64
FB
65
66/* temporary */
0c1a9eda
ZK
67extern uint32_t squareTbl[512];
68extern uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
de6d9b64 69
44cb64ee
MM
70/* VP3 DSP functions */
71void vp3_dsp_init_c(void);
116824d0
MM
72void vp3_idct_c(int16_t *input_data, int16_t *dequant_matrix,
73 int coeff_count, DCTELEM *output_data);
44cb64ee
MM
74
75void vp3_dsp_init_mmx(void);
116824d0
MM
76void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
77 int coeff_count, DCTELEM *output_data);
44cb64ee 78
38acbc3c 79void vp3_dsp_init_sse2(void);
116824d0
MM
80void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix,
81 int coeff_count, DCTELEM *output_data);
de6d9b64 82
b7c27ee6 83/* minimum alignment rules ;)
eb4b3dd3 84if you notice errors in the align stuff, need more alignment for some asm code for some cpu
b7c27ee6
MN
85or need to use a function with less aligned data then send a mail to the ffmpeg-dev list, ...
86
87!warning these alignments might not match reality (an attribute((align)) may be missing somewhere);
88I (michael) didn't check them, these are just the alignments which I think could be reached easily ...
de6d9b64 89
b7c27ee6
MN
90!future video codecs might need functions with less strict alignment
91*/
92
eb4b3dd3 93/*
0c1a9eda
ZK
94void get_pixels_c(DCTELEM *block, const uint8_t *pixels, int line_size);
95void diff_pixels_c(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride);
96void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
97void add_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
649c00c9 98void clear_blocks_c(DCTELEM *blocks);
eb4b3dd3 99*/
de6d9b64
FB
100
101/* add and put pixel (decoding) */
b7c27ee6 102// blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16
7d67aa9b 103//h for op_pixels_func is limited to {width/2, width} but never larger than 16 and never smaller than 4
0c1a9eda 104typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h);
669ac79c 105typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h);
0c1a9eda 106typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
0da71265 107typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
9f2d1b4f
LM
108typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
109typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets);
b3184779 110
db794953 111#define DEF_OLD_QPEL(name)\
0c1a9eda
ZK
112void ff_put_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
113void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
114void ff_avg_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
db794953
MN
115
116DEF_OLD_QPEL(qpel16_mc11_old_c)
117DEF_OLD_QPEL(qpel16_mc31_old_c)
118DEF_OLD_QPEL(qpel16_mc12_old_c)
119DEF_OLD_QPEL(qpel16_mc32_old_c)
120DEF_OLD_QPEL(qpel16_mc13_old_c)
121DEF_OLD_QPEL(qpel16_mc33_old_c)
122DEF_OLD_QPEL(qpel8_mc11_old_c)
123DEF_OLD_QPEL(qpel8_mc31_old_c)
124DEF_OLD_QPEL(qpel8_mc12_old_c)
125DEF_OLD_QPEL(qpel8_mc32_old_c)
126DEF_OLD_QPEL(qpel8_mc13_old_c)
127DEF_OLD_QPEL(qpel8_mc33_old_c)
b3184779
MN
128
/**
 * Defines a static function a() with the op_pixels_func parameter list that
 * calls b() once at the given block/pixels position and once more n bytes
 * further along both pointers, with the same line_size and h.
 * n is parenthesised so that an expression argument keeps its meaning.
 */
#define CALL_2X_PIXELS(a, b, n)\
static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    b(block      , pixels      , line_size, h);\
    b(block + (n), pixels + (n), line_size, h);\
}
44eb4951 134
de6d9b64 135/* motion estimation */
7d67aa9b
MN
136// h is limited to {width/2, width, 2*width} but never larger than 16 and never smaller than 2
137// although currently h<4 is not used, as functions with width <8 are neither used nor implemented
bb198e19 138typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/;
1457ab52 139
0da71265 140
24641185
MN
141/**
142 * DSPContext.
143 */
eb4b3dd3
ZK
144typedef struct DSPContext {
145 /* pixel ops : interface with DCT */
0c1a9eda
ZK
146 void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size);
147 void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride);
148 void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
f9ed9d85 149 void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
0c1a9eda 150 void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
d518aebd
MN
151 /**
152 * translational global motion compensation.
153 */
0c1a9eda 154 void (*gmc1)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x16, int y16, int rounder);
d518aebd
MN
155 /**
156 * global motion compensation.
157 */
0c1a9eda 158 void (*gmc )(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int ox, int oy,
eb4b3dd3
ZK
159 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
160 void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
0c1a9eda
ZK
161 int (*pix_sum)(uint8_t * pix, int line_size);
162 int (*pix_norm1)(uint8_t * pix, int line_size);
bb198e19
MN
163// 16x16 8x8 4x4 2x2 16x8 8x4 4x2 8x16 4x8 2x4
164
622348f9
MN
165 me_cmp_func sad[5]; /* identical to pix_absAxA except additional void * */
166 me_cmp_func sse[5];
167 me_cmp_func hadamard8_diff[5];
168 me_cmp_func dct_sad[5];
169 me_cmp_func quant_psnr[5];
170 me_cmp_func bit[5];
171 me_cmp_func rd[5];
172 me_cmp_func vsad[5];
173 me_cmp_func vsse[5];
e6a2ac34 174 me_cmp_func nsse[5];
26efc54e
MN
175 me_cmp_func w53[5];
176 me_cmp_func w97[5];
0fd6aea1 177 me_cmp_func dct_max[5];
1457ab52 178
bb198e19
MN
179 me_cmp_func me_pre_cmp[5];
180 me_cmp_func me_cmp[5];
181 me_cmp_func me_sub_cmp[5];
182 me_cmp_func mb_cmp[5];
622348f9 183 me_cmp_func ildct_cmp[5]; //only width 16 used
0fd6aea1 184 me_cmp_func frame_skip_cmp[5]; //only width 8 used
eb4b3dd3 185
d518aebd
MN
186 /**
187 * Halfpel motion compensation with rounding (a+b+1)>>1.
669ac79c 188 * this is an array[4][4] of motion compensation functions for 4
e5771f4f 189 * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
5755c27f 190 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
d518aebd
MN
191 * @param block destination where the result is stored
192 * @param pixels source
193 * @param line_size number of bytes in a horizontal line of block
194 * @param h height
195 */
669ac79c 196 op_pixels_func put_pixels_tab[4][4];
d518aebd
MN
197
198 /**
199 * Halfpel motion compensation with rounding (a+b+1)>>1.
4e8eed2f 200 * This is an array[4][4] of motion compensation functions for 4
e5771f4f 201 * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
5755c27f 202 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
d518aebd
MN
203 * @param block destination into which the result is averaged (a+b+1)>>1
204 * @param pixels source
205 * @param line_size number of bytes in a horizontal line of block
206 * @param h height
207 */
da3b9756 208 op_pixels_func avg_pixels_tab[4][4];
d518aebd
MN
209
210 /**
211 * Halfpel motion compensation with no rounding (a+b)>>1.
eb14c713
MN
212 * this is an array[2][4] of motion compensation functions for 2
213 * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
5755c27f 214 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
d518aebd
MN
215 * @param block destination where the result is stored
216 * @param pixels source
217 * @param line_size number of bytes in a horizontal line of block
218 * @param h height
219 */
dbc56b39 220 op_pixels_func put_no_rnd_pixels_tab[4][4];
d518aebd
MN
221
222 /**
223 * Halfpel motion compensation with no rounding (a+b)>>1.
eb14c713
MN
224 * this is an array[2][4] of motion compensation functions for 2
225 * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
5755c27f 226 * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
d518aebd
MN
227 * @param block destination into which the result is averaged (a+b)>>1
228 * @param pixels source
229 * @param line_size number of bytes in a horizontal line of block
230 * @param h height
231 */
dbc56b39 232 op_pixels_func avg_no_rnd_pixels_tab[4][4];
669ac79c 233
c0a0170c
MN
234 void (*put_no_rnd_pixels_l2[2])(uint8_t *block/*align width (8 or 16)*/, const uint8_t *a/*align 1*/, const uint8_t *b/*align 1*/, int line_size, int h);
235
669ac79c
MN
236 /**
237 * Thirdpel motion compensation with rounding (a+b+1)>>1.
238 * this is an array[11] of motion compensation functions for the 9 thirdpel positions<br>
239 * *pixels_tab[ xthirdpel + 4*ythirdpel ]
240 * @param block destination where the result is stored
241 * @param pixels source
242 * @param line_size number of bytes in a horizontal line of block
243 * @param h height
244 */
245 tpel_mc_func put_tpel_pixels_tab[11]; //FIXME individual func ptr per width?
da3b9756
MM
246 tpel_mc_func avg_tpel_pixels_tab[11]; //FIXME individual func ptr per width?
247
eb4b3dd3
ZK
248 qpel_mc_func put_qpel_pixels_tab[2][16];
249 qpel_mc_func avg_qpel_pixels_tab[2][16];
250 qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
251 qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16];
1457ab52 252 qpel_mc_func put_mspel_pixels_tab[8];
0da71265
MN
253
254 /**
255 * h264 Chroma MC
256 */
257 h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
258 h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
eb4b3dd3 259
0da71265
MN
260 qpel_mc_func put_h264_qpel_pixels_tab[3][16];
261 qpel_mc_func avg_h264_qpel_pixels_tab[3][16];
262
9f2d1b4f
LM
263 h264_weight_func weight_h264_pixels_tab[10];
264 h264_biweight_func biweight_h264_pixels_tab[10];
265
bb198e19 266 me_cmp_func pix_abs[2][4];
11f18faf
MN
267
268 /* huffyuv specific */
11f18faf 269 void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w);
1457ab52 270 void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w);
84705403
MN
271 /**
272 * subtract huffyuv's variant of median prediction
273 * note, this might read from src1[-1], src2[-1]
274 */
275 void (*sub_hfyu_median_prediction)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top);
3d2e8cce 276 void (*bswap_buf)(uint32_t *dst, uint32_t *src, int w);
42251a2a
LM
277
278 void (*h264_v_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int *tc0);
279 void (*h264_h_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int *tc0);
280 void (*h264_v_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int *tc0);
281 void (*h264_h_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int *tc0);
b0368839 282
332f9ac4
MN
283 void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
284 void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);
285
fdbbf2e0 286 void (*h261_loop_filter)(uint8_t *src, int stride);
c6148de2 287
b0368839
MN
288 /* (I)DCT */
289 void (*fdct)(DCTELEM *block/* align 16*/);
10acc479 290 void (*fdct248)(DCTELEM *block/* align 16*/);
4fb518c3
MN
291
292 /* IDCT really*/
293 void (*idct)(DCTELEM *block/* align 16*/);
24641185
MN
294
295 /**
77c92c2d 296 * block -> idct -> clip to unsigned 8 bit -> dest.
24641185 297 * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...)
a1adf436 298 * @param line_size size in bytes of a horizontal line of dest
24641185 299 */
b0368839 300 void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
24641185
MN
301
302 /**
303 * block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
a1adf436 304 * @param line_size size in bytes of a horizontal line of dest
24641185 305 */
b0368839 306 void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
24641185
MN
307
308 /**
77c92c2d 309 * idct input permutation.
05493021
MN
310 * several optimized IDCTs need a permuted input (relative to the normal order of the reference
311 * IDCT)
312 * this permutation must be performed before the idct_put/add, note, normally this can be merged
313 * with the zigzag/alternate scan<br>
24641185
MN
314 * an example to avoid confusion:
315 * - (->decode coeffs -> zigzag reorder -> dequant -> reference idct ->...)
316 * - (x -> reference dct -> reference idct -> x)
317 * - (x -> reference dct -> simple_mmx_perm = idct_permutation -> simple_idct_mmx -> x)
318 * - (->decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant -> simple_idct_mmx ->...)
319 */
b0368839
MN
320 uint8_t idct_permutation[64];
321 int idct_permutation_type;
322#define FF_NO_IDCT_PERM 1
323#define FF_LIBMPEG2_IDCT_PERM 2
324#define FF_SIMPLE_IDCT_PERM 3
325#define FF_TRANSPOSE_IDCT_PERM 4
326
364a1797
MN
327 int (*try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale);
328 void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale);
329#define BASIS_SHIFT 16
330#define RECON_SHIFT 6
331
44cb64ee
MM
332 /**
333 * This function handles any initialization for the VP3 DSP functions.
334 */
335 void (*vp3_dsp_init)(void);
336
337 /**
338 * This function is responsible for taking a block of zigzag'd,
116824d0
MM
339 * quantized DCT coefficients and reconstructing the original block of
340 * samples.
44cb64ee
MM
341 * @param input_data 64 zigzag'd, quantized DCT coefficients
342 * @param dequant_matrix 64 zigzag'd quantizer coefficients
343 * @param coeff_count index of the last coefficient
116824d0
MM
344 * @param output_samples space for 64 DCTELEMs where the transformed
345 * samples will be stored
44cb64ee 346 */
116824d0
MM
347 void (*vp3_idct)(int16_t *input_data, int16_t *dequant_matrix,
348 int coeff_count, DCTELEM *output_samples);
0fa8158d
MN
349
350 void (*h264_idct_add)(uint8_t *dst, DCTELEM *block, int stride);
eb4b3dd3
ZK
351} DSPContext;
352
59cf08ce 353void dsputil_static_init(void);
b0368839 354void dsputil_init(DSPContext* p, AVCodecContext *avctx);
de6d9b64 355
7801d21d
MN
356/**
357 * permute block according to permutation.
358 * @param last last non zero element in scantable order
359 */
0c1a9eda 360void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last);
e0eac44e 361
622348f9
MN
362void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type);
363
d8085ea7
MN
/** Multiplies c by 0x01010101, i.e. copies a byte value into all four bytes. */
#define BYTE_VEC32(c) ((c)*0x01010101UL)

/**
 * Averages the four bytes packed in a and b lane by lane, rounding up:
 * every result byte is (x + y + 1) >> 1.
 *
 * Uses (a | b) - ((a ^ b) >> 1); the low bit of each byte of a ^ b is masked
 * off before the shift so it cannot slide into the neighbouring byte.
 */
static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
{
    return (a | b) - (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
}

/**
 * Averages the four bytes packed in a and b lane by lane, rounding down:
 * every result byte is (x + y) >> 1.
 *
 * Uses (a & b) + ((a ^ b) >> 1) with the same per-byte masking as rnd_avg32().
 */
static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
{
    return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
}
375
26efc54e
MN
376static inline int get_penalty_factor(int lambda, int lambda2, int type){
377 switch(type&0xFF){
378 default:
379 case FF_CMP_SAD:
380 return lambda>>FF_LAMBDA_SHIFT;
381 case FF_CMP_DCT:
382 return (3*lambda)>>(FF_LAMBDA_SHIFT+1);
383 case FF_CMP_W53:
384 return (4*lambda)>>(FF_LAMBDA_SHIFT);
385 case FF_CMP_W97:
386 return (2*lambda)>>(FF_LAMBDA_SHIFT);
387 case FF_CMP_SATD:
388 return (2*lambda)>>FF_LAMBDA_SHIFT;
389 case FF_CMP_RD:
390 case FF_CMP_PSNR:
391 case FF_CMP_SSE:
392 case FF_CMP_NSSE:
393 return lambda2>>FF_LAMBDA_SHIFT;
394 case FF_CMP_BIT:
395 return 1;
396 }
397}
398
24641185 399/**
77c92c2d 400 * Empty mmx state.
24641185
MN
401 * this must be called between any dsp function and float/double code.
402 * for example sin(); dsp->idct_put(); emms_c(); cos()
403 */
eb4b3dd3
ZK
404#define emms_c()
405
e629ab68
RD
406/* should be defined by architectures supporting
407 one or more MultiMedia extension */
408int mm_support(void);
409
92a69cf8
MM
410#define __align16 __attribute__ ((aligned (16)))
411
3d03c0a2 412#if defined(HAVE_MMX)
de6d9b64 413
18f77016 414#undef emms_c
eb4b3dd3 415
de6d9b64
FB
416#define MM_MMX 0x0001 /* standard MMX */
417#define MM_3DNOW 0x0004 /* AMD 3DNOW */
418#define MM_MMXEXT 0x0002 /* SSE integer functions or AMD MMX ext */
419#define MM_SSE 0x0008 /* SSE functions */
420#define MM_SSE2 0x0010 /* PIV SSE2 functions */
e42a152b 421#define MM_3DNOWEXT 0x0020 /* AMD 3DNowExt */
de6d9b64
FB
422
423extern int mm_flags;
424
0c1a9eda
ZK
425void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
426void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
f9ed9d85 427void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
de6d9b64
FB
428
/**
 * Issues the x86 "emms" instruction. The asm is volatile and declares a
 * "memory" clobber, so the compiler neither drops it nor moves memory
 * accesses across it.
 */
static inline void emms(void)
{
    __asm __volatile ("emms;":::"memory");
}
433
1457ab52 434
fb16b7e7
FB
/* Calls emms() only when mm_flags has the MM_MMX bit set.
   The body is wrapped in do { } while (0) so that "emms_c();" is exactly one
   statement and can be used in an unbraced if/else; callers must supply the
   trailing semicolon. */
#define emms_c() \
do {\
    if (mm_flags & MM_MMX)\
        emms();\
} while (0)
440
441#define __align8 __attribute__ ((aligned (8)))
3237f731 442#define STRIDE_ALIGN 8
de6d9b64 443
b0368839
MN
444void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx);
445void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx);
2720569a 446
3d03c0a2
FB
447#elif defined(ARCH_ARMV4L)
448
3d03c0a2 449/* This is to use 4 bytes read to the IDCT pointers for some 'zero'
92a69cf8 450 line optimizations */
3d03c0a2 451#define __align8 __attribute__ ((aligned (4)))
3237f731 452#define STRIDE_ALIGN 4
3d03c0a2 453
b0368839 454void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx);
3d03c0a2 455
c34270f5 456#elif defined(HAVE_MLIB)
c34270f5
FB
457
458/* SPARC/VIS IDCT needs 8-byte aligned DCT blocks */
459#define __align8 __attribute__ ((aligned (8)))
3237f731 460#define STRIDE_ALIGN 8
c34270f5 461
b0368839 462void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx);
c34270f5 463
44f54ceb
MN
464#elif defined(ARCH_SPARC)
465
466/* SPARC/VIS IDCT needs 8-byte aligned DCT blocks */
467#define __align8 __attribute__ ((aligned (8)))
3237f731 468#define STRIDE_ALIGN 8
44f54ceb
MN
469void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
470
1e98dffb
NK
471#elif defined(ARCH_ALPHA)
472
1e98dffb 473#define __align8 __attribute__ ((aligned (8)))
3237f731 474#define STRIDE_ALIGN 8
1e98dffb 475
b0368839 476void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
1e98dffb 477
59925ef2
BF
478#elif defined(ARCH_POWERPC)
479
404d2241
BF
480#define MM_ALTIVEC 0x0001 /* standard AltiVec */
481
482extern int mm_flags;
483
3b991c54 484#if defined(HAVE_ALTIVEC) && !defined(CONFIG_DARWIN)
9a197a24 485#define pixel altivec_pixel
3b991c54 486#include <altivec.h>
9a197a24 487#undef pixel
3b991c54
RD
488#endif
489
59925ef2 490#define __align8 __attribute__ ((aligned (16)))
3237f731 491#define STRIDE_ALIGN 16
59925ef2 492
b0368839 493void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
59925ef2 494
d46aba26
LS
495#elif defined(HAVE_MMI)
496
d46aba26 497#define __align8 __attribute__ ((aligned (16)))
3237f731 498#define STRIDE_ALIGN 16
d46aba26 499
b0368839 500void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx);
d46aba26 501
0c6bd2ea
B
502#elif defined(ARCH_SH4)
503
504#define __align8 __attribute__ ((aligned (8)))
3237f731 505#define STRIDE_ALIGN 8
0c6bd2ea
B
506
507void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx);
508
de6d9b64
FB
509#else
510
3237f731
MN
511#define __align8 __attribute__ ((aligned (8)))
512#define STRIDE_ALIGN 8
de6d9b64
FB
513
514#endif
515
6d4985bb
FB
#ifdef __GNUC__

/* Single-member packed structs: reading or writing ->l through a pointer to
   one of these lets the compiler generate an access that does not assume the
   natural alignment of the integer type. */
struct unaligned_64 { uint64_t l; } __attribute__((packed));
struct unaligned_32 { uint32_t l; } __attribute__((packed));
struct unaligned_16 { uint16_t l; } __attribute__((packed));

/* LDnn(a): load an nn-bit value from address a. */
#define LD16(a) (((const struct unaligned_16 *) (a))->l)
#define LD32(a) (((const struct unaligned_32 *) (a))->l)
#define LD64(a) (((const struct unaligned_64 *) (a))->l)

/* ST32(a, b): store the 32-bit value b at address a. The whole assignment is
   parenthesised so the macro behaves as one expression wherever it is used. */
#define ST32(a, b) ((((struct unaligned_32 *) (a))->l) = (b))

#else /* __GNUC__ */

/* Plain pointer dereferences.
   NOTE(review): presumably only correct on targets that tolerate unaligned
   accesses -- confirm for each non-GCC platform. */
#define LD16(a) (*((const uint16_t*)(a)))
#define LD32(a) (*((const uint32_t*)(a)))
#define LD64(a) (*((const uint64_t*)(a)))

#define ST32(a, b) (*((uint32_t*)(a)) = (b))

#endif /* !__GNUC__ */
537
43f1708f 538/* PSNR */
0c1a9eda 539void get_psnr(uint8_t *orig_image[3], uint8_t *coded_image[3],
43f1708f
J
540 int orig_linesize[3], int coded_linesize,
541 AVCodecContext *avctx);
bb6f5690
FB
542
/* FFT computation */

/* NOTE: soon integer code will be added, so you must use the
   FFTSample type */
typedef float FFTSample;

/** One complex value: real part re and imaginary part im. */
typedef struct FFTComplex {
    FFTSample re, im;
} FFTComplex;

/**
 * State of one FFT instance. fft_calc is the routine that ff_fft_calc()
 * invokes with this context and the data buffer.
 */
typedef struct FFTContext {
    int nbits;
    int inverse;
    uint16_t *revtab;
    FFTComplex *exptab;
    FFTComplex *exptab1; /* only used by SSE code */
    void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
} FFTContext;
561
68951ecf
GB
562int ff_fft_init(FFTContext *s, int nbits, int inverse);
563void ff_fft_permute(FFTContext *s, FFTComplex *z);
564void ff_fft_calc_c(FFTContext *s, FFTComplex *z);
565void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
566void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
8d268a7d 567
68951ecf 568static inline void ff_fft_calc(FFTContext *s, FFTComplex *z)
bb6f5690
FB
569{
570 s->fft_calc(s, z);
571}
68951ecf 572void ff_fft_end(FFTContext *s);
bb6f5690
FB
573
574/* MDCT computation */
575
576typedef struct MDCTContext {
577 int n; /* size of MDCT (i.e. number of input data * 2) */
578 int nbits; /* n = 2^nbits */
579 /* pre/post rotation tables */
580 FFTSample *tcos;
581 FFTSample *tsin;
582 FFTContext fft;
583} MDCTContext;
584
82696bee 585int ff_mdct_init(MDCTContext *s, int nbits, int inverse);
eb4b3dd3 586void ff_imdct_calc(MDCTContext *s, FFTSample *output,
bb6f5690 587 const FFTSample *input, FFTSample *tmp);
eb4b3dd3 588void ff_mdct_calc(MDCTContext *s, FFTSample *out,
bb6f5690 589 const FFTSample *input, FFTSample *tmp);
82696bee 590void ff_mdct_end(MDCTContext *s);
bb6f5690 591
bb198e19
MN
/**
 * Defines a static function name16() with the me_cmp_func parameter list whose
 * result is name8() at the given dst/src position plus name8() at an offset of
 * 8 bytes on both pointers; stride and h are passed through unchanged.
 */
#define WARPER8_16(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    return name8(s, dst    , src    , stride, h)\
          +name8(s, dst + 8, src + 8, stride, h);\
}
597
/**
 * Defines a static function name16() with the me_cmp_func parameter list that
 * sums name8() over 8-byte-offset positions, always calling name8() with a
 * height of 8: two calls on the first 8 rows and, only when h == 16, two more
 * calls starting 8*stride bytes further down.
 */
#define WARPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int score=0;\
    score +=name8(s, dst    , src    , stride, 8);\
    score +=name8(s, dst + 8, src + 8, stride, 8);\
    if(h==16){\
        dst += 8*stride;\
        src += 8*stride;\
        score +=name8(s, dst    , src    , stride, 8);\
        score +=name8(s, dst + 8, src + 8, stride, 8);\
    }\
    return score;\
}
611
95e2ce4a 612#ifndef HAVE_LRINTF
9d85cbd9
FB
613/* XXX: add ISOC specific test to avoid specific BSD testing. */
614/* better than nothing implementation. */
6234d753 615/* btw, rintf() is existing on fbsd too -- alex */
91d6655a 616static always_inline long int lrintf(float x)
9d85cbd9 617{
ea937d01 618#ifdef CONFIG_WIN32
91d6655a
MN
619# ifdef ARCH_X86
620 int32_t i;
621 asm volatile(
622 "fistpl %0\n\t"
623 : "=m" (i) : "t" (x) : "st"
624 );
625 return i;
626# else
ea937d01 627 /* XXX: incorrect, but make it compile */
91d6655a
MN
628 return (int)(x + (x < 0 ? -0.5 : 0.5));
629# endif
ea937d01 630#else
9d85cbd9 631 return (int)(rint(x));
ea937d01 632#endif
9d85cbd9 633}
e96682e6
MN
634#else
635#ifndef _ISOC9X_SOURCE
636#define _ISOC9X_SOURCE
637#endif
638#include <math.h>
9d85cbd9
FB
639#endif
640
de6d9b64 641#endif