/*
 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/**
 * H.264 / AVC / MPEG4 part10 codec.
 * @author Michael Niedermayer <michaelni@gmx.at>
 */
#include "mpegvideo.h"
#define interlaced_dct interlaced_dct_is_a_bad_name
#define mb_intra mb_intra_isnt_initialized_see_mb_type

#define LUMA_DC_BLOCK_INDEX   25
#define CHROMA_DC_BLOCK_INDEX 26

#define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
#define COEFF_TOKEN_VLC_BITS           8
#define TOTAL_ZEROS_VLC_BITS           9
#define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
#define RUN_VLC_BITS                   3
#define RUN7_VLC_BITS                  6

#define MAX_SPS_COUNT 32
#define MAX_PPS_COUNT 256

#define MAX_MMCO_COUNT 66
/**
 * Sequence parameter set
 */
typedef struct SPS{
    int transform_bypass;              ///< qpprime_y_zero_transform_bypass_flag
    int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
    int poc_type;                      ///< pic_order_cnt_type
    int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4
    int delta_pic_order_always_zero_flag;
    int offset_for_non_ref_pic;
    int offset_for_top_to_bottom_field;
    int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
    int ref_frame_count;               ///< num_ref_frames
    int gaps_in_frame_num_allowed_flag;
    int mb_width;                      ///< frame_width_in_mbs_minus1 + 1
    int mb_height;                     ///< frame_height_in_mbs_minus1 + 1
    int frame_mbs_only_flag;
    int mb_aff;                        ///< mb_adaptive_frame_field_flag
    int direct_8x8_inference_flag;
    int crop;                          ///< frame_cropping_flag
    int crop_left;                     ///< frame_cropping_rect_left_offset
    int crop_right;                    ///< frame_cropping_rect_right_offset
    int crop_top;                      ///< frame_cropping_rect_top_offset
    int crop_bottom;                   ///< frame_cropping_rect_bottom_offset
    int vui_parameters_present_flag;
    int timing_info_present_flag;
    uint32_t num_units_in_tick;
    int fixed_frame_rate_flag;
    short offset_for_ref_frame[256];   //FIXME dyn alloc?
    int bitstream_restriction_flag;
    int num_reorder_frames;
}SPS;
/**
 * Picture parameter set
 */
typedef struct PPS{
    int cabac;                  ///< entropy_coding_mode_flag
    int pic_order_present;      ///< pic_order_present_flag
    int slice_group_count;      ///< num_slice_groups_minus1 + 1
    int mb_slice_group_map_type;
    int ref_count[2];           ///< num_ref_idx_l0/1_active_minus1 + 1
    int weighted_pred;          ///< weighted_pred_flag
    int weighted_bipred_idc;
    int init_qp;                ///< pic_init_qp_minus26 + 26
    int init_qs;                ///< pic_init_qs_minus26 + 26
    int chroma_qp_index_offset;
    int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
    int constrained_intra_pred;    ///< constrained_intra_pred_flag
    int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
    int transform_8x8_mode;        ///< transform_8x8_mode_flag
}PPS;
/**
 * Memory management control operation opcode.
 */
typedef enum MMCOOpcode{
    /* ... */
} MMCOOpcode;
/**
 * Memory management control operation.
 */
typedef struct MMCO{
    /* ... */
} MMCO;
typedef struct H264Context{
    MpegEncContext s;
    int nal_ref_idc;
    int nal_unit_type;
#define NAL_IDR_SLICE 5
#define NAL_PICTURE_DELIMITER 9
#define NAL_FILTER_DATA 10
    uint8_t *rbsp_buffer;
    int rbsp_buffer_size;

    /**
     * Used to parse AVC variant of h264
     */
    int is_avc;          ///< this flag is != 0 if codec is avc1
    int got_avcC;        ///< flag used to parse avcC data only once
    int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
    int prev_mb_skipped; //FIXME remove (IMHO not used)

    int chroma_pred_mode;
    int intra16x16_pred_mode;

    int8_t intra4x4_pred_mode_cache[5*8];
    int8_t (*intra4x4_pred_mode)[8];
    void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
    void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
    void (*pred8x8  [4+3])(uint8_t *src, int stride);
    void (*pred16x16[4+3])(uint8_t *src, int stride);
    unsigned int topleft_samples_available;
    unsigned int top_samples_available;
    unsigned int topright_samples_available;
    unsigned int left_samples_available;
    uint8_t (*top_borders[2])[16+2*8];
    uint8_t left_border[2*(17+2*9)];
    /**
     * non zero coeff count cache.
     * is 64 if not available.
     */
    uint8_t non_zero_count_cache[6*8] __align8;
    uint8_t (*non_zero_count)[16];

    /**
     * Motion vector cache.
     */
    int16_t mv_cache[2][5*8][2] __align8;
    int8_t ref_cache[2][5*8] __align8;
#define LIST_NOT_USED -1 //FIXME rename?
#define PART_NOT_AVAILABLE -2

    /**
     * is 1 if the specific list MV&references are set to 0,0,-2.
     */
    int mv_cache_clean[2];

    /**
     * number of neighbors (top and/or left) that used 8x8 dct
     */
    int neighbor_transform_size;

    /**
     * block_offset[ 0..23] for frame macroblocks
     * block_offset[24..47] for field macroblocks
     */
    int block_offset[2*(16+8)];

    uint32_t *mb2b_xy;  //FIXME are these 4 a good idea?
    uint32_t *mb2b8_xy;
    int b_stride;       //FIXME use s->b4_stride
    int b8_stride;
    int unknown_svq3_flag;
    int next_slice_index;

    SPS sps_buffer[MAX_SPS_COUNT];
    SPS sps; ///< current sps

    PPS pps_buffer[MAX_PPS_COUNT];
    PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?

    uint16_t (*dequant4_coeff)[16]; //FIXME quant matrices should be per SPS or PPS
    uint16_t (*dequant8_coeff)[64];

    uint8_t *slice_table_base;
    uint8_t *slice_table; ///< slice_table_base + mb_stride + 1
    int slice_num;
    int slice_type;
    int slice_type_fixed;

    //interlacing specific flags
    int mb_aff_frame;
    int mb_field_decoding_flag;

    int delta_poc_bottom;
    int prev_poc_msb;          ///< poc_msb of the last reference pic for POC type 0
    int prev_poc_lsb;          ///< poc_lsb of the last reference pic for POC type 0
    int frame_num_offset;      ///< for POC type 2
    int prev_frame_num_offset; ///< for POC type 2
    int prev_frame_num;        ///< frame_num of the last pic for POC type 1/2

    /**
     * frame_num for frames or 2*frame_num for field pics.
     */
    int frame_num;

    /**
     * max_frame_num or 2*max_frame_num for field pics.
     */
    int max_frame_num;
    //Weighted pred stuff
    int use_weight_chroma;
    int luma_log2_weight_denom;
    int chroma_log2_weight_denom;
    int luma_weight[2][16];
    int luma_offset[2][16];
    int chroma_weight[2][16][2];
    int chroma_offset[2][16][2];
    int implicit_weight[16][16];
    int deblocking_filter;     ///< disable_deblocking_filter_idc with 1<->0
    int slice_alpha_c0_offset;
    int slice_beta_offset;

    int redundant_pic_count;

    int direct_spatial_mv_pred;
    int dist_scale_factor[16];
    int map_col_to_list0[2][16];
    /**
     * num_ref_idx_l0/1_active_minus1 + 1
     */
    int ref_count[2];   //FIXME split for AFF
    Picture *short_ref[32];
    Picture *long_ref[32];
    Picture default_ref_list[2][32];
    Picture ref_list[2][32];       //FIXME size?
    Picture field_ref_list[2][32]; //FIXME size?
    Picture *delayed_pic[16];      //FIXME size?
    Picture *delayed_output_pic;
    /**
     * memory management control operations buffer.
     */
    MMCO mmco[MAX_MMCO_COUNT];

    int long_ref_count;  ///< number of actual long term references
    int short_ref_count; ///< number of actual short term references
    GetBitContext intra_gb;
    GetBitContext inter_gb;
    GetBitContext *intra_gb_ptr;
    GetBitContext *inter_gb_ptr;

    DCTELEM mb[16*24] __align8;

    uint8_t cabac_state[460];
    /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
    uint16_t *cbp_table;
    int top_cbp;
    int left_cbp;

    /* chroma_pred_mode for i4x4 or i16x16, else 0 */
    uint8_t *chroma_pred_mode_table;
    int last_qscale_diff;
    int16_t (*mvd_table[2])[2];
    int16_t mvd_cache[2][5*8][2] __align8;
    uint8_t *direct_table;
    uint8_t direct_cache[5*8];

    uint8_t zigzag_scan[16];
    uint8_t field_scan[16];
    const uint8_t *zigzag_scan_q0;
    const uint8_t *field_scan_q0;
}H264Context;
static VLC coeff_token_vlc[4];
static VLC chroma_dc_coeff_token_vlc;

static VLC total_zeros_vlc[15];
static VLC chroma_dc_total_zeros_vlc[3];

static VLC run_vlc[6];
static VLC run7_vlc;
static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
static inline uint32_t pack16to32(int a, int b){
#ifdef WORDS_BIGENDIAN
    return (b&0xFFFF) + (a<<16);
#else
    return (a&0xFFFF) + (b<<16);
#endif
}
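/* Added illustration (not from the original source): pack16to32() packs two
 * 16-bit motion vector components into one 32-bit word in memory order, so a
 * single uint32_t store writes the same bytes as two consecutive int16_t
 * stores. On a little-endian host, pack16to32(1, -2) == 0xFFFE0001. */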
/**
 * @param h height of the rectangle, should be a constant
 * @param w width of the rectangle, should be a constant
 * @param size the size of val (1 or 4), should be a constant
 */
static inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ //FIXME ensure this IS inlined
    uint8_t *p= (uint8_t*)vp;
    assert(size==1 || size==4);

    assert((((int)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
    assert((stride&(w-1))==0);
//FIXME check what gcc generates for 64 bit on x86 and possibly write a 32 bit ver of it
    if(w==2 && h==2){
        *(uint16_t*)(p + 0*stride)=
        *(uint16_t*)(p + stride)= size==4 ? val : val*0x0101;
    }else if(w==2 && h==4){
        *(uint16_t*)(p + 0*stride)=
        *(uint16_t*)(p + 1*stride)=
        *(uint16_t*)(p + 2*stride)=
        *(uint16_t*)(p + 3*stride)= size==4 ? val : val*0x0101;
    }else if(w==4 && h==1){
        *(uint32_t*)(p + 0*stride)= size==4 ? val : val*0x01010101;
    }else if(w==4 && h==2){
        *(uint32_t*)(p + 0*stride)=
        *(uint32_t*)(p + 1*stride)= size==4 ? val : val*0x01010101;
    }else if(w==4 && h==4){
        *(uint32_t*)(p + 0*stride)=
        *(uint32_t*)(p + 1*stride)=
        *(uint32_t*)(p + 2*stride)=
        *(uint32_t*)(p + 3*stride)= size==4 ? val : val*0x01010101;
    }else if(w==8 && h==1){
        *(uint32_t*)(p + 0)=
        *(uint32_t*)(p + 4)= size==4 ? val : val*0x01010101;
    }else if(w==8 && h==2){
        *(uint32_t*)(p + 0 + 0*stride)=
        *(uint32_t*)(p + 4 + 0*stride)=
        *(uint32_t*)(p + 0 + 1*stride)=
        *(uint32_t*)(p + 4 + 1*stride)= size==4 ? val : val*0x01010101;
    }else if(w==8 && h==4){
        *(uint64_t*)(p + 0*stride)=
        *(uint64_t*)(p + 1*stride)=
        *(uint64_t*)(p + 2*stride)=
        *(uint64_t*)(p + 3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
    }else if(w==16 && h==2){
        *(uint64_t*)(p + 0+0*stride)=
        *(uint64_t*)(p + 8+0*stride)=
        *(uint64_t*)(p + 0+1*stride)=
        *(uint64_t*)(p + 8+1*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
    }else if(w==16 && h==4){
        *(uint64_t*)(p + 0+0*stride)=
        *(uint64_t*)(p + 8+0*stride)=
        *(uint64_t*)(p + 0+1*stride)=
        *(uint64_t*)(p + 8+1*stride)=
        *(uint64_t*)(p + 0+2*stride)=
        *(uint64_t*)(p + 8+2*stride)=
        *(uint64_t*)(p + 0+3*stride)=
        *(uint64_t*)(p + 8+3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
    }else
        assert(0);
}
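/* Usage sketch (added, mirrors the calls made later in this file): the caches
 * below use a stride of 8, so one call covers a whole macroblock's worth of
 * 4x4-block entries, e.g.
 *     fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
 * writes the byte 'ref' into a 4x4 region of the reference cache. */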
static inline void fill_caches(H264Context *h, int mb_type, int for_deblock){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    int topleft_xy, top_xy, topright_xy, left_xy[2];
    int topleft_type, top_type, topright_type, left_type[2];
    int i;

    //FIXME deblocking can skip fill_caches much of the time with multiple slices too.
    // the actual condition is whether we're on the edge of a slice,
    // and even then the intra and nnz parts are unnecessary.
    if(for_deblock && h->slice_num == 1)
        return;

    //wow, what a mess; why didn't they simplify the interlacing & intra stuff? I can't imagine that these complex rules are worth it.

    top_xy     = mb_xy  - s->mb_stride;
    topleft_xy = top_xy - 1;
    topright_xy= top_xy + 1;
    left_xy[1] = left_xy[0] = mb_xy-1;
    if(h->mb_aff_frame){
        const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
        const int top_pair_xy      = pair_xy     - s->mb_stride;
        const int topleft_pair_xy  = top_pair_xy - 1;
        const int topright_pair_xy = top_pair_xy + 1;
        const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
        const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
        const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
        const int left_mb_frame_flag     = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
        const int curr_mb_frame_flag     = !IS_INTERLACED(mb_type);
        const int bottom = (s->mb_y & 1);
        tprintf("fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
        if (bottom
                ? !curr_mb_frame_flag                          // bottom macroblock
                : (!curr_mb_frame_flag && !top_mb_frame_flag)  // top macroblock
           ) {
            top_xy -= s->mb_stride;
        }
        if (bottom
                ? !curr_mb_frame_flag                              // bottom macroblock
                : (!curr_mb_frame_flag && !topleft_mb_frame_flag)  // top macroblock
           ) {
            topleft_xy -= s->mb_stride;
        }
        if (bottom
                ? !curr_mb_frame_flag                               // bottom macroblock
                : (!curr_mb_frame_flag && !topright_mb_frame_flag)  // top macroblock
           ) {
            topright_xy -= s->mb_stride;
        }
        if (left_mb_frame_flag != curr_mb_frame_flag) {
            left_xy[1] = left_xy[0] = pair_xy - 1;
            if (curr_mb_frame_flag) {
                /* ... */
            } else {
                left_xy[1] += s->mb_stride;
            }
        }
    }

    h->top_mb_xy = top_xy;
    h->left_mb_xy[0] = left_xy[0];
    h->left_mb_xy[1] = left_xy[1];
    if(for_deblock){
        topleft_type = h->slice_table[topleft_xy ] < 255 ? s->current_picture.mb_type[topleft_xy] : 0;
        top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
        topright_type= h->slice_table[topright_xy] < 255 ? s->current_picture.mb_type[topright_xy]: 0;
        left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
        left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
    }else{
        topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
        top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
        topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
        left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
        left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
    }
    if(IS_INTRA(mb_type)){
        h->topleft_samples_available=
        h->top_samples_available=
        h->left_samples_available= 0xFFFF;
        h->topright_samples_available= 0xEEEA;

        if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
            h->topleft_samples_available= 0xB3FF;
            h->top_samples_available= 0x33FF;
            h->topright_samples_available= 0x26EA;
        }
        for(i=0; i<2; i++){
            if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
                h->topleft_samples_available&= 0xDF5F;
                h->left_samples_available&= 0x5F5F;
            }
        }

        if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
            h->topleft_samples_available&= 0x7FFF;

        if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
            h->topright_samples_available&= 0xFBFF;
    }
    if(IS_INTRA4x4(mb_type)){
        if(IS_INTRA4x4(top_type)){
            h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
            h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
            h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
            h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
        }else{
            int pred;
            if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
                pred= -1;
            else
                pred= 2;
            h->intra4x4_pred_mode_cache[4+8*0]=
            h->intra4x4_pred_mode_cache[5+8*0]=
            h->intra4x4_pred_mode_cache[6+8*0]=
            h->intra4x4_pred_mode_cache[7+8*0]= pred;
        }
        for(i=0; i<2; i++){
            if(IS_INTRA4x4(left_type[i])){
                h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
                h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
            }else{
                int pred;
                if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
                    pred= -1;
                else
                    pred= 2;
                h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
                h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
            }
        }
    }
//FIXME constrained_intra_pred & partitioning & nnz (let's hope this is just a typo in the spec)
    if(top_type){
        h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
        h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
        h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
        h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];

        h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
        h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];

        h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
        h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
    }else{
        h->non_zero_count_cache[4+8*0]=
        h->non_zero_count_cache[5+8*0]=
        h->non_zero_count_cache[6+8*0]=
        h->non_zero_count_cache[7+8*0]=

        h->non_zero_count_cache[1+8*0]=
        h->non_zero_count_cache[2+8*0]=

        h->non_zero_count_cache[1+8*3]=
        h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
    }

    for (i=0; i<2; i++) {
        if(left_type[i]){
            h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
            h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
            h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
            h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
        }else{
            h->non_zero_count_cache[3+8*1 + 2*8*i]=
            h->non_zero_count_cache[3+8*2 + 2*8*i]=
            h->non_zero_count_cache[0+8*1 +   8*i]=
            h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
        }
    }
    if( h->pps.cabac ) {
        // top_cbp
        if(top_type) {
            h->top_cbp = h->cbp_table[top_xy];
        } else if(IS_INTRA(mb_type)) {
            h->top_cbp = 0x1C0;
        } else {
            h->top_cbp = 0;
        }
        // left_cbp
        if (left_type[0]) {
            h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
        } else if(IS_INTRA(mb_type)) {
            h->left_cbp = 0x1C0;
        } else {
            h->left_cbp = 0;
        }
        if (left_type[0]) {
            h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
        }
        if (left_type[1]) {
            h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
        }
    }
//FIXME direct mb can skip much of this
    if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
        int list;
        for(list=0; list<1+(h->slice_type==B_TYPE); list++){
            if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
                /*if(!h->mv_cache_clean[list]){
                    memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
                    memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
                    h->mv_cache_clean[list]= 1;
                }*/
                continue;
            }
            h->mv_cache_clean[list]= 0;
            if(IS_INTER(top_type)){
                const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
                *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
                *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
                *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
                *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
                h->ref_cache[list][scan8[0] + 0 - 1*8]=
                h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
                h->ref_cache[list][scan8[0] + 2 - 1*8]=
                h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
            }else{
                *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]=
                *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]=
                *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]=
                *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= 0;
                *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
            }
            //FIXME unify cleanup or sth
            if(IS_INTER(left_type[0])){
                const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
                const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1]];
                h->ref_cache[list][scan8[0] - 1 + 0*8]=
                h->ref_cache[list][scan8[0] - 1 + 1*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0]>>1)];
            }else{
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]=
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= 0;
                h->ref_cache[list][scan8[0] - 1 + 0*8]=
                h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
            }

            if(IS_INTER(left_type[1])){
                const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
                const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[3]];
                h->ref_cache[list][scan8[0] - 1 + 2*8]=
                h->ref_cache[list][scan8[0] - 1 + 3*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[2]>>1)];
            }else{
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]=
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= 0;
                h->ref_cache[list][scan8[0] - 1 + 2*8]=
                h->ref_cache[list][scan8[0] - 1 + 3*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
                assert((!left_type[0]) == (!left_type[1]));
            }
            if(for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred))
                continue;

            if(IS_INTER(topleft_type)){
                const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
                const int b8_xy = h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
                h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
            }else{
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
                h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
            }

            if(IS_INTER(topright_type)){
                const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
                const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
                *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
                h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
            }else{
                *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= 0;
                h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
            }

            h->ref_cache[list][scan8[5 ]+1] =
            h->ref_cache[list][scan8[7 ]+1] =
            h->ref_cache[list][scan8[13]+1] = //FIXME remove past 3 (init somewhere else)
            h->ref_cache[list][scan8[4 ]] =
            h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
            *(uint32_t*)h->mv_cache[list][scan8[5 ]+1]=
            *(uint32_t*)h->mv_cache[list][scan8[7 ]+1]=
            *(uint32_t*)h->mv_cache[list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
            *(uint32_t*)h->mv_cache[list][scan8[4 ]]=
            *(uint32_t*)h->mv_cache[list][scan8[12]]= 0;
            if( h->pps.cabac ) {
                /* XXX ugh, load mvd */
                if(IS_INTER(topleft_type)){
                    const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy];
                }else{
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= 0;
                }

                if(IS_INTER(top_type)){
                    const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
                }else{
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]=
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]=
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]=
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= 0;
                }
                if(IS_INTER(left_type[0])){
                    const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
                }else{
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]=
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= 0;
                }
                if(IS_INTER(left_type[1])){
                    const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
                }else{
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]=
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= 0;
                }
                *(uint32_t*)h->mvd_cache[list][scan8[5 ]+1]=
                *(uint32_t*)h->mvd_cache[list][scan8[7 ]+1]=
                *(uint32_t*)h->mvd_cache[list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
                *(uint32_t*)h->mvd_cache[list][scan8[4 ]]=
                *(uint32_t*)h->mvd_cache[list][scan8[12]]= 0;
                if(h->slice_type == B_TYPE){
                    fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);

                    if(IS_DIRECT(top_type)){
                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
                    }else if(IS_8X8(top_type)){
                        int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
                        h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
                        h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
                    }else{
                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
                    }

                    if(IS_DIRECT(left_type[0])){
                        h->direct_cache[scan8[0] - 1 + 0*8]=
                        h->direct_cache[scan8[0] - 1 + 2*8]= 1;
                    }else if(IS_8X8(left_type[0])){
                        int b8_xy = h->mb2b8_xy[left_xy[0]] + 1;
                        h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[b8_xy];
                        h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[b8_xy + h->b8_stride];
                    }else{
                        h->direct_cache[scan8[0] - 1 + 0*8]=
                        h->direct_cache[scan8[0] - 1 + 2*8]= 0;
                    }
                }
            }
        }
    }

    h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
}
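/* Added note on the cache layout (inferred from the indexing above): the
 * *_cache arrays are 5x8 with stride 8, and scan8[] maps 4x4-block indices
 * into them so that row 0 and column 0 hold the neighbouring macroblocks'
 * edge values; cache[scan8[0] - 8] is the block above the first 4x4 block
 * and cache[scan8[0] - 1] is the block to its left. */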
static inline void write_back_intra_pred_mode(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;

    h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
    h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
    h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
    h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
    h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
    h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
    h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
}
/**
 * checks if the top & left blocks are available if needed, and changes the dc mode so it only uses the available blocks.
 */
static inline int check_intra4x4_pred_mode(H264Context *h){
    MpegEncContext * const s = &h->s;
    static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
    static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
    int i;

    if(!(h->top_samples_available&0x8000)){
        for(i=0; i<4; i++){
            int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
            if(status<0){
                av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
                return -1;
            } else if(status){
                h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
            }
        }
    }

    if(!(h->left_samples_available&0x8000)){
        for(i=0; i<4; i++){
            int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
            if(status<0){
                av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
                return -1;
            } else if(status){
                h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
            }
        }
    }

    return 0;
} //FIXME cleanup like next
/**
 * checks if the top & left blocks are available if needed, and changes the dc mode so it only uses the available blocks.
 */
static inline int check_intra_pred_mode(H264Context *h, int mode){
    MpegEncContext * const s = &h->s;
    static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
    static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};

    if(mode < 0 || mode > 6) {
        av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
        return -1;
    }

    if(!(h->top_samples_available&0x8000)){
        mode= top[ mode ];
        if(mode<0){
            av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
            return -1;
        }
    }

    if(!(h->left_samples_available&0x8000)){
        mode= left[ mode ];
        if(mode<0){
            av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
            return -1;
        }
    }

    return mode;
}
/**
 * gets the predicted intra4x4 prediction mode.
 */
static inline int pred_intra_mode(H264Context *h, int n){
    const int index8= scan8[n];
    const int left= h->intra4x4_pred_mode_cache[index8 - 1];
    const int top = h->intra4x4_pred_mode_cache[index8 - 8];
    const int min= FFMIN(left, top);

    tprintf("mode:%d %d min:%d\n", left, top, min);

    if(min<0) return DC_PRED;
    else      return min;
}
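/* Added illustration (not from the original source): the prediction is the
 * smaller of the two neighbouring modes, e.g. left=VERT_PRED(0) and
 * top=HOR_PRED(1) predict 0; any unavailable neighbour (negative entry)
 * forces DC_PRED. */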
static inline void write_back_non_zero_count(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;

    h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
    h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
    h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
    h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
    h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
    h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
    h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];

    h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
    h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
    h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];

    h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
    h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
    h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
}
/**
 * gets the predicted number of non zero coefficients.
 * @param n block index
 */
static inline int pred_non_zero_count(H264Context *h, int n){
    const int index8= scan8[n];
    const int left= h->non_zero_count_cache[index8 - 1];
    const int top = h->non_zero_count_cache[index8 - 8];
    int i= left + top;

    if(i<64) i= (i+1)>>1;

    tprintf("pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);

    return i&31;
}
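/* Added illustration (not from the original source): with both neighbours
 * available the prediction is the rounded average, e.g. left=3, top=6 gives
 * (9+1)>>1 = 5. An unavailable neighbour is stored as 64, which pushes the
 * sum past 63 so the averaging is skipped and the &31 mask drops the
 * sentinel, leaving the other neighbour's count. */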
static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
    const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];

    if(topright_ref != PART_NOT_AVAILABLE){
        *C= h->mv_cache[list][ i - 8 + part_width ];
        return topright_ref;
    }else{
        tprintf("topright MV not available\n");

        *C= h->mv_cache[list][ i - 8 - 1 ];
        return h->ref_cache[list][ i - 8 - 1 ];
    }
}
/**
 * gets the predicted MV.
 * @param n the block index
 * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 * @param mx the x component of the predicted motion vector
 * @param my the y component of the predicted motion vector
 */
static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
    const int index8= scan8[n];
    const int top_ref=      h->ref_cache[list][ index8 - 8 ];
    const int left_ref=     h->ref_cache[list][ index8 - 1 ];
    const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
    const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
    const int16_t *C;
    int diagonal_ref, match_count;

    assert(part_width==1 || part_width==2 || part_width==4);

    diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
    match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
    tprintf("pred_motion match_count=%d\n", match_count);
    if(match_count > 1){ //most common
        *mx= mid_pred(A[0], B[0], C[0]);
        *my= mid_pred(A[1], B[1], C[1]);
    }else if(match_count==1){
        if(left_ref==ref){
            *mx= A[0];
            *my= A[1];
        }else if(top_ref==ref){
            *mx= B[0];
            *my= B[1];
        }else{
            *mx= C[0];
            *my= C[1];
        }
    }else{
        if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
            *mx= A[0];
            *my= A[1];
        }else{
            *mx= mid_pred(A[0], B[0], C[0]);
            *my= mid_pred(A[1], B[1], C[1]);
        }
    }

    tprintf("pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
}
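/* Added illustration (not from the original source): when all three
 * neighbours use the same reference, the prediction is the component-wise
 * median, e.g. A=(0,4), B=(8,0), C=(2,2) predicts (2,2). */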
/**
 * gets the directionally predicted 16x8 MV.
 * @param n the block index
 * @param mx the x component of the predicted motion vector
 * @param my the y component of the predicted motion vector
 */
static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
    if(n==0){
        const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
        const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];

        tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);

        if(top_ref == ref){
            *mx= B[0];
            *my= B[1];
            return;
        }
    }else{
        const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
        const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];

        tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);

        if(left_ref == ref){
            *mx= A[0];
            *my= A[1];
            return;
        }
    }

    //RARE
    pred_motion(h, n, 4, list, ref, mx, my);
}
/**
 * gets the directionally predicted 8x16 MV.
 * @param n the block index
 * @param mx the x component of the predicted motion vector
 * @param my the y component of the predicted motion vector
 */
static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
    if(n==0){
        const int left_ref=     h->ref_cache[list][ scan8[0] - 1 ];
        const int16_t * const A= h->mv_cache[list][ scan8[0] - 1 ];

        tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);

        if(left_ref == ref){
            *mx= A[0];
            *my= A[1];
            return;
        }
    }else{
        const int16_t *C;
        int diagonal_ref;

        diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);

        tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);

        if(diagonal_ref == ref){
            *mx= C[0];
            *my= C[1];
            return;
        }
    }

    //RARE
    pred_motion(h, n, 2, list, ref, mx, my);
}
static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
    const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
    const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];

    tprintf("pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);

    if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
       || (top_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
       || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
        *mx = *my = 0;
        return;
    }

    pred_motion(h, 0, 4, 0, 0, mx, my);
}
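/* Added note (restating the condition above): a P-skip macroblock takes the
 * zero vector whenever the top or left neighbour is unavailable or is a zero
 * vector with reference 0; only otherwise is the regular median prediction
 * used. */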
static inline void direct_dist_scale_factor(H264Context * const h){
    const int poc = h->s.current_picture_ptr->poc;
    const int poc1 = h->ref_list[1][0].poc;
    int i;
    for(i=0; i<h->ref_count[0]; i++){
        int poc0 = h->ref_list[0][i].poc;
        int td = clip(poc1 - poc0, -128, 127);
        if(td == 0 /* FIXME || pic0 is a long-term ref */){
            h->dist_scale_factor[i] = 256;
        }else{
            int tb = clip(poc - poc0, -128, 127);
            int tx = (16384 + (ABS(td) >> 1)) / td;
            h->dist_scale_factor[i] = clip((tb*tx + 32) >> 6, -1024, 1023);
        }
    }
}
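/* Added worked example (not from the original source): if the current
 * picture lies halfway between ref0 and ref1, then tb=1 and td=2 give
 * tx = (16384+1)/2 = 8192 and dist_scale_factor = (1*8192 + 32)>>6 = 128,
 * i.e. half of the 256 fixed-point unit. */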
static inline void direct_ref_list_init(H264Context * const h){
    MpegEncContext * const s = &h->s;
    Picture * const ref1 = &h->ref_list[1][0];
    Picture * const cur = s->current_picture_ptr;
    int list, i, j;
    if(cur->pict_type == I_TYPE)
        cur->ref_count[0] = 0;
    if(cur->pict_type != B_TYPE)
        cur->ref_count[1] = 0;
    for(list=0; list<2; list++){
        cur->ref_count[list] = h->ref_count[list];
        for(j=0; j<h->ref_count[list]; j++)
            cur->ref_poc[list][j] = h->ref_list[list][j].poc;
    }
    if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
        return;
    for(list=0; list<2; list++){
        for(i=0; i<ref1->ref_count[list]; i++){
            const int poc = ref1->ref_poc[list][i];
            h->map_col_to_list0[list][i] = PART_NOT_AVAILABLE;
            for(j=0; j<h->ref_count[list]; j++)
                if(h->ref_list[list][j].poc == poc){
                    h->map_col_to_list0[list][i] = j;
                    break;
                }
        }
    }
}
static inline void pred_direct_motion(H264Context * const h, int *mb_type){
    MpegEncContext * const s = &h->s;
    const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
    const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
    const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
    const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
    const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
    const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
    const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
    const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
    const int is_b8x8 = IS_8X8(*mb_type);
    int sub_mb_type;
    int i8, i4;

    if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
        /* FIXME save sub mb types from previous frames (or derive from MVs)
         * so we know exactly what block size to use */
        sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }else if(!is_b8x8 && (IS_16X16(mb_type_col) || IS_INTRA(mb_type_col))){
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
    }else{
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }
    if(!is_b8x8)
        *mb_type |= MB_TYPE_DIRECT2;

    tprintf("mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);

    if(h->direct_spatial_mv_pred){
        int ref[2];
        int mv[2][2];
        int list;

        /* ref = min(neighbors) */
        for(list=0; list<2; list++){
            int refa = h->ref_cache[list][scan8[0] - 1];
            int refb = h->ref_cache[list][scan8[0] - 8];
            int refc = h->ref_cache[list][scan8[0] - 8 + 4];
            if(refc == -2)
                refc = h->ref_cache[list][scan8[0] - 8 - 1];
            ref[list] = refa;
            if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
                ref[list] = refb;
            if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
                ref[list] = refc;
            if(ref[list] < 0)
                ref[list] = -1;
        }

        if(ref[0] < 0 && ref[1] < 0){
            ref[0] = ref[1] = 0;
            mv[0][0] = mv[0][1] =
            mv[1][0] = mv[1][1] = 0;
        }else{
            for(list=0; list<2; list++){
                if(ref[list] >= 0)
                    pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
                else
                    mv[list][0] = mv[list][1] = 0;
            }
        }

        if(ref[1] < 0){
            *mb_type &= ~MB_TYPE_P0L1;
            sub_mb_type &= ~MB_TYPE_P0L1;
        }else if(ref[0] < 0){
            *mb_type &= ~MB_TYPE_P0L0;
            sub_mb_type &= ~MB_TYPE_P0L0;
        }

        if(IS_16X16(*mb_type)){
            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref[0], 1);
            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, ref[1], 1);
            if(!IS_INTRA(mb_type_col)
               && (   (l1ref0[0] == 0 && ABS(l1mv0[0][0]) <= 1 && ABS(l1mv0[0][1]) <= 1)
                   || (l1ref0[0]  < 0 && l1ref1[0] == 0 && ABS(l1mv1[0][0]) <= 1 && ABS(l1mv1[0][1]) <= 1
                       && (h->x264_build>33 || !h->x264_build)))){
                if(ref[0] > 0)
                    fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                else
                    fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
                if(ref[1] > 0)
                    fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
                else
                    fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
            }else{
                fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
            }
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, ref[1], 1);

                /* col_zero_flag */
                if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
                                              || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
                                                  && (h->x264_build>33 || !h->x264_build)))){
                    const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
                    for(i4=0; i4<4; i4++){
                        const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                        if(ABS(mv_col[0]) <= 1 && ABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
                            if(ref[1] == 0)
                                *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
                        }
                    }
                }
            }
        }
    }else{ /* direct temporal mv pred */
        if(IS_16X16(*mb_type)){
            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
            if(IS_INTRA(mb_type_col)){
                fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
                fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
                fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
            }else{
                const int ref0 = l1ref0[0] >= 0 ? h->map_col_to_list0[0][l1ref0[0]]
                                                : h->map_col_to_list0[1][l1ref1[0]];
                const int dist_scale_factor = h->dist_scale_factor[ref0];
                const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
                int mv_l0[2];
                mv_l0[0] = (dist_scale_factor * mv_col[0] + 128) >> 8;
                mv_l0[1] = (dist_scale_factor * mv_col[1] + 128) >> 8;
                fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref0, 1);
                fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0],mv_l0[1]), 4);
                fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]), 4);
            }
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;
                int ref0, dist_scale_factor;
                const int16_t (*l1mv)[2]= l1mv0;

                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;
                if(IS_INTRA(mb_type_col)){
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    continue;
                }

                ref0 = l1ref0[x8 + y8*h->b8_stride];
                if(ref0 >= 0)
                    ref0 = h->map_col_to_list0[0][ref0];
                else{
                    ref0 = h->map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
                    l1mv= l1mv1;
                }
                dist_scale_factor = h->dist_scale_factor[ref0];

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                for(i4=0; i4<4; i4++){
                    const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                    int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
                    mv_l0[0] = (dist_scale_factor * mv_col[0] + 128) >> 8;
                    mv_l0[1] = (dist_scale_factor * mv_col[1] + 128) >> 8;
                    *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
                        pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
                }
            }
        }
    }
}
static inline void write_back_motion(H264Context *h, int mb_type){
    MpegEncContext * const s = &h->s;
    const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
    const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
    int list;

    for(list=0; list<2; list++){
        int y;
        if(!USES_LIST(mb_type, list)){
            if(1){ //FIXME skip or never read if mb_type doesn't use it
                for(y=0; y<4; y++){
                    *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]=
                    *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= 0;
                }
            }
            if( h->pps.cabac ) {
                /* FIXME needed ? */
                for(y=0; y<4; y++){
                    *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]=
                    *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= 0;
                }
            }
            for(y=0; y<2; y++){
                s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]=
                s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= LIST_NOT_USED;
            }
            continue;
        }

        for(y=0; y<4; y++){
            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
            *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
        }
        if( h->pps.cabac ) {
            for(y=0; y<4; y++){
                *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
                *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
            }
        }
        for(y=0; y<2; y++){
            s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+0 + 16*y];
            s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+2 + 16*y];
        }
    }

    if(h->slice_type == B_TYPE && h->pps.cabac){
        if(IS_8X8(mb_type)){
            h->direct_table[b8_xy+1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
            h->direct_table[b8_xy+0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
            h->direct_table[b8_xy+1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
        }
    }
}
/**
 * Decodes a network abstraction layer unit.
 * @param consumed is the number of bytes used as input
 * @param length is the length of the array
 * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
 * @returns decoded bytes, might be src+1 if no escapes
 */
static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
    int i, si, di;
    uint8_t *dst;

//    src[0]&0x80;              //forbidden bit
    h->nal_ref_idc= src[0]>>5;
    h->nal_unit_type= src[0]&0x1F;

    src++; length--;
#if 0
    for(i=0; i<length; i++)
        printf("%2X ", src[i]);
#endif
    for(i=0; i+1<length; i+=2){
        if(src[i]) continue;
        if(i>0 && src[i-1]==0) i--;
        if(i+2<length && src[i+1]==0 && src[i+2]<=3){
            if(src[i+2]!=3){
                /* startcode, so we must be past the end */
                length=i;
            }
            break;
        }
    }

    if(i>=length-1){ //no escaped 0
        *dst_length= length;
        *consumed= length+1; //+1 for the header
        return src;
    }

    h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
    dst= h->rbsp_buffer;

//printf("decoding esc\n");
    si=di=0;
    while(si<length){
        //remove escapes (very rare 1:2^22)
        if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
            if(src[si+2]==3){ //escape
                dst[di++]= 0;
                dst[di++]= 0;
                si+=3;
                continue;
            }else //next start code
                break;
        }

        dst[di++]= src[si++];
    }

    *dst_length= di;
    *consumed= si + 1;//+1 for the header
//FIXME store exact number of bits in the getbitcontext (it's needed for decoding)
    return dst;
}
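/* Added illustration (not from the original source): unescaping removes the
 * emulation prevention byte 0x03 after two zero bytes, so the escaped
 * payload 00 00 03 01 becomes 00 00 01 in the RBSP, while an unescaped
 * 00 00 0x sequence (x<=2) is treated as the next start code. */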
/**
 * @param src the data which should be escaped
 * @param dst the target buffer, dst+1 == src is allowed as a special case
 * @param length the length of the src data
 * @param dst_length the length of the dst array
 * @returns length of escaped data in bytes or -1 if an error occurred
 */
static int encode_nal(H264Context *h, uint8_t *dst, uint8_t *src, int length, int dst_length){
    int i, escape_count, si, di;
    uint8_t *temp;

    assert(dst_length>0);

    dst[0]= (h->nal_ref_idc<<5) + h->nal_unit_type;

    if(length==0) return 1;

    escape_count= 0;
    for(i=0; i<length; i+=2){
        if(src[i]) continue;
        if(i>0 && src[i-1]==0)
            i--;
        if(i+2<length && src[i+1]==0 && src[i+2]<=3){
            escape_count++;
            i+=2;
        }
    }

    if(escape_count==0){
        if(dst+1 != src)
            memcpy(dst+1, src, length);
        return length + 1;
    }

    if(length + escape_count + 1> dst_length)
        return -1;

    //this should be damn rare (hopefully)

    h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length + escape_count);
    temp= h->rbsp_buffer;
//printf("encoding esc\n");

    si= 0;
    di= 0;
    while(si < length){
        if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
            temp[di++]= 0; si++;
            temp[di++]= 0; si++;
            temp[di++]= 3;
            temp[di++]= src[si++];
        }
        else
            temp[di++]= src[si++];
    }
    memcpy(dst+1, temp, length+escape_count);

    assert(di == length+escape_count);

    return di + 1;
}
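/* Added illustration (not from the original source): escaping inserts the
 * emulation prevention byte, so the payload bytes 00 00 01 are written as
 * 00 00 03 01, guaranteeing that no false start code appears inside the
 * encoded NAL unit. */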
/**
 * write 1,10,100,1000,... for alignment; yes, it's exactly the inverse of MPEG-4.
 */
static void encode_rbsp_trailing(PutBitContext *pb){
    int length;
    put_bits(pb, 1, 1);
    length= (-put_bits_count(pb))&7;
    if(length) put_bits(pb, length, 0);
}
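/* Added worked example (not from the original source): with 13 bits already
 * written, the stop bit makes 14 and (-14)&7 = 2 zero bits pad the stream to
 * the next byte boundary. */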
/**
 * identifies the exact end of the bitstream
 * @return the length of the trailing, or 0 if damaged
 */
static int decode_rbsp_trailing(uint8_t *src){
    int v= *src;
    int r;

    tprintf("rbsp trailing %X\n", v);

    for(r=1; r<9; r++){
        if(v&1) return r;
        v>>=1;
    }
    return 0;
}
/**
 * IDCT transforms the 16 dc values and dequantizes them.
 * @param qp quantization parameter
 */
#define stride 16
static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp){
    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

//memset(block, 64, 2*256);
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= ((z0 + z3)*qmul + 2)>>2; //FIXME think about merging this into decode_residual
        block[stride*2 +offset]= ((z1 + z2)*qmul + 2)>>2;
        block[stride*8 +offset]= ((z1 - z2)*qmul + 2)>>2;
        block[stride*10+offset]= ((z0 - z3)*qmul + 2)>>2;
    }
}
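/* Added note (not from the original source): this is the 4x4 Hadamard
 * transform of the luma DC coefficients, e.g. a flat macroblock whose 16 DCs
 * all equal 16 transforms to a single value of 256 at the DC position
 * (before the qmul scaling), the other 15 positions becoming 0. */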
/**
 * DCT transforms the 16 dc values.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#undef stride
static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp){
    const int qmul= dequant_coeff[qp][0];
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= ((a+c)*qmul + 0)>>1;
    block[stride*0 + xStride*1]= ((e+b)*qmul + 0)>>1;
    block[stride*1 + xStride*0]= ((a-c)*qmul + 0)>>1;
    block[stride*1 + xStride*1]= ((e-b)*qmul + 0)>>1;
}
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
/**
 * gets the chroma qp.
 */
static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
    return chroma_qp[clip(qscale + chroma_qp_index_offset, 0, 51)];
}
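/* Added note (assuming the spec's QPc table for chroma_qp[]): the mapping is
 * the identity below 30 and then grows more slowly, so with a zero index
 * offset a luma qscale of 51 maps to a chroma qp of 39. */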
static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int stride){
    int i;
    //FIXME try int temp instead of block

    for(i=0; i<4; i++){
        const int d0= src1[0 + i*stride] - src2[0 + i*stride];
        const int d1= src1[1 + i*stride] - src2[1 + i*stride];
        const int d2= src1[2 + i*stride] - src2[2 + i*stride];
        const int d3= src1[3 + i*stride] - src2[3 + i*stride];
        const int z0= d0 + d3;
        const int z3= d0 - d3;
        const int z1= d1 + d2;
        const int z2= d1 - d2;

        block[0 + 4*i]=   z0 +   z1;
        block[1 + 4*i]= 2*z3 +   z2;
        block[2 + 4*i]=   z0 -   z1;
        block[3 + 4*i]=   z3 - 2*z2;
    }

    for(i=0; i<4; i++){
        const int z0= block[0*4 + i] + block[3*4 + i];
        const int z3= block[0*4 + i] - block[3*4 + i];
        const int z1= block[1*4 + i] + block[2*4 + i];
        const int z2= block[1*4 + i] - block[2*4 + i];

        block[0*4 + i]=   z0 +   z1;
        block[1*4 + i]= 2*z3 +   z2;
        block[2*4 + i]=   z0 -   z1;
        block[3*4 + i]=   z3 - 2*z2;
    }
}
//FIXME need to check that this doesn't overflow signed 32 bit for low qp, I am not sure; it's very close
//FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int seperate_dc){
    int i;
    const int * const quant_table= quant_coeff[qscale];
    const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
    const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
    const unsigned int threshold2= (threshold1<<1);
    int last_non_zero;

    if(seperate_dc){
        if(qscale<=18){
            const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_coeff[qscale+18][0];
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                if(level>0){
                    level= (dc_bias + level)>>(QUANT_SHIFT-2);
                    block[0]= level;
                }else{
                    level= (dc_bias - level)>>(QUANT_SHIFT-2);
                    block[0]= -level;
                }
//                last_non_zero = i;
            }else{
                block[0]=0;
            }
        }else{
            const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_table[0];
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                if(level>0){
                    level= (dc_bias + level)>>(QUANT_SHIFT+1);
                    block[0]= level;
                }else{
                    level= (dc_bias - level)>>(QUANT_SHIFT+1);
                    block[0]= -level;
                }
//                last_non_zero = i;
            }else{
                block[0]=0;
            }
        }
        last_non_zero= 0;
        i=1;
    }else{
        last_non_zero= -1;
        i=0;
    }

    for(; i<16; i++){
        const int j= scantable[i];
        int level= block[j]*quant_table[j];

//        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
//           || bias-level >= (1<<(QMAT_SHIFT - 3))){
        if(((unsigned)(level+threshold1))>threshold2){
            if(level>0){
                level= (bias + level)>>QUANT_SHIFT;
                block[j]= level;
            }else{
                level= (bias - level)>>QUANT_SHIFT;
                block[j]= -level;
            }
            last_non_zero = i;
        }else{
            block[j]=0;
        }
    }

    return last_non_zero;
}
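/**
 * The pred4x4_* functions below implement the nine Intra_4x4 prediction
 * modes: each writes a predicted 4x4 block into src, computed from already
 * decoded neighbouring samples. The topright argument is only dereferenced
 * by the modes that actually need the samples above and to the right.
 */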
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t a= ((uint32_t*)(src-stride))[0];
    ((uint32_t*)(src+0*stride))[0]= a;
    ((uint32_t*)(src+1*stride))[0]= a;
    ((uint32_t*)(src+2*stride))[0]= a;
    ((uint32_t*)(src+3*stride))[0]= a;
}
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101;
    ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101;
    ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101;
    ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101;
}
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
}
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
}
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
}
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U;
}
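/* Helpers for the directional 4x4 modes: name the neighbouring samples
   t0-t3 (top row), t4-t7 (top-right, read from topright[]) and l0-l3 (left
   column). They expect src, stride and, for LOAD_TOP_RIGHT_EDGE, topright
   to be in scope. */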
#define LOAD_TOP_RIGHT_EDGE\
    const int t4= topright[0];\
    const int t5= topright[1];\
    const int t6= topright[2];\
    const int t7= topright[3];\

#define LOAD_LEFT_EDGE\
    const int l0= src[-1+0*stride];\
    const int l1= src[-1+1*stride];\
    const int l2= src[-1+2*stride];\
    const int l3= src[-1+3*stride];\

#define LOAD_TOP_EDGE\
    const int t0= src[ 0-1*stride];\
    const int t1= src[ 1-1*stride];\
    const int t2= src[ 2-1*stride];\
    const int t3= src[ 3-1*stride];\
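/* The diagonal modes below interpolate along the prediction direction with
   the 3-tap filter (a + 2*b + c + 2) >> 2, i.e. [1 2 1]/4 with rounding;
   half-sample positions use the 2-tap average (a + b + 1) >> 1 instead. */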
static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
    const int lt= src[-1-1*stride];
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE

    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
    src[0+2*stride]=
    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
    src[0+1*stride]=
    src[1+2*stride]=
    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
    src[0+0*stride]=
    src[1+1*stride]=
    src[2+2*stride]=
    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
    src[1+0*stride]=
    src[2+1*stride]=
    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[2+0*stride]=
    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
}
static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
    LOAD_TOP_EDGE
    LOAD_TOP_RIGHT_EDGE

    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
    src[1+0*stride]=
    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
    src[2+0*stride]=
    src[1+1*stride]=
    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
    src[3+0*stride]=
    src[2+1*stride]=
    src[1+2*stride]=
    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
    src[3+1*stride]=
    src[2+2*stride]=
    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
    src[3+2*stride]=
    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
    src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
}
static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
    const int lt= src[-1-1*stride];
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE
    const __attribute__((unused)) int unu= l3;

    src[0+0*stride]=
    src[1+2*stride]=(lt + t0 + 1)>>1;
    src[1+0*stride]=
    src[2+2*stride]=(t0 + t1 + 1)>>1;
    src[2+0*stride]=
    src[3+2*stride]=(t1 + t2 + 1)>>1;
    src[3+0*stride]=(t2 + t3 + 1)>>1;
    src[0+1*stride]=
    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
    src[1+1*stride]=
    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[2+1*stride]=
    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
}
static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
    LOAD_TOP_EDGE
    LOAD_TOP_RIGHT_EDGE
    const __attribute__((unused)) int unu= t7;

    src[0+0*stride]=(t0 + t1 + 1)>>1;
    src[1+0*stride]=
    src[0+2*stride]=(t1 + t2 + 1)>>1;
    src[2+0*stride]=
    src[1+2*stride]=(t2 + t3 + 1)>>1;
    src[3+0*stride]=
    src[2+2*stride]=(t3 + t4 + 1)>>1;
    src[3+2*stride]=(t4 + t5 + 1)>>1;
    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[1+1*stride]=
    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
    src[2+1*stride]=
    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
    src[3+1*stride]=
    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
}
static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
    LOAD_LEFT_EDGE

    src[0+0*stride]=(l0 + l1 + 1)>>1;
    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
    src[2+0*stride]=
    src[0+1*stride]=(l1 + l2 + 1)>>1;
    src[3+0*stride]=
    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
    src[2+1*stride]=
    src[0+2*stride]=(l2 + l3 + 1)>>1;
    src[3+1*stride]=
    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
    src[2+2*stride]=
    src[3+2*stride]=
    src[0+3*stride]=
    src[1+3*stride]=
    src[2+3*stride]=
    src[3+3*stride]=l3;
}
static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
    const int lt= src[-1-1*stride];
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE
    const __attribute__((unused)) int unu= t3;

    src[0+0*stride]=
    src[2+1*stride]=(lt + l0 + 1)>>1;
    src[1+0*stride]=
    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[0+1*stride]=
    src[2+2*stride]=(l0 + l1 + 1)>>1;
    src[1+1*stride]=
    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
    src[0+2*stride]=
    src[2+3*stride]=(l1 + l2 + 1)>>1;
    src[1+2*stride]=
    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
    src[0+3*stride]=(l2 + l3 + 1)>>1;
    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
}
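/**
 * The pred16x16_* functions implement the four Intra_16x16 prediction modes
 * (vertical, horizontal, DC, plane) plus the DC fallbacks for missing left
 * and/or top neighbours (left_dc, top_dc, 128_dc). They all store 32 bits at
 * a time, so src is assumed to be at least 4-byte aligned.
 */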
static void pred16x16_vertical_c(uint8_t *src, int stride){
    int i;
    const uint32_t a= ((uint32_t*)(src-stride))[0];
    const uint32_t b= ((uint32_t*)(src-stride))[1];
    const uint32_t c= ((uint32_t*)(src-stride))[2];
    const uint32_t d= ((uint32_t*)(src-stride))[3];

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]= a;
        ((uint32_t*)(src+i*stride))[1]= b;
        ((uint32_t*)(src+i*stride))[2]= c;
        ((uint32_t*)(src+i*stride))[3]= d;
    }
}
static void pred16x16_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101;
    }
}
static void pred16x16_dc_c(uint8_t *src, int stride){
    int i, dc=0;

    for(i=0; i<16; i++){
        dc+= src[-1+i*stride];
    }

    for(i=0; i<16; i++){
        dc+= src[i-stride];
    }

    dc= 0x01010101*((dc + 16)>>5);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
static void pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, dc=0;

    for(i=0; i<16; i++){
        dc+= src[-1+i*stride];
    }

    dc= 0x01010101*((dc + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
static void pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, dc=0;

    for(i=0; i<16; i++){
        dc+= src[i-stride];
    }

    dc= 0x01010101*((dc + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
static void pred16x16_128_dc_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U;
    }
}
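/**
 * Plane (gradient) prediction: fits horizontal and vertical gradients H and
 * V to the border samples, then extrapolates them over the whole block,
 * clipping through cm/cropTbl. The svq3 flag selects SVQ3's slightly
 * different gradient scaling (which also requires the H/V swap noted in the
 * code).
 */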
static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
    int i, j, k;
    int a;
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
    const uint8_t * const src0 = src+7-stride;
    const uint8_t *src1 = src+8*stride-1;
    const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
    int H = src0[1] - src0[-1];
    int V = src1[0] - src2[ 0];
    for(k=2; k<=8; ++k) {
        src1 += stride; src2 -= stride;
        H += k*(src0[k] - src0[-k]);
        V += k*(src1[0] - src2[ 0]);
    }
    if(svq3){
        H = ( 5*(H/4) ) / 16;
        V = ( 5*(V/4) ) / 16;

        /* required for 100% accuracy */
        i = H; H = V; V = i;
    }else{
        H = ( 5*H+32 ) >> 6;
        V = ( 5*V+32 ) >> 6;
    }

    a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
    for(j=16; j>0; --j) {
        int b = a;
        a += V;
        for(i=-16; i<0; i+=4) {
            src[16+i] = cm[ (b    ) >> 5 ];
            src[17+i] = cm[ (b+  H) >> 5 ];
            src[18+i] = cm[ (b+2*H) >> 5 ];
            src[19+i] = cm[ (b+3*H) >> 5 ];
            b += 4*H;
        }
        src += stride;
    }
}
static void pred16x16_plane_c(uint8_t *src, int stride){
    pred16x16_plane_compat_c(src, stride, 0);
}
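/**
 * The pred8x8_* functions are the chroma 8x8 prediction modes. DC prediction
 * works per 4x4 quadrant: each quadrant averages whichever of its left/top
 * neighbours are available, which is why pred8x8_dc_c keeps the four
 * separate sums dc0..dc3.
 */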
static void pred8x8_vertical_c(uint8_t *src, int stride){
    int i;
    const uint32_t a= ((uint32_t*)(src-stride))[0];
    const uint32_t b= ((uint32_t*)(src-stride))[1];

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= a;
        ((uint32_t*)(src+i*stride))[1]= b;
    }
}
static void pred8x8_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101;
    }
}
static void pred8x8_128_dc_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U;
    }
}
static void pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int dc0, dc2;

    dc0=dc2=0;
    for(i=0; i<4; i++){
        dc0+= src[-1+i*stride];
        dc2+= src[-1+(i+4)*stride];
    }
    dc0= 0x01010101*((dc0 + 2)>>2);
    dc2= 0x01010101*((dc2 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc0;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc2;
    }
}
static void pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int dc0, dc1;

    dc0=dc1=0;
    for(i=0; i<4; i++){
        dc0+= src[i-stride];
        dc1+= src[4+i-stride];
    }
    dc0= 0x01010101*((dc0 + 2)>>2);
    dc1= 0x01010101*((dc1 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
}
static void pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int dc0, dc1, dc2, dc3;

    dc0=dc1=dc2=0;
    for(i=0; i<4; i++){
        dc0+= src[-1+i*stride] + src[i-stride];
        dc1+= src[4+i-stride];
        dc2+= src[-1+(i+4)*stride];
    }
    dc3= 0x01010101*((dc1 + dc2 + 4)>>3);
    dc0= 0x01010101*((dc0 + 4)>>3);
    dc1= 0x01010101*((dc1 + 2)>>2);
    dc2= 0x01010101*((dc2 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc2;
        ((uint32_t*)(src+i*stride))[1]= dc3;
    }
}
static void pred8x8_plane_c(uint8_t *src, int stride){
    int j, k;
    int a;
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
    const uint8_t * const src0 = src+3-stride;
    const uint8_t *src1 = src+4*stride-1;
    const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
    int H = src0[1] - src0[-1];
    int V = src1[0] - src2[ 0];
    for(k=2; k<=4; ++k) {
        src1 += stride; src2 -= stride;
        H += k*(src0[k] - src0[-k]);
        V += k*(src1[0] - src2[ 0]);
    }
    H = ( 17*H+16 ) >> 5;
    V = ( 17*V+16 ) >> 5;

    a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
    for(j=8; j>0; --j) {
        int b = a;
        a += V;
        src[0] = cm[ (b    ) >> 5 ];
        src[1] = cm[ (b+  H) >> 5 ];
        src[2] = cm[ (b+2*H) >> 5 ];
        src[3] = cm[ (b+3*H) >> 5 ];
        src[4] = cm[ (b+4*H) >> 5 ];
        src[5] = cm[ (b+5*H) >> 5 ];
        src[6] = cm[ (b+6*H) >> 5 ];
        src[7] = cm[ (b+7*H) >> 5 ];
        src += stride;
    }
}
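/**
 * The pred8x8l_* functions below are the 8x8 luma prediction modes added
 * with the High profile's 8x8 transform. Unlike the 4x4 modes, the reference
 * samples are low-pass filtered first; the PREDICT_8x8_LOAD_* macros perform
 * that filtering and use has_topleft/has_topright to substitute for
 * unavailable neighbours.
 */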
#define SRC(x,y) src[(x)+(y)*stride]

#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 attribute_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 attribute_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        ((uint32_t*)src)[0] = \
        ((uint32_t*)src)[1] = v; \
        src += stride; \
    }
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_DC(0x80808080);
}
static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_LEFT;
    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
    PREDICT_8x8_DC(dc);
}
static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
    PREDICT_8x8_DC(dc);
}
static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOP;
    const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
                         +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
    PREDICT_8x8_DC(dc);
}
static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_LEFT;
#define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
               ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
#undef ROW
}
static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    int y;
    PREDICT_8x8_LOAD_TOP;
    src[0] = t0;
    src[1] = t1;
    src[2] = t2;
    src[3] = t3;
    src[4] = t4;
    src[5] = t5;
    src[6] = t6;
    src[7] = t7;
    for( y = 1; y < 8; y++ )
        *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
}
static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
}
static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
}
static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(7,0)= (t6 + t7 + 1) >> 1;
}
static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l6 + l7 + 1) >> 1;
    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
}
static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + t1 + 1) >> 1;
    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;