/*
 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

/**
 * @file h264.c
 * H.264 / AVC / MPEG4 part10 codec.
 * @author Michael Niedermayer <michaelni@gmx.at>
 */

#include "common.h"
#include "dsputil.h"
#include "avcodec.h"
#include "mpegvideo.h"
#include "h264data.h"
#include "golomb.h"

#include "cabac.h"

//#undef NDEBUG
#include <assert.h>

#define interlaced_dct interlaced_dct_is_a_bad_name
#define mb_intra mb_intra_isnt_initalized_see_mb_type

#define LUMA_DC_BLOCK_INDEX   25
#define CHROMA_DC_BLOCK_INDEX 26

#define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
#define COEFF_TOKEN_VLC_BITS           8
#define TOTAL_ZEROS_VLC_BITS           9
#define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
#define RUN_VLC_BITS                   3
#define RUN7_VLC_BITS                  6

#define MAX_SPS_COUNT 32
#define MAX_PPS_COUNT 256

#define MAX_MMCO_COUNT 66

/* Compiling in interlaced support reduces the speed
 * of progressive decoding by about 2%. */
#define ALLOW_INTERLACE

#ifdef ALLOW_INTERLACE
#define MB_MBAFF h->mb_mbaff
#define MB_FIELD h->mb_field_decoding_flag
#define FRAME_MBAFF h->mb_aff_frame
#else
#define MB_MBAFF 0
#define MB_FIELD 0
#define FRAME_MBAFF 0
#undef  IS_INTERLACED
#define IS_INTERLACED(mb_type) 0
#endif

/**
 * Sequence parameter set
 */
typedef struct SPS{

    int profile_idc;
    int level_idc;
    int transform_bypass;              ///< qpprime_y_zero_transform_bypass_flag
    int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
    int poc_type;                      ///< pic_order_cnt_type
    int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4
    int delta_pic_order_always_zero_flag;
    int offset_for_non_ref_pic;
    int offset_for_top_to_bottom_field;
    int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
    int ref_frame_count;               ///< num_ref_frames
    int gaps_in_frame_num_allowed_flag;
    int mb_width;                      ///< frame_width_in_mbs_minus1 + 1
    int mb_height;                     ///< frame_height_in_mbs_minus1 + 1
    int frame_mbs_only_flag;
    int mb_aff;                        ///< mb_adaptive_frame_field_flag
    int direct_8x8_inference_flag;
    int crop;                          ///< frame_cropping_flag
    int crop_left;                     ///< frame_cropping_rect_left_offset
    int crop_right;                    ///< frame_cropping_rect_right_offset
    int crop_top;                      ///< frame_cropping_rect_top_offset
    int crop_bottom;                   ///< frame_cropping_rect_bottom_offset
    int vui_parameters_present_flag;
    AVRational sar;
    int timing_info_present_flag;
    uint32_t num_units_in_tick;
    uint32_t time_scale;
    int fixed_frame_rate_flag;
    short offset_for_ref_frame[256]; //FIXME dyn alloc?
    int bitstream_restriction_flag;
    int num_reorder_frames;
    int scaling_matrix_present;
    uint8_t scaling_matrix4[6][16];
    uint8_t scaling_matrix8[2][64];
}SPS;

/**
 * Picture parameter set
 */
typedef struct PPS{
    unsigned int sps_id;
    int cabac;                  ///< entropy_coding_mode_flag
    int pic_order_present;      ///< pic_order_present_flag
    int slice_group_count;      ///< num_slice_groups_minus1 + 1
    int mb_slice_group_map_type;
    unsigned int ref_count[2];  ///< num_ref_idx_l0/1_active_minus1 + 1
    int weighted_pred;          ///< weighted_pred_flag
    int weighted_bipred_idc;
    int init_qp;                ///< pic_init_qp_minus26 + 26
    int init_qs;                ///< pic_init_qs_minus26 + 26
    int chroma_qp_index_offset;
    int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
    int constrained_intra_pred; ///< constrained_intra_pred_flag
    int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
    int transform_8x8_mode;     ///< transform_8x8_mode_flag
    uint8_t scaling_matrix4[6][16];
    uint8_t scaling_matrix8[2][64];
}PPS;

/**
 * Memory management control operation opcode.
 */
typedef enum MMCOOpcode{
    MMCO_END=0,
    MMCO_SHORT2UNUSED,
    MMCO_LONG2UNUSED,
    MMCO_SHORT2LONG,
    MMCO_SET_MAX_LONG,
    MMCO_RESET,
    MMCO_LONG,
} MMCOOpcode;

/**
 * Memory management control operation.
 */
typedef struct MMCO{
    MMCOOpcode opcode;
    int short_frame_num;
    int long_index;
} MMCO;

/**
 * H264Context
 */
typedef struct H264Context{
    MpegEncContext s;
    int nal_ref_idc;
    int nal_unit_type;
    uint8_t *rbsp_buffer;
    unsigned int rbsp_buffer_size;

    /**
     * Used to parse AVC variant of h264
     */
    int is_avc;          ///< this flag is != 0 if codec is avc1
    int got_avcC;        ///< flag used to parse avcC data only once
    int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)

    int chroma_qp; //QPc

    int prev_mb_skipped;
    int next_mb_skipped;

    //prediction stuff
    int chroma_pred_mode;
    int intra16x16_pred_mode;

    int top_mb_xy;
    int left_mb_xy[2];

    int8_t intra4x4_pred_mode_cache[5*8];
    int8_t (*intra4x4_pred_mode)[8];
    void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride); //FIXME move to dsp?
    void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
    void (*pred8x8  [4+3])(uint8_t *src, int stride);
    void (*pred16x16[4+3])(uint8_t *src, int stride);
    unsigned int topleft_samples_available;
    unsigned int top_samples_available;
    unsigned int topright_samples_available;
    unsigned int left_samples_available;
    uint8_t (*top_borders[2])[16+2*8];
    uint8_t left_border[2*(17+2*9)];

    /**
     * non zero coeff count cache.
     * is 64 if not available.
     */
    DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
    uint8_t (*non_zero_count)[16];

    /**
     * Motion vector cache.
     */
    DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]);
    DECLARE_ALIGNED_8(int8_t, ref_cache[2][5*8]);
#define LIST_NOT_USED -1 //FIXME rename?
#define PART_NOT_AVAILABLE -2

    /**
     * is 1 if the specific list MV&references are set to 0,0,-2.
     */
    int mv_cache_clean[2];

    /**
     * number of neighbors (top and/or left) that used 8x8 dct
     */
    int neighbor_transform_size;

    /**
     * block_offset[ 0..23] for frame macroblocks
     * block_offset[24..47] for field macroblocks
     */
    int block_offset[2*(16+8)];

    uint32_t *mb2b_xy;  //FIXME are these 4 a good idea?
    uint32_t *mb2b8_xy;
    int b_stride;       //FIXME use s->b4_stride
    int b8_stride;

    int mb_linesize;    ///< may be equal to s->linesize or s->linesize*2, for mbaff
    int mb_uvlinesize;

    int emu_edge_width;
    int emu_edge_height;

    int halfpel_flag;
    int thirdpel_flag;

    int unknown_svq3_flag;
    int next_slice_index;

    SPS sps_buffer[MAX_SPS_COUNT];
    SPS sps; ///< current sps

    PPS pps_buffer[MAX_PPS_COUNT];
    /**
     * current pps
     */
    PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?

    uint32_t dequant4_buffer[6][52][16];
    uint32_t dequant8_buffer[2][52][64];
    uint32_t (*dequant4_coeff[6])[16];
    uint32_t (*dequant8_coeff[2])[64];
    int dequant_coeff_pps; ///< reinit tables when pps changes

    int slice_num;
    uint8_t *slice_table_base;
    uint8_t *slice_table; ///< slice_table_base + 2*mb_stride + 1
    int slice_type;
    int slice_type_fixed;

    //interlacing specific flags
    int mb_aff_frame;
    int mb_field_decoding_flag;
    int mb_mbaff; ///< mb_aff_frame && mb_field_decoding_flag

    unsigned int sub_mb_type[4];

    //POC stuff
    int poc_lsb;
    int poc_msb;
    int delta_poc_bottom;
    int delta_poc[2];
    int frame_num;
    int prev_poc_msb;          ///< poc_msb of the last reference pic for POC type 0
    int prev_poc_lsb;          ///< poc_lsb of the last reference pic for POC type 0
    int frame_num_offset;      ///< for POC type 2
    int prev_frame_num_offset; ///< for POC type 2
    int prev_frame_num;        ///< frame_num of the last pic for POC type 1/2

    /**
     * frame_num for frames or 2*frame_num for field pics.
     */
    int curr_pic_num;

    /**
     * max_frame_num or 2*max_frame_num for field pics.
     */
    int max_pic_num;

    //Weighted pred stuff
    int use_weight;
    int use_weight_chroma;
    int luma_log2_weight_denom;
    int chroma_log2_weight_denom;
    int luma_weight[2][48];
    int luma_offset[2][48];
    int chroma_weight[2][48][2];
    int chroma_offset[2][48][2];
    int implicit_weight[48][48];

    //deblock
    int deblocking_filter;     ///< disable_deblocking_filter_idc with 1<->0
    int slice_alpha_c0_offset;
    int slice_beta_offset;

    int redundant_pic_count;

    int direct_spatial_mv_pred;
    int dist_scale_factor[16];
    int dist_scale_factor_field[32];
    int map_col_to_list0[2][16];
    int map_col_to_list0_field[2][32];

    /**
     * num_ref_idx_l0/1_active_minus1 + 1
     */
    unsigned int ref_count[2];  ///< counts frames or fields, depending on current mb mode
    unsigned int list_count;
    Picture *short_ref[32];
    Picture *long_ref[32];
    Picture default_ref_list[2][32];
    Picture ref_list[2][48];    ///< 0..15: frame refs, 16..47: mbaff field refs
    Picture *delayed_pic[18];   //FIXME size?
    Picture *delayed_output_pic;

    /**
     * memory management control operations buffer.
     */
    MMCO mmco[MAX_MMCO_COUNT];
    int mmco_index;

    int long_ref_count;  ///< number of actual long term references
    int short_ref_count; ///< number of actual short term references

    //data partitioning
    GetBitContext intra_gb;
    GetBitContext inter_gb;
    GetBitContext *intra_gb_ptr;
    GetBitContext *inter_gb_ptr;

    DECLARE_ALIGNED_8(DCTELEM, mb[16*24]);
    DCTELEM mb_padding[256]; ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb

    /**
     * Cabac
     */
    CABACContext cabac;
    uint8_t cabac_state[460];
    int cabac_init_idc;

    /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
    uint16_t *cbp_table;
    int cbp;
    int top_cbp;
    int left_cbp;
    /* chroma_pred_mode for i4x4 or i16x16, else 0 */
    uint8_t *chroma_pred_mode_table;
    int last_qscale_diff;
    int16_t (*mvd_table[2])[2];
    DECLARE_ALIGNED_8(int16_t, mvd_cache[2][5*8][2]);
    uint8_t *direct_table;
    uint8_t direct_cache[5*8];

    uint8_t zigzag_scan[16];
    uint8_t zigzag_scan8x8[64];
    uint8_t zigzag_scan8x8_cavlc[64];
    uint8_t field_scan[16];
    uint8_t field_scan8x8[64];
    uint8_t field_scan8x8_cavlc[64];
    const uint8_t *zigzag_scan_q0;
    const uint8_t *zigzag_scan8x8_q0;
    const uint8_t *zigzag_scan8x8_cavlc_q0;
    const uint8_t *field_scan_q0;
    const uint8_t *field_scan8x8_q0;
    const uint8_t *field_scan8x8_cavlc_q0;

    int x264_build;
}H264Context;

static VLC coeff_token_vlc[4];
static VLC chroma_dc_coeff_token_vlc;

static VLC total_zeros_vlc[15];
static VLC chroma_dc_total_zeros_vlc[3];

static VLC run_vlc[6];
static VLC run7_vlc;

static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);

static av_always_inline uint32_t pack16to32(int a, int b){
#ifdef WORDS_BIGENDIAN
    return (b&0xFFFF) + (a<<16);
#else
    return (a&0xFFFF) + (b<<16);
#endif
}
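
/* Note: pack16to32() is endian-aware so that when the packed word is stored
 * with one 32-bit write, the two halves land in memory as two consecutive
 * int16_t values {a, b}. This is what lets the caches copy an (mv_x, mv_y)
 * pair with a single uint32_t assignment, e.g.
 *     *(uint32_t*)h->mv_cache[list][idx] = pack16to32(mx, my);
 * after which mv_cache[list][idx][0]==mx and [1]==my on either endianness. */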

const uint8_t ff_rem6[52]={
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
};

const uint8_t ff_div6[52]={
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
};
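
/* Lookup tables for qp%6 and qp/6 over the valid QP range 0..51, avoiding a
 * division per use. The H.264 dequant scale repeats its pattern every 6 QP
 * steps and doubles with each repetition, so when the dequant tables are
 * built the scale for a given qp is the base pattern for ff_rem6[qp] shifted
 * left by ff_div6[qp]. Example: qp=23 -> ff_div6[23]==3, ff_rem6[23]==5,
 * since 23 == 3*6 + 5. */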

/**
 * fill a rectangle.
 * @param h height of the rectangle, should be a constant
 * @param w width of the rectangle, should be a constant
 * @param size the size of val (1 or 4), should be a constant
 */
static av_always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
    uint8_t *p= (uint8_t*)vp;
    assert(size==1 || size==4);
    assert(w<=4);

    w      *= size;
    stride *= size;

    assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
    assert((stride&(w-1))==0);
    if(w==2){
        const uint16_t v= size==4 ? val : val*0x0101;
        *(uint16_t*)(p + 0*stride)= v;
        if(h==1) return;
        *(uint16_t*)(p + 1*stride)= v;
        if(h==2) return;
        *(uint16_t*)(p + 2*stride)=
        *(uint16_t*)(p + 3*stride)= v;
    }else if(w==4){
        const uint32_t v= size==4 ? val : val*0x01010101;
        *(uint32_t*)(p + 0*stride)= v;
        if(h==1) return;
        *(uint32_t*)(p + 1*stride)= v;
        if(h==2) return;
        *(uint32_t*)(p + 2*stride)=
        *(uint32_t*)(p + 3*stride)= v;
    }else if(w==8){
    //gcc can't optimize 64bit math on x86_32
#if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
        const uint64_t v= val*0x0100000001ULL;
        *(uint64_t*)(p + 0*stride)= v;
        if(h==1) return;
        *(uint64_t*)(p + 1*stride)= v;
        if(h==2) return;
        *(uint64_t*)(p + 2*stride)=
        *(uint64_t*)(p + 3*stride)= v;
    }else if(w==16){
        const uint64_t v= val*0x0100000001ULL;
        *(uint64_t*)(p + 0+0*stride)=
        *(uint64_t*)(p + 8+0*stride)=
        *(uint64_t*)(p + 0+1*stride)=
        *(uint64_t*)(p + 8+1*stride)= v;
        if(h==2) return;
        *(uint64_t*)(p + 0+2*stride)=
        *(uint64_t*)(p + 8+2*stride)=
        *(uint64_t*)(p + 0+3*stride)=
        *(uint64_t*)(p + 8+3*stride)= v;
#else
        *(uint32_t*)(p + 0+0*stride)=
        *(uint32_t*)(p + 4+0*stride)= val;
        if(h==1) return;
        *(uint32_t*)(p + 0+1*stride)=
        *(uint32_t*)(p + 4+1*stride)= val;
        if(h==2) return;
        *(uint32_t*)(p + 0+2*stride)=
        *(uint32_t*)(p + 4+2*stride)=
        *(uint32_t*)(p + 0+3*stride)=
        *(uint32_t*)(p + 4+3*stride)= val;
    }else if(w==16){
        *(uint32_t*)(p + 0+0*stride)=
        *(uint32_t*)(p + 4+0*stride)=
        *(uint32_t*)(p + 8+0*stride)=
        *(uint32_t*)(p +12+0*stride)=
        *(uint32_t*)(p + 0+1*stride)=
        *(uint32_t*)(p + 4+1*stride)=
        *(uint32_t*)(p + 8+1*stride)=
        *(uint32_t*)(p +12+1*stride)= val;
        if(h==2) return;
        *(uint32_t*)(p + 0+2*stride)=
        *(uint32_t*)(p + 4+2*stride)=
        *(uint32_t*)(p + 8+2*stride)=
        *(uint32_t*)(p +12+2*stride)=
        *(uint32_t*)(p + 0+3*stride)=
        *(uint32_t*)(p + 4+3*stride)=
        *(uint32_t*)(p + 8+3*stride)=
        *(uint32_t*)(p +12+3*stride)= val;
#endif
    }else
        assert(0);
    assert(h==4);
}
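
/* The per-MB caches below are filled through this helper; a typical call
 * writes one value over a 2x2 or 4x4 block of 8-entry-wide cache rows, e.g.
 *     fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref, 1);
 * sets the reference index of all 16 4x4 blocks of the current MB at once. */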

static void fill_caches(H264Context *h, int mb_type, int for_deblock){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    int topleft_xy, top_xy, topright_xy, left_xy[2];
    int topleft_type, top_type, topright_type, left_type[2];
    int left_block[8];
    int i;

    //FIXME deblocking could skip the intra and nnz parts.
    if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[mb_xy-s->mb_stride]) && !FRAME_MBAFF)
        return;

    //wow, what a mess. Why didn't they simplify the interlacing & intra stuff? I can't imagine that these complex rules are worth it.

    top_xy     = mb_xy  - s->mb_stride;
    topleft_xy = top_xy - 1;
    topright_xy= top_xy + 1;
    left_xy[1] = left_xy[0] = mb_xy-1;
    left_block[0]= 0;
    left_block[1]= 1;
    left_block[2]= 2;
    left_block[3]= 3;
    left_block[4]= 7;
    left_block[5]= 10;
    left_block[6]= 8;
    left_block[7]= 11;
    if(FRAME_MBAFF){
        const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
        const int top_pair_xy      = pair_xy     - s->mb_stride;
        const int topleft_pair_xy  = top_pair_xy - 1;
        const int topright_pair_xy = top_pair_xy + 1;
        const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
        const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
        const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
        const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
        const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
        const int bottom = (s->mb_y & 1);
        tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
        if (bottom
                ? !curr_mb_frame_flag // bottom macroblock
                : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
                ) {
            top_xy -= s->mb_stride;
        }
        if (bottom
                ? !curr_mb_frame_flag // bottom macroblock
                : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
                ) {
            topleft_xy -= s->mb_stride;
        }
        if (bottom
                ? !curr_mb_frame_flag // bottom macroblock
                : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
                ) {
            topright_xy -= s->mb_stride;
        }
        if (left_mb_frame_flag != curr_mb_frame_flag) {
            left_xy[1] = left_xy[0] = pair_xy - 1;
            if (curr_mb_frame_flag) {
                if (bottom) {
                    left_block[0]= 2;
                    left_block[1]= 2;
                    left_block[2]= 3;
                    left_block[3]= 3;
                    left_block[4]= 8;
                    left_block[5]= 11;
                    left_block[6]= 8;
                    left_block[7]= 11;
                } else {
                    left_block[0]= 0;
                    left_block[1]= 0;
                    left_block[2]= 1;
                    left_block[3]= 1;
                    left_block[4]= 7;
                    left_block[5]= 10;
                    left_block[6]= 7;
                    left_block[7]= 10;
                }
            } else {
                left_xy[1] += s->mb_stride;
                //left_block[0]= 0;
                left_block[1]= 2;
                left_block[2]= 0;
                left_block[3]= 2;
                //left_block[4]= 7;
                left_block[5]= 10;
                left_block[6]= 7;
                left_block[7]= 10;
            }
        }
    }

    h->top_mb_xy = top_xy;
    h->left_mb_xy[0] = left_xy[0];
    h->left_mb_xy[1] = left_xy[1];
    if(for_deblock){
        topleft_type = 0;
        topright_type = 0;
        top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
        left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
        left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;

        if(FRAME_MBAFF && !IS_INTRA(mb_type)){
            int list;
            int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
            for(i=0; i<16; i++)
                h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
            for(list=0; list<h->list_count; list++){
                if(USES_LIST(mb_type,list)){
                    uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
                    uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
                    int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
                    for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
                        dst[0] = src[0];
                        dst[1] = src[1];
                        dst[2] = src[2];
                        dst[3] = src[3];
                    }
                    *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
                    *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
                    ref += h->b8_stride;
                    *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
                    *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
                }else{
                    fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
                    fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
                }
            }
        }
    }else{
        topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
        top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
        topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
        left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
        left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
    }

    if(IS_INTRA(mb_type)){
        h->topleft_samples_available=
        h->top_samples_available=
        h->left_samples_available= 0xFFFF;
        h->topright_samples_available= 0xEEEA;

        if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
            h->topleft_samples_available= 0xB3FF;
            h->top_samples_available= 0x33FF;
            h->topright_samples_available= 0x26EA;
        }
        for(i=0; i<2; i++){
            if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
                h->topleft_samples_available&= 0xDF5F;
                h->left_samples_available&= 0x5F5F;
            }
        }

        if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
            h->topleft_samples_available&= 0x7FFF;

        if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
            h->topright_samples_available&= 0xFBFF;

        if(IS_INTRA4x4(mb_type)){
            if(IS_INTRA4x4(top_type)){
                h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
                h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
                h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
                h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
            }else{
                int pred;
                if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
                    pred= -1;
                else{
                    pred= 2;
                }
                h->intra4x4_pred_mode_cache[4+8*0]=
                h->intra4x4_pred_mode_cache[5+8*0]=
                h->intra4x4_pred_mode_cache[6+8*0]=
                h->intra4x4_pred_mode_cache[7+8*0]= pred;
            }
            for(i=0; i<2; i++){
                if(IS_INTRA4x4(left_type[i])){
                    h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
                    h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
                }else{
                    int pred;
                    if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
                        pred= -1;
                    else{
                        pred= 2;
                    }
                    h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
                    h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
                }
            }
        }
    }


/*
0 . T T. T T T T
1 L . .L . . . .
2 L . .L . . . .
3 . T TL . . . .
4 L . .L . . . .
5 L . .. . . . .
*/
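/* The diagram above shows which neighbor values (T: from the top MB, L: from
 * the left MB) the per-MB caches carry. Each cache row is 8 entries wide;
 * with the usual scan8[] layout the current MB's 16 luma 4x4 blocks sit in a
 * 4x4 square starting at scan8[0], so a block's top neighbor is at index-8
 * and its left neighbor at index-1, which is what pred_intra_mode() and
 * pred_non_zero_count() below rely on. */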
    //FIXME constrained_intra_pred & partitioning & nnz (let's hope this is just a typo in the spec)
    if(top_type){
        h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
        h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
        h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
        h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];

        h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
        h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];

        h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
        h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];

    }else{
        h->non_zero_count_cache[4+8*0]=
        h->non_zero_count_cache[5+8*0]=
        h->non_zero_count_cache[6+8*0]=
        h->non_zero_count_cache[7+8*0]=

        h->non_zero_count_cache[1+8*0]=
        h->non_zero_count_cache[2+8*0]=

        h->non_zero_count_cache[1+8*3]=
        h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;

    }

    for (i=0; i<2; i++) {
        if(left_type[i]){
            h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
            h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
            h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
            h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
        }else{
            h->non_zero_count_cache[3+8*1 + 2*8*i]=
            h->non_zero_count_cache[3+8*2 + 2*8*i]=
            h->non_zero_count_cache[0+8*1 +   8*i]=
            h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
        }
    }

    if( h->pps.cabac ) {
        // top_cbp
        if(top_type) {
            h->top_cbp = h->cbp_table[top_xy];
        } else if(IS_INTRA(mb_type)) {
            h->top_cbp = 0x1C0;
        } else {
            h->top_cbp = 0;
        }
        // left_cbp
        if (left_type[0]) {
            h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
        } else if(IS_INTRA(mb_type)) {
            h->left_cbp = 0x1C0;
        } else {
            h->left_cbp = 0;
        }
        if (left_type[0]) {
            h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
        }
        if (left_type[1]) {
            h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
        }
    }

#if 1
    if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
        int list;
        for(list=0; list<h->list_count; list++){
            if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
                /*if(!h->mv_cache_clean[list]){
                    memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
                    memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
                    h->mv_cache_clean[list]= 1;
                }*/
                continue;
            }
            h->mv_cache_clean[list]= 0;

            if(USES_LIST(top_type, list)){
                const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
                *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
                *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
                *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
                *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
                h->ref_cache[list][scan8[0] + 0 - 1*8]=
                h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
                h->ref_cache[list][scan8[0] + 2 - 1*8]=
                h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
            }else{
                *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
                *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
                *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
                *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
                *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
            }

            for(i=0; i<2; i++){
                int cache_idx = scan8[0] - 1 + i*2*8;
                if(USES_LIST(left_type[i], list)){
                    const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
                    const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
                    *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
                    *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
                    h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
                    h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
                }else{
                    *(uint32_t*)h->mv_cache [list][cache_idx  ]=
                    *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
                    h->ref_cache[list][cache_idx  ]=
                    h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
                }
            }

            if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
                continue;

            if(USES_LIST(topleft_type, list)){
                const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
                const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
                h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
            }else{
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
                h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
            }

            if(USES_LIST(topright_type, list)){
                const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
                const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
                *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
                h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
            }else{
                *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
                h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
            }

            if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
                continue;

            h->ref_cache[list][scan8[5 ]+1] =
            h->ref_cache[list][scan8[7 ]+1] =
            h->ref_cache[list][scan8[13]+1] = //FIXME remove past 3 (init somewhere else)
            h->ref_cache[list][scan8[4 ]] =
            h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
            *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
            *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
            *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
            *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
            *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;

            if( h->pps.cabac ) {
                /* XXX yuck: load mvd */
                if(USES_LIST(top_type, list)){
                    const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
                }else{
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
                }
                if(USES_LIST(left_type[0], list)){
                    const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
                }else{
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
                }
                if(USES_LIST(left_type[1], list)){
                    const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
                }else{
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
                }
                *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
                *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
                *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
                *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
                *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;

                if(h->slice_type == B_TYPE){
                    fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);

                    if(IS_DIRECT(top_type)){
                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
                    }else if(IS_8X8(top_type)){
                        int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
                        h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
                        h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
                    }else{
                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
                    }

                    if(IS_DIRECT(left_type[0]))
                        h->direct_cache[scan8[0] - 1 + 0*8]= 1;
                    else if(IS_8X8(left_type[0]))
                        h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
                    else
                        h->direct_cache[scan8[0] - 1 + 0*8]= 0;

                    if(IS_DIRECT(left_type[1]))
                        h->direct_cache[scan8[0] - 1 + 2*8]= 1;
                    else if(IS_8X8(left_type[1]))
                        h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
                    else
                        h->direct_cache[scan8[0] - 1 + 2*8]= 0;
                }
            }

            if(FRAME_MBAFF){
#define MAP_MVS\
                    MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
                    MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
                    MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
                    MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
                    MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
                    MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
                    MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
                    MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
                    MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
                    MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
                if(MB_FIELD){
#define MAP_F2F(idx, mb_type)\
                    if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
                        h->ref_cache[list][idx] <<= 1;\
                        h->mv_cache[list][idx][1] /= 2;\
                        h->mvd_cache[list][idx][1] /= 2;\
                    }
                    MAP_MVS
#undef MAP_F2F
                }else{
#define MAP_F2F(idx, mb_type)\
                    if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
                        h->ref_cache[list][idx] >>= 1;\
                        h->mv_cache[list][idx][1] <<= 1;\
                        h->mvd_cache[list][idx][1] <<= 1;\
                    }
                    MAP_MVS
#undef MAP_F2F
                }
            }
        }
    }
#endif

    h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
}

static inline void write_back_intra_pred_mode(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;

    h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
    h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
    h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
    h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
    h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
    h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
    h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
}

/**
 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 */
static inline int check_intra4x4_pred_mode(H264Context *h){
    MpegEncContext * const s = &h->s;
    static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
    static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
    int i;

    if(!(h->top_samples_available&0x8000)){
        for(i=0; i<4; i++){
            int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
            if(status<0){
                av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
                return -1;
            } else if(status){
                h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
            }
        }
    }

    if(!(h->left_samples_available&0x8000)){
        for(i=0; i<4; i++){
            int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
            if(status<0){
                av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
                return -1;
            } else if(status){
                h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
            }
        }
    }

    return 0;
} //FIXME cleanup like next

/**
 * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 */
static inline int check_intra_pred_mode(H264Context *h, int mode){
    MpegEncContext * const s = &h->s;
    static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
    static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};

    if(mode > 6U) {
        av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
        return -1;
    }

    if(!(h->top_samples_available&0x8000)){
        mode= top[ mode ];
        if(mode<0){
            av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
            return -1;
        }
    }

    if(!(h->left_samples_available&0x8000)){
        mode= left[ mode ];
        if(mode<0){
            av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
            return -1;
        }
    }

    return mode;
}
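
/* The remap tables above implement the fallbacks for unavailable neighbors:
 * e.g. for the chroma/16x16 modes, DC_PRED8x8 (mode 0) becomes
 * LEFT_DC_PRED8x8 when the top samples are missing and TOP_DC_PRED8x8 when
 * the left samples are missing; a -1 entry means the requested mode cannot
 * be used at all and is treated as a decoding error. */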

/**
 * gets the predicted intra4x4 prediction mode.
 */
static inline int pred_intra_mode(H264Context *h, int n){
    const int index8= scan8[n];
    const int left= h->intra4x4_pred_mode_cache[index8 - 1];
    const int top = h->intra4x4_pred_mode_cache[index8 - 8];
    const int min= FFMIN(left, top);

    tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);

    if(min<0) return DC_PRED;
    else      return min;
}

static inline void write_back_non_zero_count(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;

    h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
    h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
    h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
    h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
    h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
    h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
    h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];

    h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
    h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
    h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];

    h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
    h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
    h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];

    if(FRAME_MBAFF){
        // store all luma nnzs, for deblocking
        int v = 0, i;
        for(i=0; i<16; i++)
            v += (!!h->non_zero_count_cache[scan8[i]]) << i;
        *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
    }
}

/**
 * gets the predicted number of non zero coefficients.
 * @param n block index
 */
static inline int pred_non_zero_count(H264Context *h, int n){
    const int index8= scan8[n];
    const int left= h->non_zero_count_cache[index8 - 1];
    const int top = h->non_zero_count_cache[index8 - 8];
    int i= left + top;

    if(i<64) i= (i+1)>>1;

    tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);

    return i&31;
}
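
/* Unavailable neighbors are cached as 64, so left+top < 64 means both are
 * present and the prediction is their rounded average. If one neighbor is 64
 * the averaging is skipped and the final &31 strips the 64, returning the
 * other neighbor's count unchanged (e.g. left==64, top==t: (64+t)&31 == t
 * for t<=16); with both neighbors missing the result is 0. */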

static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
    const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
    MpegEncContext *s = &h->s;

    /* there is no consistent mapping of mvs to neighboring locations that will
     * make mbaff happy, so we can't move all this logic to fill_caches */
    if(FRAME_MBAFF){
        const uint32_t *mb_types = s->current_picture_ptr->mb_type;
        const int16_t *mv;
        *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
        *C = h->mv_cache[list][scan8[0]-2];

        if(!MB_FIELD
           && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
            int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
            if(IS_INTERLACED(mb_types[topright_xy])){
#define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
                const int x4 = X4, y4 = Y4;\
                const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
                if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
                    return LIST_NOT_USED;\
                mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
                h->mv_cache[list][scan8[0]-2][0] = mv[0];\
                h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
                return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;

                SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
            }
        }
        if(topright_ref == PART_NOT_AVAILABLE
           && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
           && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
            if(!MB_FIELD
               && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
                SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
            }
            if(MB_FIELD
               && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
               && i >= scan8[0]+8){
                // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
                SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
            }
        }
#undef SET_DIAG_MV
    }

    if(topright_ref != PART_NOT_AVAILABLE){
        *C= h->mv_cache[list][ i - 8 + part_width ];
        return topright_ref;
    }else{
        tprintf(s->avctx, "topright MV not available\n");

        *C= h->mv_cache[list][ i - 8 - 1 ];
        return h->ref_cache[list][ i - 8 - 1 ];
    }
}
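
/* The diagonal predictor C is the top-right neighbor of the partition when
 * it is available (cache index i-8+part_width); otherwise the top-left
 * neighbor (i-8-1) substitutes for it, matching the standard MV prediction
 * derivation. The MBAFF special cases above rescale the vertical MV
 * component and reference index when the neighbor's field/frame coding
 * differs from the current MB's. */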

/**
 * gets the predicted MV.
 * @param n the block index
 * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 * @param mx the x component of the predicted motion vector
 * @param my the y component of the predicted motion vector
 */
static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
    const int index8= scan8[n];
    const int top_ref=      h->ref_cache[list][ index8 - 8 ];
    const int left_ref=     h->ref_cache[list][ index8 - 1 ];
    const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
    const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
    const int16_t * C;
    int diagonal_ref, match_count;

    assert(part_width==1 || part_width==2 || part_width==4);

/* mv_cache
  B . . A T T T T
  U . . L . . , .
  U . . L . . . .
  U . . L . . , .
  . . . L . . . .
*/

    diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
    match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
    tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
    if(match_count > 1){ //most common
        *mx= mid_pred(A[0], B[0], C[0]);
        *my= mid_pred(A[1], B[1], C[1]);
    }else if(match_count==1){
        if(left_ref==ref){
            *mx= A[0];
            *my= A[1];
        }else if(top_ref==ref){
            *mx= B[0];
            *my= B[1];
        }else{
            *mx= C[0];
            *my= C[1];
        }
    }else{
        if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
            *mx= A[0];
            *my= A[1];
        }else{
            *mx= mid_pred(A[0], B[0], C[0]);
            *my= mid_pred(A[1], B[1], C[1]);
        }
    }

    tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
}
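
/* This is the standard H.264 median predictor: if exactly one of A (left),
 * B (top), C (diagonal) uses the same reference, its MV is copied; otherwise
 * each component is the median of the three, e.g. A=(2,0), B=(4,2), C=(0,6)
 * with equal refs gives (mx,my) = (2,2). The one exception is when only the
 * left neighbor is available, in which case A is used directly. */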

/**
 * gets the directionally predicted 16x8 MV.
 * @param n the block index
 * @param mx the x component of the predicted motion vector
 * @param my the y component of the predicted motion vector
 */
static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
    if(n==0){
        const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
        const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];

        tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);

        if(top_ref == ref){
            *mx= B[0];
            *my= B[1];
            return;
        }
    }else{
        const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
        const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];

        tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);

        if(left_ref == ref){
            *mx= A[0];
            *my= A[1];
            return;
        }
    }

    //RARE
    pred_motion(h, n, 4, list, ref, mx, my);
}

/**
 * gets the directionally predicted 8x16 MV.
 * @param n the block index
 * @param mx the x component of the predicted motion vector
 * @param my the y component of the predicted motion vector
 */
static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
    if(n==0){
        const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
        const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];

        tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);

        if(left_ref == ref){
            *mx= A[0];
            *my= A[1];
            return;
        }
    }else{
        const int16_t * C;
        int diagonal_ref;

        diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);

        tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);

        if(diagonal_ref == ref){
            *mx= C[0];
            *my= C[1];
            return;
        }
    }

    //RARE
    pred_motion(h, n, 2, list, ref, mx, my);
}

static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
    const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
    const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];

    tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);

    if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
       || (top_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
       || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){

        *mx = *my = 0;
        return;
    }

    pred_motion(h, 0, 4, 0, 0, mx, my);

    return;
}
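
/* P-Skip motion: a skipped MB reuses the normal 16x16 list-0 prediction with
 * ref 0, except that the MV is forced to (0,0) whenever a neighbor is
 * missing (picture edge) or the top/left neighbor itself has ref 0 with a
 * zero MV; the uint32_t compare checks both MV components at once. */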

static inline void direct_dist_scale_factor(H264Context * const h){
    const int poc = h->s.current_picture_ptr->poc;
    const int poc1 = h->ref_list[1][0].poc;
    int i;
    for(i=0; i<h->ref_count[0]; i++){
        int poc0 = h->ref_list[0][i].poc;
        int td = av_clip(poc1 - poc0, -128, 127);
        if(td == 0 /* FIXME || pic0 is a long-term ref */){
            h->dist_scale_factor[i] = 256;
        }else{
            int tb = av_clip(poc - poc0, -128, 127);
            int tx = (16384 + (FFABS(td) >> 1)) / td;
            h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
        }
    }
    if(FRAME_MBAFF){
        for(i=0; i<h->ref_count[0]; i++){
            h->dist_scale_factor_field[2*i] =
            h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
        }
    }
}
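
/* The factor approximates 256*tb/td in fixed point, where tb is the POC
 * distance from the current picture to ref0 and td the distance from ref1
 * to ref0: tx ~= 16384/td, and (tb*tx + 32)>>6 rescales to 8 fractional
 * bits. Example: tb=2, td=4 gives tx = 16386/4 = 4096 and a factor of
 * (2*4096+32)>>6 = 128, i.e. 0.5 in Q8, which temporal direct mode below
 * applies as mv = (scale*mv_col + 128)>>8. */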
1341 static inline void direct_ref_list_init(H264Context * const h){
1342 MpegEncContext * const s = &h->s;
1343 Picture * const ref1 = &h->ref_list[1][0];
1344 Picture * const cur = s->current_picture_ptr;
1345 int list, i, j;
1346 if(cur->pict_type == I_TYPE)
1347 cur->ref_count[0] = 0;
1348 if(cur->pict_type != B_TYPE)
1349 cur->ref_count[1] = 0;
1350 for(list=0; list<2; list++){
1351 cur->ref_count[list] = h->ref_count[list];
1352 for(j=0; j<h->ref_count[list]; j++)
1353 cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1354 }
1355 if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1356 return;
1357 for(list=0; list<2; list++){
1358 for(i=0; i<ref1->ref_count[list]; i++){
1359 const int poc = ref1->ref_poc[list][i];
1360 h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1361 for(j=0; j<h->ref_count[list]; j++)
1362 if(h->ref_list[list][j].poc == poc){
1363 h->map_col_to_list0[list][i] = j;
1364 break;
1365 }
1366 }
1367 }
1368 if(FRAME_MBAFF){
1369 for(list=0; list<2; list++){
1370 for(i=0; i<ref1->ref_count[list]; i++){
1371 j = h->map_col_to_list0[list][i];
1372 h->map_col_to_list0_field[list][2*i] = 2*j;
1373 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
1374 }
1375 }
1376 }
1377 }
1378
1379 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1380 MpegEncContext * const s = &h->s;
1381 const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
1382 const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1383 const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1384 const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1385 const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1386 const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1387 const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1388 const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1389 const int is_b8x8 = IS_8X8(*mb_type);
1390 unsigned int sub_mb_type;
1391 int i8, i4;
1392
1393 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1394 if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1395 /* FIXME save sub mb types from previous frames (or derive from MVs)
1396 * so we know exactly what block size to use */
1397 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1398 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1;
1399 }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
1400 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1401 *mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1402 }else{
1403 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1404 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1;
1405 }
1406 if(!is_b8x8)
1407 *mb_type |= MB_TYPE_DIRECT2;
1408 if(MB_FIELD)
1409 *mb_type |= MB_TYPE_INTERLACED;
1410
1411 tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1412
1413 if(h->direct_spatial_mv_pred){
1414 int ref[2];
1415 int mv[2][2];
1416 int list;
1417
1418 /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1419
1420 /* ref = min(neighbors) */
1421 for(list=0; list<2; list++){
1422 int refa = h->ref_cache[list][scan8[0] - 1];
1423 int refb = h->ref_cache[list][scan8[0] - 8];
1424 int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1425 if(refc == -2)
1426 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1427 ref[list] = refa;
1428 if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1429 ref[list] = refb;
1430 if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1431 ref[list] = refc;
1432 if(ref[list] < 0)
1433 ref[list] = -1;
1434 }
1435
1436 if(ref[0] < 0 && ref[1] < 0){
1437 ref[0] = ref[1] = 0;
1438 mv[0][0] = mv[0][1] =
1439 mv[1][0] = mv[1][1] = 0;
1440 }else{
1441 for(list=0; list<2; list++){
1442 if(ref[list] >= 0)
1443 pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1444 else
1445 mv[list][0] = mv[list][1] = 0;
1446 }
1447 }
1448
1449 if(ref[1] < 0){
1450 *mb_type &= ~MB_TYPE_P0L1;
1451 sub_mb_type &= ~MB_TYPE_P0L1;
1452 }else if(ref[0] < 0){
1453 *mb_type &= ~MB_TYPE_P0L0;
1454 sub_mb_type &= ~MB_TYPE_P0L0;
1455 }
1456
1457 if(IS_16X16(*mb_type)){
1458 int a=0, b=0;
1459
1460 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1461 fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1462 if(!IS_INTRA(mb_type_col)
1463 && ( (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1464 || (l1ref0[0] < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1465 && (h->x264_build>33 || !h->x264_build)))){
1466 if(ref[0] > 0)
1467 a= pack16to32(mv[0][0],mv[0][1]);
1468 if(ref[1] > 0)
1469 b= pack16to32(mv[1][0],mv[1][1]);
1470 }else{
1471 a= pack16to32(mv[0][0],mv[0][1]);
1472 b= pack16to32(mv[1][0],mv[1][1]);
1473 }
1474 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1475 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1476 }else{
1477 for(i8=0; i8<4; i8++){
1478 const int x8 = i8&1;
1479 const int y8 = i8>>1;
1480
1481 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1482 continue;
1483 h->sub_mb_type[i8] = sub_mb_type;
1484
1485 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1486 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1487 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1488 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1489
1490 /* col_zero_flag */
1491 if(!IS_INTRA(mb_type_col) && ( l1ref0[x8 + y8*h->b8_stride] == 0
1492 || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1493 && (h->x264_build>33 || !h->x264_build)))){
1494 const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1495 if(IS_SUB_8X8(sub_mb_type)){
1496 const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1497 if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1498 if(ref[0] == 0)
1499 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1500 if(ref[1] == 0)
1501 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1502 }
1503 }else
1504 for(i4=0; i4<4; i4++){
1505 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1506 if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1507 if(ref[0] == 0)
1508 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1509 if(ref[1] == 0)
1510 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1511 }
1512 }
1513 }
1514 }
1515 }
1516 }else{ /* direct temporal mv pred */
1517 const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1518 const int *dist_scale_factor = h->dist_scale_factor;
1519
1520 if(FRAME_MBAFF){
1521 if(IS_INTERLACED(*mb_type)){
1522 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1523 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1524 dist_scale_factor = h->dist_scale_factor_field;
1525 }
1526 if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1527 /* FIXME assumes direct_8x8_inference == 1 */
1528 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1529 int mb_types_col[2];
1530 int y_shift;
1531
1532 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1533 | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1534 | (*mb_type & MB_TYPE_INTERLACED);
1535 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1536
1537 if(IS_INTERLACED(*mb_type)){
1538 /* frame to field scaling */
1539 mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1540 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1541 if(s->mb_y&1){
1542 l1ref0 -= 2*h->b8_stride;
1543 l1ref1 -= 2*h->b8_stride;
1544 l1mv0 -= 4*h->b_stride;
1545 l1mv1 -= 4*h->b_stride;
1546 }
1547 y_shift = 0;
1548
1549 if( (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1550 && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1551 && !is_b8x8)
1552 *mb_type |= MB_TYPE_16x8;
1553 else
1554 *mb_type |= MB_TYPE_8x8;
1555 }else{
1556 /* field to frame scaling */
1557 /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1558 * but in MBAFF, top and bottom POC are equal */
1559 int dy = (s->mb_y&1) ? 1 : 2;
1560 mb_types_col[0] =
1561 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1562 l1ref0 += dy*h->b8_stride;
1563 l1ref1 += dy*h->b8_stride;
1564 l1mv0 += 2*dy*h->b_stride;
1565 l1mv1 += 2*dy*h->b_stride;
1566 y_shift = 2;
1567
1568 if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1569 && !is_b8x8)
1570 *mb_type |= MB_TYPE_16x16;
1571 else
1572 *mb_type |= MB_TYPE_8x8;
1573 }
1574
1575 for(i8=0; i8<4; i8++){
1576 const int x8 = i8&1;
1577 const int y8 = i8>>1;
1578 int ref0, scale;
1579 const int16_t (*l1mv)[2]= l1mv0;
1580
1581 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1582 continue;
1583 h->sub_mb_type[i8] = sub_mb_type;
1584
1585 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1586 if(IS_INTRA(mb_types_col[y8])){
1587 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1588 fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1589 fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1590 continue;
1591 }
1592
1593 ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1594 if(ref0 >= 0)
1595 ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1596 else{
1597 ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1598 l1mv= l1mv1;
1599 }
1600 scale = dist_scale_factor[ref0];
1601 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1602
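/* Temporal direct scaling: with scale in 8.8 fixed point,
 * mv_L0 = (scale*mv_col + 128) >> 8 rounds to nearest, and
 * mv_L1 = mv_L0 - mv_col follows from the POC geometry. */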
1603 {
1604 const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
1605 int my_col = (mv_col[1]<<y_shift)/2;
1606 int mx = (scale * mv_col[0] + 128) >> 8;
1607 int my = (scale * my_col + 128) >> 8;
1608 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1609 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1610 }
1611 }
1612 return;
1613 }
1614 }
1615
1616 /* one-to-one mv scaling (current and co-located MB share the same frame/field geometry) */
1617
1618 if(IS_16X16(*mb_type)){
1619 int ref, mv0, mv1;
1620
1621 fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1622 if(IS_INTRA(mb_type_col)){
1623 ref=mv0=mv1=0;
1624 }else{
1625 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1626 : map_col_to_list0[1][l1ref1[0]];
1627 const int scale = dist_scale_factor[ref0];
1628 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1629 int mv_l0[2];
1630 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1631 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1632 ref= ref0;
1633 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1634 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1635 }
1636 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1637 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1638 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1639 }else{
1640 for(i8=0; i8<4; i8++){
1641 const int x8 = i8&1;
1642 const int y8 = i8>>1;
1643 int ref0, scale;
1644 const int16_t (*l1mv)[2]= l1mv0;
1645
1646 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1647 continue;
1648 h->sub_mb_type[i8] = sub_mb_type;
1649 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1650 if(IS_INTRA(mb_type_col)){
1651 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1652 fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1653 fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1654 continue;
1655 }
1656
1657 ref0 = l1ref0[x8 + y8*h->b8_stride];
1658 if(ref0 >= 0)
1659 ref0 = map_col_to_list0[0][ref0];
1660 else{
1661 ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1662 l1mv= l1mv1;
1663 }
1664 scale = dist_scale_factor[ref0];
1665
1666 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1667 if(IS_SUB_8X8(sub_mb_type)){
1668 const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1669 int mx = (scale * mv_col[0] + 128) >> 8;
1670 int my = (scale * mv_col[1] + 128) >> 8;
1671 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1672 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1673 }else
1674 for(i4=0; i4<4; i4++){
1675 const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1676 int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1677 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1678 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1679 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1680 pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1681 }
1682 }
1683 }
1684 }
1685 }
1686
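/* Copies the motion info of the current MB from the 8-wide caches back into
 * the frame-sized tables; each uint64_t store moves two int16_t[2] motion
 * vectors at once. */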
1687 static inline void write_back_motion(H264Context *h, int mb_type){
1688 MpegEncContext * const s = &h->s;
1689 const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1690 const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1691 int list;
1692
1693 if(!USES_LIST(mb_type, 0))
1694 fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1695
1696 for(list=0; list<h->list_count; list++){
1697 int y;
1698 if(!USES_LIST(mb_type, list))
1699 continue;
1700
1701 for(y=0; y<4; y++){
1702 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1703 *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1704 }
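/* mvd values are stored only for CABAC, where the motion vector differences
 * of neighbouring blocks are needed for context selection. */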
1705 if( h->pps.cabac ) {
1706 if(IS_SKIP(mb_type))
1707 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1708 else
1709 for(y=0; y<4; y++){
1710 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1711 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1712 }
1713 }
1714
1715 {
1716 int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1717 ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1718 ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1719 ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1720 ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1721 }
1722 }
1723
1724 if(h->slice_type == B_TYPE && h->pps.cabac){
1725 if(IS_8X8(mb_type)){
1726 uint8_t *direct_table = &h->direct_table[b8_xy];
1727 direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1728 direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1729 direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1730 }
1731 }
1732 }
1733
1734 /**
1735 * Decodes a network abstraction layer unit.
1736 * @param consumed number of input bytes consumed, including the one-byte NAL header
1737 * @param length length of the input buffer, in bytes
1738 * @param dst_length number of decoded bytes (FIXME: count it here, or when decoding the RBSP trailing bits?)
1739 * @returns pointer to the decoded bytes; may be src+1 if there were no escape sequences
1740 */
1741 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1742 int i, si, di;
1743 uint8_t *dst;
1744
1745 // src[0]&0x80; //forbidden bit
1746 h->nal_ref_idc= src[0]>>5;
1747 h->nal_unit_type= src[0]&0x1F;
1748
1749 src++; length--;
1750 #if 0
1751 for(i=0; i<length; i++)
1752 printf("%2X ", src[i]);
1753 #endif
1754 for(i=0; i+1<length; i+=2){
1755 if(src[i]) continue;
1756 if(i>0 && src[i-1]==0) i--;
1757 if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1758 if(src[i+2]!=3){
1759 /* startcode, so we must be past the end */
1760 length=i;
1761 }
1762 break;
1763 }
1764 }
1765
1766 if(i>=length-1){ //no escaped 0
1767 *dst_length= length;
1768 *consumed= length+1; //+1 for the header
1769 return src;
1770 }
1771
1772 h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
1773 dst= h->rbsp_buffer;
1774
1775 if (dst == NULL){
1776 return NULL;
1777 }
1778
1779 //printf("decoding esc\n");
1780 si=di=0;
1781 while(si<length){
1782 //remove emulation prevention bytes (escapes are very rare, about 1:2^22)
1783 if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1784 if(src[si+2]==3){ //escape
1785 dst[di++]= 0;
1786 dst[di++]= 0;
1787 si+=3;
1788 continue;
1789 }else //next start code
1790 break;
1791 }
1792
1793 dst[di++]= src[si++];
1794 }
1795
1796 *dst_length= di;
1797 *consumed= si + 1;//+1 for the header
1798 //FIXME store exact number of bits in the getbitcontext (it's needed for decoding)
1799 return dst;
1800 }
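/* Worked example of the unescaping above: the escaped sequence 00 00 03 01
 * decodes to 00 00 01, while a bare 00 00 0x (x <= 2) is treated as the
 * start of the next NAL unit rather than as payload. */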
1801
1802 /**
1803 * Identifies the exact end of the bitstream.
1804 * @return the number of trailing bits, counting the rbsp_stop_one_bit (e.g. 8 for a final byte of 0x80), or 0 if damaged
1805 */
1806 static int decode_rbsp_trailing(H264Context *h, uint8_t *src){
1807 int v= *src;
1808 int r;
1809
1810 tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1811
1812 for(r=1; r<9; r++){
1813 if(v&1) return r;
1814 v>>=1;
1815 }
1816 return 0;
1817 }
1818
1819 /**
1820 * IDCT transforms the 16 DC values and dequantizes them.
1821 * @param qp quantization parameter
1822 */
1823 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1824 #define stride 16
1825 int i;
1826 int temp[16]; //FIXME check if this is a good idea
1827 static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
1828 static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1829
1830 //memset(block, 64, 2*256);
1831 //return;
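/* Two passes of 4-point butterflies implement the inverse Hadamard transform
 * over the 16 luma DC coefficients; x_offset/y_offset give their scattered
 * positions in the coefficient array, and the final >>8 applies the dequant
 * scale qmul with rounding. */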
1832 for(i=0; i<4; i++){
1833 const int offset= y_offset[i];
1834 const int z0= block[offset+stride*0] + block[offset+stride*4];
1835 const int z1= block[offset+stride*0] - block[offset+stride*4];
1836 const int z2= block[offset+stride*1] - block[offset+stride*5];
1837 const int z3= block[offset+stride*1] + block[offset+stride*5];
1838
1839 temp[4*i+0]= z0+z3;
1840 temp[4*i+1]= z1+z2;
1841 temp[4*i+2]= z1-z2;
1842 temp[4*i+3]= z0-z3;
1843 }
1844
1845 for(i=0; i<4; i++){
1846 const int offset= x_offset[i];
1847 const int z0= temp[4*0+i] + temp[4*2+i];
1848 const int z1= temp[4*0+i] - temp[4*2+i];
1849 const int z2= temp[4*1+i] - temp[4*3+i];
1850 const int z3= temp[4*1+i] + temp[4*3+i];
1851
1852 block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1853 block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1854 block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1855 block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1856 }
1857 }
1858
1859 #if 0
1860 /**
1861 * DCT transforms the 16 DC values.
1862 * @param qp quantization parameter ??? FIXME
1863 */
1864 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1865 // const int qmul= dequant_coeff[qp][0];
1866 int i;
1867 int temp[16]; //FIXME check if this is a good idea
1868 static const int x_offset[4]={0, 1*stride, 4* stride, 5*stride};
1869 static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1870
1871 for(i=0; i<4; i++){
1872 const int offset= y_offset[i];
1873 const int z0= block[offset+stride*0] + block[offset+stride*4];
1874 const int z1= block[offset+stride*0] - block[offset+stride*4];
1875 const int z2= block[offset+stride*1] - block[offset+stride*5];
1876 const int z3= block[offset+stride*1] + block[offset+stride*5];
1877
1878 temp[4*i+0]= z0+z3;
1879 temp[4*i+1]= z1+z2;
1880 temp[4*i+2]= z1-z2;
1881 temp[4*i+3]= z0-z3;
1882 }
1883
1884 for(i=0; i<4; i++){
1885 const int offset= x_offset[i];
1886 const int z0= temp[4*0+i] + temp[4*2+i];
1887 const int z1= temp[4*0+i] - temp[4*2+i];
1888 const int z2= temp[4*1+i] - temp[4*3+i];
1889 const int z3= temp[4*1+i] + temp[4*3+i];
1890
1891 block[stride*0 +offset]= (z0 + z3)>>1;
1892 block[stride*2 +offset]= (z1 + z2)>>1;
1893 block[stride*8 +offset]= (z1 - z2)>>1;
1894 block[stride*10+offset]= (z0 - z3)>>1;
1895 }
1896 }
1897 #endif
1898
1899 #undef xStride
1900 #undef stride
1901
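/* 2x2 inverse Hadamard on the four chroma DC coefficients, written out as
 * explicit sums and differences, followed by dequantization with qmul. */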
1902 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1903 const int stride= 16*2;
1904 const int xStride= 16;
1905 int a,b,c,d,e;
1906
1907 a= block[stride*0 + xStride*0];
1908 b= block[stride*0 + xStride*1];
1909 c= block[stride*1 + xStride*0];
1910 d= block[stride*1 + xStride*1];
1911
1912 e= a-b;
1913 a= a+b;
1914 b= c-d;
1915 c= c+d;
1916
1917 block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1918 block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1919 block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1920 block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1921 }
1922
1923 #if 0
1924 static void chroma_dc_dct_c(DCTELEM *block){
1925 const int stride= 16*2;
1926 const int xStride= 16;
1927 int a,b,c,d,e;
1928
1929 a= block[stride*0 + xStride*0];
1930 b= block[stride*0 + xStride*1];
1931 c= block[stride*1 + xStride*0];
1932 d= block[stride*1 + xStride*1];
1933
1934 e= a-b;
1935 a= a+b;
1936 b= c-d;
1937 c= c+d;
1938
1939 block[stride*0 + xStride*0]= (a+c);
1940 block[stride*0 + xStride*1]= (e+b);
1941 block[stride*1 + xStride*0]= (a-c);
1942 block[stride*1 + xStride*1]= (e-b);
1943 }
1944 #endif
1945
1946 /**
1947 * Gets the chroma QP for a given luma qscale and chroma_qp_index_offset.
1948 */
1949 static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
1950
1951 return chroma_qp[av_clip(qscale + chroma_qp_index_offset, 0, 51)];
1952 }
1953
1954 //FIXME need to check that this doesn't overflow signed 32 bit for low qp; I am not sure, it's very close
1955 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
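/* Dead-zone quantizer: the bias is 1/3 (intra) or 1/6 (inter) of a
 * quantization step, and the unsigned comparison
 * (level+threshold1) > threshold2 tests |level| > threshold1 in one branch. */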
1956 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1957 int i;
1958 const int * const quant_table= quant_coeff[qscale];
1959 const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1960 const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1961 const unsigned int threshold2= (threshold1<<1);
1962 int last_non_zero;
1963
1964 if(separate_dc){
1965 if(qscale<=18){
1966 //avoid overflows
1967 const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1968 const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1969 const unsigned int dc_threshold2= (dc_threshold1<<1);
1970
1971 int level= block[0]*quant_coeff[qscale+18][0];
1972 if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1973 if(level>0){
1974 level= (dc_bias + level)>>(QUANT_SHIFT-2);
1975 block[0]= level;
1976 }else{
1977 level= (dc_bias - level)>>(QUANT_SHIFT-2);
1978 block[0]= -level;
1979 }
1980 // last_non_zero = i;
1981 }else{
1982 block[0]=0;
1983 }
1984 }else{
1985 const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1986 const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1987 const unsigned int dc_threshold2= (dc_threshold1<<1);
1988
1989 int level= block[0]*quant_table[0];
1990 if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1991 if(level>0){
1992 level= (dc_bias + level)>>(QUANT_SHIFT+1);
1993 block[0]= level;
1994 }else{
1995 level= (dc_bias - level)>>(QUANT_SHIFT+1);
1996 block[0]= -level;
1997 }
1998 // last_non_zero = i;
1999 }else{
2000 block[0]=0;
2001 }
2002 }
2003 last_non_zero= 0;
2004 i=1;
2005 }else{
2006 last_non_zero= -1;
2007 i=0;
2008 }
2009
2010 for(; i<16; i++){
2011 const int j= scantable[i];
2012 int level= block[j]*quant_table[j];
2013
2014 // if( bias+level >= (1<<(QMAT_SHIFT - 3))
2015 // || bias-level >= (1<<(QMAT_SHIFT - 3))){
2016 if(((unsigned)(level+threshold1))>threshold2){
2017 if(level>0){
2018 level= (bias + level)>>QUANT_SHIFT;
2019 block[j]= level;
2020 }else{
2021 level= (bias - level)>>QUANT_SHIFT;
2022 block[j]= -level;
2023 }
2024 last_non_zero = i;
2025 }else{
2026 block[j]=0;
2027 }
2028 }
2029
2030 return last_non_zero;
2031 }
2032
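/* 4x4 intra predictors: src points at the top-left sample of the block, so
 * src[-stride] addresses the row above and src[-1] the column to the left;
 * multiplying a byte by 0x01010101 splats it across a 4-pixel word. */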
2033 static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
2034 const uint32_t a= ((uint32_t*)(src-stride))[0];
2035 ((uint32_t*)(src+0*stride))[0]= a;
2036 ((uint32_t*)(src+1*stride))[0]= a;
2037 ((uint32_t*)(src+2*stride))[0]= a;
2038 ((uint32_t*)(src+3*stride))[0]= a;
2039 }
2040
2041 static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
2042 ((uint32_t*)(src+0*stride))[0]= src[-1+0*stride]*0x01010101;
2043 ((uint32_t*)(src+1*stride))[0]= src[-1+1*stride]*0x01010101;
2044 ((uint32_t*)(src+2*stride))[0]= src[-1+2*stride]*0x01010101;
2045 ((uint32_t*)(src+3*stride))[0]= src[-1+3*stride]*0x01010101;
2046 }
2047
2048 static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
2049 const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
2050 + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
2051
2052 ((uint32_t*)(src+0*stride))[0]=
2053 ((uint32_t*)(src+1*stride))[0]=
2054 ((uint32_t*)(src+2*stride))[0]=
2055 ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
2056 }
2057
2058 static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
2059 const int dc= ( src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
2060
2061 ((uint32_t*)(src+0*stride))[0]=
2062 ((uint32_t*)(src+1*stride))[0]=
2063 ((uint32_t*)(src+2*stride))[0]=
2064 ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
2065 }
2066
2067 static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
2068 const int dc= ( src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
2069
2070 ((uint32_t*)(src+0*stride))[0]=
2071 ((uint32_t*)(src+1*stride))[0]=
2072 ((uint32_t*)(src+2*stride))[0]=
2073 ((uint32_t*)(src+3*stride))[0]= dc* 0x01010101;
2074 }
2075
2076 static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
2077 ((uint32_t*)(src+0*stride))[0]=
2078 ((uint32_t*)(src+1*stride))[0]=
2079 ((uint32_t*)(src+2*stride))[0]=
2080 ((uint32_t*)(src+3*stride))[0]= 128U*0x01010101U;
2081 }
2082
2083
2084 #define LOAD_TOP_RIGHT_EDGE\
2085 const int t4= topright[0];\
2086 const int t5= topright[1];\
2087 const int t6= topright[2];\
2088 const int t7= topright[3];\
2089
2090 #define LOAD_LEFT_EDGE\
2091 const int l0= src[-1+0*stride];\
2092 const int l1= src[-1+1*stride];\
2093 const int l2= src[-1+2*stride];\
2094 const int l3= src[-1+3*stride];\
2095
2096 #define LOAD_TOP_EDGE\
2097 const int t0= src[ 0-1*stride];\
2098 const int t1= src[ 1-1*stride];\
2099 const int t2= src[ 2-1*stride];\
2100 const int t3= src[ 3-1*stride];\
2101
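/* The directional modes below interpolate along their prediction angle with
 * the 2-tap (a+b+1)>>1 and 3-tap (a+2*b+c+2)>>2 filters applied to the edge
 * samples loaded above. */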
2102 static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
2103 const int lt= src[-1-1*stride];
2104 LOAD_TOP_EDGE
2105 LOAD_LEFT_EDGE
2106
2107 src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
2108 src[0+2*stride]=
2109 src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
2110 src[0+1*stride]=
2111 src[1+2*stride]=
2112 src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
2113 src[0+0*stride]=
2114 src[1+1*stride]=
2115 src[2+2*stride]=
2116 src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
2117 src[1+0*stride]=
2118 src[2+1*stride]=
2119 src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
2120 src[2+0*stride]=
2121 src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2122 src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2123 }
2124
2125 static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
2126 LOAD_TOP_EDGE
2127 LOAD_TOP_RIGHT_EDGE
2128 // LOAD_LEFT_EDGE
2129
2130 src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
2131 src[1+0*stride]=
2132 src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
2133 src[2+0*stride]=
2134 src[1+1*stride]=
2135 src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
2136 src[3+0*stride]=
2137 src[2+1*stride]=
2138 src[1+2*stride]=
2139 src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
2140 src[3+1*stride]=
2141 src[2+2*stride]=
2142 src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
2143 src[3+2*stride]=
2144 src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
2145 src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
2146 }
2147
2148 static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
2149 const int lt= src[-1-1*stride];
2150 LOAD_TOP_EDGE
2151 LOAD_LEFT_EDGE
2152 const __attribute__((unused)) int unu= l3;
2153
2154 src[0+0*stride]=
2155 src[1+2*stride]=(lt + t0 + 1)>>1;
2156 src[1+0*stride]=
2157 src[2+2*stride]=(t0 + t1 + 1)>>1;
2158 src[2+0*stride]=
2159 src[3+2*stride]=(t1 + t2 + 1)>>1;
2160 src[3+0*stride]=(t2 + t3 + 1)>>1;
2161 src[0+1*stride]=
2162 src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
2163 src[1+1*stride]=
2164 src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
2165 src[2+1*stride]=
2166 src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2167 src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2168 src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2169 src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2170 }
2171
2172 static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
2173 LOAD_TOP_EDGE
2174 LOAD_TOP_RIGHT_EDGE
2175 const __attribute__((unused)) int unu= t7;
2176
2177 src[0+0*stride]=(t0 + t1 + 1)>>1;
2178 src[1+0*stride]=
2179 src[0+2*stride]=(t1 + t2 + 1)>>1;
2180 src[2+0*stride]=
2181 src[1+2*stride]=(t2 + t3 + 1)>>1;
2182 src[3+0*stride]=
2183 src[2+2*stride]=(t3 + t4+ 1)>>1;
2184 src[3+2*stride]=(t4 + t5+ 1)>>1;
2185 src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2186 src[1+1*stride]=
2187 src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2188 src[2+1*stride]=
2189 src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
2190 src[3+1*stride]=
2191 src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
2192 src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
2193 }
2194
2195 static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
2196 LOAD_LEFT_EDGE
2197
2198 src[0+0*stride]=(l0 + l1 + 1)>>1;
2199 src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2200 src[2+0*stride]=
2201 src[0+1*stride]=(l1 + l2 + 1)>>1;
2202 src[3+0*stride]=
2203 src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
2204 src[2+1*stride]=
2205 src[0+2*stride]=(l2 + l3 + 1)>>1;
2206 src[3+1*stride]=
2207 src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
2208 src[3+2*stride]=
2209 src[1+3*stride]=
2210 src[0+3*stride]=
2211 src[2+2*stride]=
2212 src[2+3*stride]=
2213 src[3+3*stride]=l3;
2214 }
2215
2216 static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
2217 const int lt= src[-1-1*stride];
2218 LOAD_TOP_EDGE
2219 LOAD_LEFT_EDGE
2220 const __attribute__((unused)) int unu= t3;
2221
2222 src[0+0*stride]=
2223 src[2+1*stride]=(lt + l0 + 1)>>1;
2224 src[1+0*stride]=
2225 src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
2226 src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
2227 src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2228 src[0+1*stride]=
2229 src[2+2*stride]=(l0 + l1 + 1)>>1;
2230 src[1+1*stride]=
2231 src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2232 src[0+2*stride]=
2233 src[2+3*stride]=(l1 + l2+ 1)>>1;
2234 src[1+2*stride]=
2235 src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2236 src[0+3*stride]=(l2 + l3 + 1)>>1;
2237 src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
2238 }
2239
2240 void ff_pred16x16_vertical_c(uint8_t *src, int stride){
2241 int i;
2242 const uint32_t a= ((uint32_t*)(src-stride))[0];
2243 const uint32_t b= ((uint32_t*)(src-stride))[1];
2244 const uint32_t c= ((uint32_t*)(src-stride))[2];
2245 const uint32_t d= ((uint32_t*)(src-stride))[3];
2246
2247 for(i=0; i<16; i++){
2248 ((uint32_t*)(src+i*stride))[0]= a;
2249 ((uint32_t*)(src+i*stride))[1]= b;
2250 ((uint32_t*)(src+i*stride))[2]= c;
2251 ((uint32_t*)(src+i*stride))[3]= d;
2252 }
2253 }
2254
2255 void ff_pred16x16_horizontal_c(uint8_t *src, int stride){
2256 int i;
2257
2258 for(i=0; i<16; i++){
2259 ((uint32_t*)(src+i*stride))[0]=
2260 ((uint32_t*)(src+i*stride))[1]=
2261 ((uint32_t*)(src+i*stride))[2]=
2262 ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101;
2263 }
2264 }
2265
2266 void ff_pred16x16_dc_c(uint8_t *src, int stride){
2267 int i, dc=0;
2268
2269 for(i=0;i<16; i++){
2270 dc+= src[-1+i*stride];
2271 }
2272
2273 for(i=0;i<16; i++){
2274 dc+= src[i-stride];
2275 }
2276
2277 dc= 0x01010101*((dc + 16)>>5);
2278
2279 for(i=0; i<16; i++){
2280 ((uint32_t*)(src+i*stride))[0]=
2281 ((uint32_t*)(src+i*stride))[1]=
2282 ((uint32_t*)(src+i*stride))[2]=
2283 ((uint32_t*)(src+i*stride))[3]= dc;
2284 }
2285 }
2286
2287 static void pred16x16_left_dc_c(uint8_t *src, int stride){
2288 int i, dc=0;
2289
2290 for(i=0;i<16; i++){
2291 dc+= src[-1+i*stride];
2292 }
2293
2294 dc= 0x01010101*((dc + 8)>>4);
2295
2296 for(i=0; i<16; i++){
2297 ((uint32_t*)(src+i*stride))[0]=
2298 ((uint32_t*)(src+i*stride))[1]=
2299 ((uint32_t*)(src+i*stride))[2]=
2300 ((uint32_t*)(src+i*stride))[3]= dc;
2301 }
2302 }
2303
2304 static void pred16x16_top_dc_c(uint8_t *src, int stride){
2305 int i, dc=0;
2306
2307 for(i=0;i<16; i++){
2308 dc+= src[i-stride];
2309 }
2310 dc= 0x01010101*((dc + 8)>>4);
2311
2312 for(i=0; i<16; i++){
2313 ((uint32_t*)(src+i*stride))[0]=
2314 ((uint32_t*)(src+i*stride))[1]=
2315 ((uint32_t*)(src+i*stride))[2]=
2316 ((uint32_t*)(src+i*stride))[3]= dc;
2317 }
2318 }
2319
2320 void ff_pred16x16_128_dc_c(uint8_t *src, int stride){
2321 int i;
2322
2323 for(i=0; i<16; i++){
2324 ((uint32_t*)(src+i*stride))[0]=
2325 ((uint32_t*)(src+i*stride))[1]=
2326 ((uint32_t*)(src+i*stride))[2]=
2327 ((uint32_t*)(src+i*stride))[3]= 0x01010101U*128U;
2328 }
2329 }
2330
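/* Plane (intra 16x16 mode 3) prediction: H and V accumulate weighted
 * differences along the top row and left column to estimate the horizontal
 * and vertical gradients of a plane swept over the block; the svq3 variant
 * only scales the gradients differently and swaps H and V. */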
2331 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
2332 int i, j, k;
2333 int a;
2334 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2335 const uint8_t * const src0 = src+7-stride;
2336 const uint8_t *src1 = src+8*stride-1;
2337 const uint8_t *src2 = src1-2*stride; // == src+6*stride-1;
2338 int H = src0[1] - src0[-1];
2339 int V = src1[0] - src2[ 0];
2340 for(k=2; k<=8; ++k) {
2341 src1 += stride; src2 -= stride;
2342 H += k*(src0[k] - src0[-k]);
2343 V += k*(src1[0] - src2[ 0]);
2344 }
2345 if(svq3){
2346 H = ( 5*(H/4) ) / 16;
2347 V = ( 5*(V/4) ) / 16;
2348
2349 /* required for 100% accuracy */
2350 i = H; H = V; V = i;
2351 }else{
2352 H = ( 5*H+32 ) >> 6;
2353 V = ( 5*V+32 ) >> 6;
2354 }
2355
2356 a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2357 for(j=16; j>0; --j) {
2358 int b = a;
2359 a += V;
2360 for(i=-16; i<0; i+=4) {
2361 src[16+i] = cm[ (b ) >> 5 ];
2362 src[17+i] = cm[ (b+ H) >> 5 ];
2363 src[18+i] = cm[ (b+2*H) >> 5 ];
2364 src[19+i] = cm[ (b+3*H) >> 5 ];
2365 b += 4*H;
2366 }
2367 src += stride;
2368 }
2369 }
2370
2371 void ff_pred16x16_plane_c(uint8_t *src, int stride){
2372 pred16x16_plane_compat_c(src, stride, 0);
2373 }
2374
2375 void ff_pred8x8_vertical_c(uint8_t *src, int stride){
2376 int i;
2377 const uint32_t a= ((uint32_t*)(src-stride))[0];
2378 const uint32_t b= ((uint32_t*)(src-stride))[1];
2379
2380 for(i=0; i<8; i++){
2381 ((uint32_t*)(src+i*stride))[0]= a;
2382 ((uint32_t*)(src+i*stride))[1]= b;
2383 }
2384 }
2385
2386 void ff_pred8x8_horizontal_c(uint8_t *src, int stride){
2387 int i;
2388
2389 for(i=0; i<8; i++){
2390 ((uint32_t*)(src+i*stride))[0]=
2391 ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101;
2392 }
2393 }
2394
2395 void ff_pred8x8_128_dc_c(uint8_t *src, int stride){
2396 int i;
2397
2398 for(i=0; i<8; i++){
2399 ((uint32_t*)(src+i*stride))[0]=
2400 ((uint32_t*)(src+i*stride))[1]= 0x01010101U*128U;
2401 }
2402 }
2403
2404 static void pred8x8_left_dc_c(uint8_t *src, int stride){
2405 int i;
2406 int dc0, dc2;
2407
2408 dc0=dc2=0;
2409 for(i=0;i<4; i++){
2410 dc0+= src[-1+i*stride];
2411 dc2+= src[-1+(i+4)*stride];
2412 }
2413 dc0= 0x01010101*((dc0 + 2)>>2);
2414 dc2= 0x01010101*((dc2 + 2)>>2);
2415
2416 for(i=0; i<4; i++){
2417 ((uint32_t*)(src+i*stride))[0]=
2418 ((uint32_t*)(src+i*stride))[1]= dc0;
2419 }
2420 for(i=4; i<8; i++){
2421 ((uint32_t*)(src+i*stride))[0]=
2422 ((uint32_t*)(src+i*stride))[1]= dc2;
2423 }
2424 }
2425
2426 static void pred8x8_top_dc_c(uint8_t *src, int stride){
2427 int i;
2428 int dc0, dc1;
2429
2430 dc0=dc1=0;
2431 for(i=0;i<4; i++){
2432 dc0+= src[i-stride];
2433 dc1+= src[4+i-stride];
2434 }
2435 dc0= 0x01010101*((dc0 + 2)>>2);
2436 dc1= 0x01010101*((dc1 + 2)>>2);
2437
2438 for(i=0; i<4; i++){
2439 ((uint32_t*)(src+i*stride))[0]= dc0;
2440 ((uint32_t*)(src+i*stride))[1]= dc1;
2441 }
2442 for(i=4; i<8; i++){
2443 ((uint32_t*)(src+i*stride))[0]= dc0;
2444 ((uint32_t*)(src+i*stride))[1]= dc1;
2445 }
2446 }
2447
2448
2449 void ff_pred8x8_dc_c(uint8_t *src, int stride){
2450 int i;
2451 int dc0, dc1, dc2, dc3;
2452
2453 dc0=dc1=dc2=0;
2454 for(i=0;i<4; i++){
2455 dc0+= src[-1+i*stride] + src[i-stride];
2456 dc1+= src[4+i-stride];
2457 dc2+= src[-1+(i+4)*stride];
2458 }
2459 dc3= 0x01010101*((dc1 + dc2 + 4)>>3);
2460 dc0= 0x01010101*((dc0 + 4)>>3);
2461 dc1= 0x01010101*((dc1 + 2)>>2);
2462 dc2= 0x01010101*((dc2 + 2)>>2);
2463
2464 for(i=0; i<4; i++){
2465 ((uint32_t*)(src+i*stride))[0]= dc0;
2466 ((uint32_t*)(src+i*stride))[1]= dc1;
2467 }
2468 for(i=4; i<8; i++){
2469 ((uint32_t*)(src+i*stride))[0]= dc2;
2470 ((uint32_t*)(src+i*stride))[1]= dc3;
2471 }
2472 }
2473
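/* Same plane fit as pred16x16_plane_compat_c(), applied to an 8x8 block with
 * weights adapted to the smaller size. */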
2474 void ff_pred8x8_plane_c(uint8_t *src, int stride){
2475 int j, k;
2476 int a;
2477 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2478 const uint8_t * const src0 = src+3-stride;
2479 const uint8_t *src1 = src+4*stride-1;
2480 const uint8_t *src2 = src1-2*stride; // == src+2*stride-1;
2481 int H = src0[1] - src0[-1];
2482 int V = src1[0] - src2[ 0];
2483 for(k=2; k<=4; ++k) {
2484 src1 += stride; src2 -= stride;
2485 H += k*(src0[k] - src0[-k]);
2486 V += k*(src1[0] - src2[ 0]);
2487 }
2488 H = ( 17*H+16 ) >> 5;
2489 V = ( 17*V+16 ) >> 5;
2490
2491 a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2492 for(j=8; j>0; --j) {
2493 int b = a;
2494 a += V;
2495 src[0] = cm[ (b ) >> 5 ];
2496 src[1] = cm[ (b+ H) >> 5 ];
2497 src[2] = cm[ (b+2*H) >> 5 ];
2498 src[3] = cm[ (b+3*H) >> 5 ];
2499 src[4] = cm[ (b+4*H) >> 5 ];
2500 src[5] = cm[ (b+5*H) >> 5 ];
2501 src[6] = cm[ (b+6*H) >> 5 ];
2502 src[7] = cm[ (b+7*H) >> 5 ];
2503 src += stride;
2504 }
2505 }
2506
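/* 8x8 intra prediction filters its reference samples before use: the
 * PREDICT_8x8_LOAD_* macros apply [1 2 1]/4 smoothing to the left, top,
 * top-left and top-right edges, substituting the nearest available sample
 * where a neighbour is missing. */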
2507 #define SRC(x,y) src[(x)+(y)*stride]
2508 #define PL(y) \
2509 const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
2510 #define PREDICT_8x8_LOAD_LEFT \
2511 const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
2512 + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
2513 PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
2514 const int l7 attribute_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
2515
2516 #define PT(x) \
2517 const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
2518 #define PREDICT_8x8_LOAD_TOP \
2519 const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
2520 + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
2521 PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
2522 const int t7 attribute_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
2523 + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2
2524
2525 #define PTR(x) \
2526 t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
2527 #define PREDICT_8x8_LOAD_TOPRIGHT \
2528 int t8, t9, t10, t11, t12, t13, t14, t15; \
2529 if(has_topright) { \
2530 PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
2531 t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
2532 } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
2533
2534 #define PREDICT_8x8_LOAD_TOPLEFT \
2535 const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2
2536
2537 #define PREDICT_8x8_DC(v) \
2538 int y; \
2539 for( y = 0; y < 8; y++ ) { \
2540 ((uint32_t*)src)[0] = \
2541 ((uint32_t*)src)[1] = v; \
2542 src += stride; \
2543 }
2544
2545 static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2546 {
2547 PREDICT_8x8_DC(0x80808080);
2548 }
2549 static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2550 {
2551 PREDICT_8x8_LOAD_LEFT;
2552 const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
2553 PREDICT_8x8_DC(dc);
2554 }
2555 static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2556 {
2557 PREDICT_8x8_LOAD_TOP;
2558 const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
2559 PREDICT_8x8_DC(dc);
2560 }
2561 static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2562 {
2563 PREDICT_8x8_LOAD_LEFT;
2564 PREDICT_8x8_LOAD_TOP;
2565 const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
2566 +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
2567 PREDICT_8x8_DC(dc);
2568 }
2569 static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2570 {
2571 PREDICT_8x8_LOAD_LEFT;
2572 #define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
2573 ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
2574 ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
2575 #undef ROW
2576 }
2577 static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2578 {
2579 int y;
2580 PREDICT_8x8_LOAD_TOP;
2581 src[0] = t0;
2582 src[1] = t1;
2583 src[2] = t2;
2584 src[3] = t3;
2585 src[4] = t4;
2586 src[5] = t5;
2587 src[6] = t6;
2588 src[7] = t7;
2589 for( y = 1; y < 8; y++ )
2590 *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
2591 }
2592 static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2593 {
2594 PREDICT_8x8_LOAD_TOP;
2595 PREDICT_8x8_LOAD_TOPRIGHT;
2596 SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
2597 SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
2598 SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
2599 SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
2600 SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
2601 SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2602 SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
2603 SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
2604 SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
2605 SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
2606 SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
2607 SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
2608 SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
2609 SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
2610 SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
2611 }
2612 static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2613 {
2614 PREDICT_8x8_LOAD_TOP;
2615 PREDICT_8x8_LOAD_LEFT;
2616 PREDICT_8x8_LOAD_TOPLEFT;
2617 SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
2618 SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2619 SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
2620 SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2621 SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
2622 SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2623 SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
2624 SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
2625 SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
2626 SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
2627 SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
2628 SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
2629 SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
2630 SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2631 SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2632
2633 }
2634 static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2635 {
2636 PREDICT_8x8_LOAD_TOP;
2637 PREDICT_8x8_LOAD_LEFT;
2638 PREDICT_8x8_LOAD_TOPLEFT;
2639 SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
2640 SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2641 SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
2642 SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2643 SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
2644 SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2645 SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
2646 SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
2647 SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
2648 SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
2649 SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
2650 SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
2651 SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
2652 SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
2653 SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
2654 SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
2655 SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
2656 SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
2657 SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
2658 SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
2659 SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2660 SRC(7,0)= (t6 + t7 + 1) >> 1;
2661 }
2662 static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2663 {
2664 PREDICT_8x8_LOAD_TOP;
2665 PREDICT_8x8_LOAD_LEFT;
2666 PREDICT_8x8_LOAD_TOPLEFT;
2667 SRC(0,7)= (l6 + l7 + 1) >> 1;
2668 SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
2669 SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
2670 SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
2671 SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
2672 SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
2673 SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
2674 SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
2675 SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
2676 SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
2677 SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
2678 SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
2679 SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
2680 SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
2681 SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
2682 SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
2683 SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
2684 SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
2685 SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
2686 SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
2687 SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
2688 SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
2689 }
2690 static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2691 {
2692 PREDICT_8x8_LOAD_TOP;
2693 PREDICT_8x8_LOAD_TOPRIGHT;
2694 SRC(0,0)= (t0 + t1 + 1) >> 1;
2695 SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
2696 SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
2697 SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
2698 SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
2699 SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
2700 SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
2701 SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
2702 SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
2703 SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2704 SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
2705 SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2706 SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
2707 SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
2708 SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
2709 SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
2710 SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
2711 SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
2712 SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
2713 SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
2714 SRC(7,6)= (t10 + t11 + 1) >> 1;
2715 SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
2716 }
2717 static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2718 {
2719 PREDICT_8x8_LOAD_LEFT;
2720 SRC(0,0)= (l0 + l1 + 1) >> 1;
2721 SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
2722 SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
2723 SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
2724 SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
2725 SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
2726 SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
2727 SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
2728 SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
2729 SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
2730 SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
2731 SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
2732 SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
2733 SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
2734 SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
2735 SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
2736 SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
2737 SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
2738 }
2739 #undef PREDICT_8x8_LOAD_LEFT
2740 #undef PREDICT_8x8_LOAD_TOP
2741 #undef PREDICT_8x8_LOAD_TOPLEFT
2742 #undef PREDICT_8x8_LOAD_TOPRIGHT
2743 #undef PREDICT_8x8_DC
2744 #undef PTR
2745 #undef PT
2746 #undef PL
2747 #undef SRC
2748
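/* Motion compensation for one partition: mv_cache is in quarter-pel units,
 * luma_xy selects one of the 16 qpel interpolation functions from the
 * fractional MV bits, and ff_emulated_edge_mc() synthesizes border pixels
 * when the reference block plus the filter margin lies outside the picture. */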
2749 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2750 uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2751 int src_x_offset, int src_y_offset,
2752 qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2753 MpegEncContext * const s = &h->s;
2754 const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2755 int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2756 const int luma_xy= (mx&3) + ((my&3)<<2);
2757 uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
2758 uint8_t * src_cb, * src_cr;
2759 int extra_width= h->emu_edge_width;
2760 int extra_height= h->emu_edge_height;
2761 int emu=0;
2762 const int full_mx= mx>>2;
2763 const int full_my= my>>2;
2764 const int pic_width = 16*s->mb_width;
2765 const int pic_height = 16*s->mb_height >> MB_MBAFF;
2766
2767 if(!pic->data[0]) //FIXME this is unacceptable, some sensible error concealment must be done for missing reference frames
2768 return;
2769
2770 if(mx&7) extra_width -= 3;
2771 if(my&7) extra_height -= 3;
2772
2773 if( full_mx < 0-extra_width
2774 || full_my < 0-extra_height
2775 || full_mx + 16/*FIXME*/ > pic_width + extra_width
2776 || full_my + 16/*FIXME*/ > pic_height + extra_height){
2777 ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
2778 src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
2779 emu=1;
2780 }
2781
2782 qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
2783 if(!square){
2784 qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
2785 }
2786
2787 if(s->flags&CODEC_FLAG_GRAY) return;
2788
2789