/*
 * VP8 compatible video decoder
 *
 * Copyright (C) 2010 David Conrad
 * Copyright (C) 2010 Ronald S. Bultje
 * Copyright (C) 2010 Jason Garrett-Glaser
 * Copyright (C) 2012 Daniel Kang
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/imgutils.h"
#include "avcodec.h"
#include "internal.h"
#include "vp8.h"
#include "vp8data.h"
#include "rectangle.h"
#include "thread.h"

#if ARCH_ARM
# include "arm/vp8.h"
#endif

static void free_buffers(VP8Context *s)
{
    int i;
    if (s->thread_data)
        for (i = 0; i < MAX_THREADS; i++) {
            av_freep(&s->thread_data[i].filter_strength);
            av_freep(&s->thread_data[i].edge_emu_buffer);
        }
    av_freep(&s->thread_data);
    av_freep(&s->macroblocks_base);
    av_freep(&s->intra4x4_pred_mode_top);
    av_freep(&s->top_nnz);
    av_freep(&s->top_border);

    s->macroblocks = NULL;
}

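/* Note: the decoder stores each frame's per-macroblock segmentation map in
 * that frame's ref_index[0]. Recycling freed maps through
 * s->segmentation_maps avoids an allocation per frame and keeps the previous
 * frame's map available for frames that don't update the map. */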
static int vp8_alloc_frame(VP8Context *s, AVFrame *f)
{
    int ret;
    if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0)
        return ret;
    if (s->num_maps_to_be_freed && !s->maps_are_invalid) {
        f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed];
    } else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) {
        ff_thread_release_buffer(s->avctx, f);
        return AVERROR(ENOMEM);
    }
    return 0;
}

static void vp8_release_frame(VP8Context *s, AVFrame *f, int prefer_delayed_free, int can_direct_free)
{
    if (f->ref_index[0]) {
        if (prefer_delayed_free) {
            /* Upon a size change, we want to free the maps but other threads may still
             * be using them, so queue them. Upon a seek, all threads are inactive so
             * we want to cache one to prevent re-allocation in the next decoding
             * iteration, but the rest we can free directly. */
            int max_queued_maps = can_direct_free ? 1 : FF_ARRAY_ELEMS(s->segmentation_maps);
            if (s->num_maps_to_be_freed < max_queued_maps) {
                s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0];
            } else if (can_direct_free) /* vp8_decode_flush(), but our queue is full */ {
                av_free(f->ref_index[0]);
            } /* else: MEMLEAK (should never happen, but better that than crash) */
            f->ref_index[0] = NULL;
        } else /* vp8_decode_free() */ {
            av_free(f->ref_index[0]);
        }
    }
    ff_thread_release_buffer(s->avctx, f);
}

static void vp8_decode_flush_impl(AVCodecContext *avctx,
                                  int prefer_delayed_free, int can_direct_free, int free_mem)
{
    VP8Context *s = avctx->priv_data;
    int i;

    if (!avctx->internal->is_copy) {
        for (i = 0; i < 5; i++)
            if (s->frames[i].data[0])
                vp8_release_frame(s, &s->frames[i], prefer_delayed_free, can_direct_free);
    }
    memset(s->framep, 0, sizeof(s->framep));

    if (free_mem) {
        free_buffers(s);
        s->maps_are_invalid = 1;
    }
}

static void vp8_decode_flush(AVCodecContext *avctx)
{
    vp8_decode_flush_impl(avctx, 1, 1, 0);
}

static int update_dimensions(VP8Context *s, int width, int height)
{
    AVCodecContext *avctx = s->avctx;
    int i;

    if (width  != s->avctx->width ||
        height != s->avctx->height) {
        if (av_image_check_size(width, height, 0, s->avctx))
            return AVERROR_INVALIDDATA;

        vp8_decode_flush_impl(s->avctx, 1, 0, 1);

        avcodec_set_dimensions(s->avctx, width, height);
    }

    s->mb_width  = (s->avctx->coded_width +15) / 16;
    s->mb_height = (s->avctx->coded_height+15) / 16;

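    /* Two macroblock layouts are used: sliced threading allocates a full
     * (mb_width+2) x (mb_height+2) array so each row can address its top
     * neighbours independently, while the frame-threading/single-thread case
     * gets by with a smaller rolling buffer (see the row indexing in
     * vp8_decode_mb_row_no_filter). */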
    s->mb_layout = (avctx->active_thread_type == FF_THREAD_SLICE) && (FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1);
    if (!s->mb_layout) { // Frame threading and one thread
        s->macroblocks_base       = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
        s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4);
    }
    else // Sliced threading
        s->macroblocks_base = av_mallocz((s->mb_width+2)*(s->mb_height+2)*sizeof(*s->macroblocks));
    s->top_nnz     = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
    s->top_border  = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
    s->thread_data = av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));

    for (i = 0; i < MAX_THREADS; i++) {
        s->thread_data[i].filter_strength = av_mallocz(s->mb_width*sizeof(*s->thread_data[0].filter_strength));
#if HAVE_THREADS
        pthread_mutex_init(&s->thread_data[i].lock, NULL);
        pthread_cond_init(&s->thread_data[i].cond, NULL);
#endif
    }

    if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
        (!s->intra4x4_pred_mode_top && !s->mb_layout))
        return AVERROR(ENOMEM);

    s->macroblocks = s->macroblocks_base + 1;

    return 0;
}

static void parse_segment_info(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i;

    s->segmentation.update_map = vp8_rac_get(c);

    if (vp8_rac_get(c)) { // update segment feature data
        s->segmentation.absolute_vals = vp8_rac_get(c);

        for (i = 0; i < 4; i++)
            s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);

        for (i = 0; i < 4; i++)
            s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
    }
    if (s->segmentation.update_map)
        for (i = 0; i < 3; i++)
            s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
}

static void update_lf_deltas(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i;

    for (i = 0; i < 4; i++) {
        if (vp8_rac_get(c)) {
            s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);

            if (vp8_rac_get(c))
                s->lf_delta.ref[i] = -s->lf_delta.ref[i];
        }
    }

    for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
        if (vp8_rac_get(c)) {
            s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);

            if (vp8_rac_get(c))
                s->lf_delta.mode[i] = -s->lf_delta.mode[i];
        }
    }
}

static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
{
    const uint8_t *sizes = buf;
    int i;

    s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);

    buf      += 3*(s->num_coeff_partitions-1);
    buf_size -= 3*(s->num_coeff_partitions-1);
    if (buf_size < 0)
        return -1;

    for (i = 0; i < s->num_coeff_partitions-1; i++) {
        int size = AV_RL24(sizes + 3*i);
        if (buf_size - size < 0)
            return -1;

        ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
        buf      += size;
        buf_size -= size;
    }
    ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);

    return 0;
}

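/* One set of dequant factors is derived per segment (or a single global set
 * when segmentation is off): the per-plane deltas below are added to the
 * base index and the result is clipped to 0..127 before the table lookup. */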
static void get_quants(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i, base_qi;

    int yac_qi     = vp8_rac_get_uint(c, 7);
    int ydc_delta  = vp8_rac_get_sint(c, 4);
    int y2dc_delta = vp8_rac_get_sint(c, 4);
    int y2ac_delta = vp8_rac_get_sint(c, 4);
    int uvdc_delta = vp8_rac_get_sint(c, 4);
    int uvac_delta = vp8_rac_get_sint(c, 4);

    for (i = 0; i < 4; i++) {
        if (s->segmentation.enabled) {
            base_qi = s->segmentation.base_quant[i];
            if (!s->segmentation.absolute_vals)
                base_qi += yac_qi;
        } else
            base_qi = yac_qi;

        s->qmat[i].luma_qmul[0]    =           vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
        s->qmat[i].luma_qmul[1]    =           vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
        s->qmat[i].luma_dc_qmul[0] =       2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
        /* 101581>>16 is equivalent to 155/100 */
        s->qmat[i].luma_dc_qmul[1] = (101581 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)]) >> 16;
        s->qmat[i].chroma_qmul[0]  =           vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
        s->qmat[i].chroma_qmul[1]  =           vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];

        s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
        s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
    }
}

/**
 * Determine which buffers golden and altref should be updated with after this frame.
 * The spec isn't clear here, so I'm going by my understanding of what libvpx does.
 *
 * - Intra frames update all 3 references.
 * - Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set.
 * - If the update (golden|altref) flag is set, that buffer is updated with
 *   the current frame.
 * - If the flag is not set, the number read means:
 *   0: no update
 *   1: VP56_FRAME_PREVIOUS
 *   2: update golden with altref, or update altref with golden
 */
static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
{
    VP56RangeCoder *c = &s->c;

    if (update)
        return VP56_FRAME_CURRENT;

    switch (vp8_rac_get_uint(c, 2)) {
    case 1:
        return VP56_FRAME_PREVIOUS;
    case 2:
        return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
    }
    return VP56_FRAME_NONE;
}

static void update_refs(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;

    int update_golden = vp8_rac_get(c);
    int update_altref = vp8_rac_get(c);

    s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
    s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
}

static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VP56RangeCoder *c = &s->c;
    int header_size, hscale, vscale, i, j, k, l, m, ret;
    int width  = s->avctx->width;
    int height = s->avctx->height;

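    /* The 3-byte frame tag packs, starting at the LSB: the inverted keyframe
     * flag, a 3-bit profile, the show-frame flag, and the 19-bit size of the
     * first ("header") partition. */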
    s->keyframe  = !(buf[0] & 1);
    s->profile   =  (buf[0]>>1) & 7;
    s->invisible = !(buf[0] & 0x10);
    header_size  = AV_RL24(buf) >> 5;
    buf      += 3;
    buf_size -= 3;

    if (s->profile > 3)
        av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);

    if (!s->profile)
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
    else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));

    if (header_size > buf_size - 7*s->keyframe) {
        av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
        return AVERROR_INVALIDDATA;
    }

    if (s->keyframe) {
        if (AV_RL24(buf) != 0x2a019d) {
            av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
            return AVERROR_INVALIDDATA;
        }
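        /* Dimensions are 16-bit little-endian values whose low 14 bits are
         * the size in pixels; the top 2 bits are an upscaling factor. */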
        width  = AV_RL16(buf+3) & 0x3fff;
        height = AV_RL16(buf+5) & 0x3fff;
        hscale = buf[4] >> 6;
        vscale = buf[6] >> 6;
        buf      += 7;
        buf_size -= 7;

        if (hscale || vscale)
            av_log_missing_feature(s->avctx, "Upscaling", 1);

        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
        for (i = 0; i < 4; i++)
            for (j = 0; j < 16; j++)
                memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
                       sizeof(s->prob->token[i][j]));
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
        memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
    }

    ff_vp56_init_range_decoder(c, buf, header_size);
    buf      += header_size;
    buf_size -= header_size;

    if (s->keyframe) {
        if (vp8_rac_get(c))
            av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
        vp8_rac_get(c); // whether we can skip clamping in dsp functions
    }

    if ((s->segmentation.enabled = vp8_rac_get(c)))
        parse_segment_info(s);
    else
        s->segmentation.update_map = 0; // FIXME: move this to some init function?

    s->filter.simple    = vp8_rac_get(c);
    s->filter.level     = vp8_rac_get_uint(c, 6);
    s->filter.sharpness = vp8_rac_get_uint(c, 3);

    if ((s->lf_delta.enabled = vp8_rac_get(c)))
        if (vp8_rac_get(c))
            update_lf_deltas(s);

    if (setup_partitions(s, buf, buf_size)) {
        av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
        return AVERROR_INVALIDDATA;
    }

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height) {
        if ((ret = update_dimensions(s, width, height)) < 0)
            return ret;
    }

    get_quants(s);

    if (!s->keyframe) {
        update_refs(s);
        s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
        s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
    }

    // if we aren't saving this frame's probabilities for future frames,
    // make a copy of the current probabilities
    if (!(s->update_probabilities = vp8_rac_get(c)))
        s->prob[1] = s->prob[0];

    s->update_last = s->keyframe || vp8_rac_get(c);

    for (i = 0; i < 4; i++)
        for (j = 0; j < 8; j++)
            for (k = 0; k < 3; k++)
                for (l = 0; l < NUM_DCT_TOKENS-1; l++)
                    if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
                        int prob = vp8_rac_get_uint(c, 8);
                        for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
                            s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
                    }

    if ((s->mbskip_enabled = vp8_rac_get(c)))
        s->prob->mbskip = vp8_rac_get_uint(c, 8);

    if (!s->keyframe) {
        s->prob->intra  = vp8_rac_get_uint(c, 8);
        s->prob->last   = vp8_rac_get_uint(c, 8);
        s->prob->golden = vp8_rac_get_uint(c, 8);

        if (vp8_rac_get(c))
            for (i = 0; i < 4; i++)
                s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
        if (vp8_rac_get(c))
            for (i = 0; i < 3; i++)
                s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);

        // 17.2 MV probability update
        for (i = 0; i < 2; i++)
            for (j = 0; j < 19; j++)
                if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
                    s->prob->mvc[i][j] = vp8_rac_get_nn(c);
    }

    return 0;
}

static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
{
    dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
    dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
}

/**
 * Motion vector coding, 17.1.
 */
static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
{
    int bit, x = 0;

    if (vp56_rac_get_prob_branchy(c, p[0])) {
        int i;

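        /* Large magnitudes: bits 0-2 are coded LSB-first, then bits 9 down
         * to 4. Bit 3 is forced to 1 when no higher bit is set, since this
         * branch implies a magnitude of at least 8. */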
        for (i = 0; i < 3; i++)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        for (i = 9; i > 3; i--)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
            x += 8;
    } else {
        // small_mvtree
        const uint8_t *ps = p+2;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + 3*bit;
        x  += 4*bit;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + bit;
        x  += 2*bit;
        x  += vp56_rac_get_prob(c, *ps);
    }

    return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
}

static av_always_inline
const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
{
    if (left == top)
        return vp8_submv_prob[4-!!left];
    if (!top)
        return vp8_submv_prob[2];
    return vp8_submv_prob[1-!!left];
}

/**
 * Split motion vector prediction, 16.4.
 * @returns the number of motion vectors parsed (2, 4 or 16)
 */
static av_always_inline
int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int layout)
{
    int part_idx;
    int n, num;
    VP8Macroblock *top_mb;
    VP8Macroblock *left_mb = &mb[-1];
    const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
                  *mbsplits_top,
                  *mbsplits_cur, *firstidx;
    VP56mv *top_mv;
    VP56mv *left_mv = left_mb->bmv;
    VP56mv *cur_mv  = mb->bmv;

    if (!layout) // layout is inlined, s->mb_layout is not
        top_mb = &mb[2];
    else
        top_mb = &mb[-s->mb_width-1];
    mbsplits_top = vp8_mbsplits[top_mb->partitioning];
    top_mv       = top_mb->bmv;

    if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
        if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
            part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
        } else {
            part_idx = VP8_SPLITMVMODE_8x8;
        }
    } else {
        part_idx = VP8_SPLITMVMODE_4x4;
    }

    num          = vp8_mbsplit_count[part_idx];
    mbsplits_cur = vp8_mbsplits[part_idx],
    firstidx     = vp8_mbfirstidx[part_idx];
    mb->partitioning = part_idx;

    for (n = 0; n < num; n++) {
        int k = firstidx[n];
        uint32_t left, above;
        const uint8_t *submv_prob;

        if (!(k & 3))
            left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
        else
            left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
        if (k <= 3)
            above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
        else
            above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);

        submv_prob = get_submv_prob(left, above);

        if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
            if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
                if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
                    mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
                    mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
                } else {
                    AV_ZERO32(&mb->bmv[n]);
                }
            } else {
                AV_WN32A(&mb->bmv[n], above);
            }
        } else {
            AV_WN32A(&mb->bmv[n], left);
        }
    }

    return num;
}

static av_always_inline
void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[3] = { 0      /* top */,
                                  mb - 1 /* left */,
                                  0      /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    if (!layout) { // layout is inlined (s->mb_layout is not)
        mb_edge[0] = mb + 2;
        mb_edge[2] = mb + 1;
    }
    else {
        mb_edge[0] = mb - s->mb_width-1;
        mb_edge[2] = mb - s->mb_width-2;
    }

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
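    /* Each usable neighbour votes for its MV with weight 2 (top, left) or 1
     * (top-left); distinct MVs fill near_mv[1..3] in order of appearance
     * (each new MV is compared against the most recently stored one). If the
     * neighbour's reference has the opposite sign bias, both packed int16
     * components are negated in one SWAR operation. */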
#define MV_EDGE_CHECK(n)\
{\
    VP8Macroblock *edge = mb_edge[n];\
    int edge_ref = edge->ref_frame;\
    if (edge_ref != VP56_FRAME_CURRENT) {\
        uint32_t mv = AV_RN32A(&edge->mv);\
        if (mv) {\
            if (cur_sign_bias != sign_bias[edge_ref]) {\
                /* SWAR negate of the values in mv. */\
                mv = ~mv;\
                mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
            }\
            if (!n || mv != AV_RN32A(&near_mv[idx]))\
                AV_WN32A(&near_mv[++idx], mv);\
            cnt[idx]      += 1 + (n != 2);\
        } else\
            cnt[CNT_ZERO] += 1 + (n != 2);\
    }\
}

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {

                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout) - 1];
                } else {
                    mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}

static av_always_inline
void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
                           int mb_x, int keyframe, int layout)
{
    uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;

    if (layout == 1) {
        VP8Macroblock *mb_top = mb - s->mb_width - 1;
        memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
    }
    if (keyframe) {
        int x, y;
        uint8_t* top;
        uint8_t* const left = s->intra4x4_pred_mode_left;
        if (layout == 1)
            top = mb->intra4x4_pred_mode_top;
        else
            top = s->intra4x4_pred_mode_top + 4 * mb_x;
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                const uint8_t *ctx;
                ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
                *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
                left[y] = top[x] = *intra4x4;
                intra4x4++;
            }
        }
    } else {
        int i;
        for (i = 0; i < 16; i++)
            intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
    }
}

static av_always_inline
void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
                    uint8_t *segment, uint8_t *ref, int layout)
{
    VP56RangeCoder *c = &s->c;

    if (s->segmentation.update_map)
        *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
    else if (s->segmentation.enabled)
        *segment = ref ? *ref : *segment;
    mb->segment = *segment;

    mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;

    if (s->keyframe) {
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);

        if (mb->mode == MODE_I4x4) {
            decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
        } else {
            const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
            if (s->mb_layout == 1)
                AV_WN32A(mb->intra4x4_pred_mode_top, modes);
            else
                AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
            AV_WN32A(s->intra4x4_pred_mode_left, modes);
        }

        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
        mb->ref_frame = VP56_FRAME_CURRENT;
    } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
        // inter MB, 16.2
        if (vp56_rac_get_prob_branchy(c, s->prob->last))
            mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
                VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
        else
            mb->ref_frame = VP56_FRAME_PREVIOUS;
        s->ref_count[mb->ref_frame-1]++;

        // motion vectors, 16.3
        decode_mvs(s, mb, mb_x, mb_y, layout);
    } else {
        // intra MB, 16.1
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);

        if (mb->mode == MODE_I4x4)
            decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);

        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
        mb->ref_frame        = VP56_FRAME_CURRENT;
        mb->partitioning     = VP8_SPLITMVMODE_NONE;
        AV_ZERO32(&mb->bmv[0]);
    }
}

#ifndef decode_block_coeffs_internal
/**
 * @param r arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i initial coeff index, 0 unless a separate DC block is coded
 * @param qmul array holding the dc/ac dequant factor at position 0/1
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static int decode_block_coeffs_internal(VP56RangeCoder *r, DCTELEM block[16],
                                        uint8_t probs[16][3][NUM_DCT_TOKENS-1],
                                        int i, uint8_t *token_prob, int16_t qmul[2])
{
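    /* Work on a stack copy of the range coder so the compiler can keep its
     * state in registers through this hot loop; it is written back to *r
     * before returning. */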
    VP56RangeCoder c = *r;
    goto skip_eob;
    do {
        int coeff;
        if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
            break;

skip_eob:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
            if (++i == 16)
                break; // invalid input; blocks should end with EOB
            token_prob = probs[i][0];
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
            coeff = 1;
            token_prob = probs[i+1][1];
        } else {
            if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(&c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
                        coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
                    } else {                                             // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    int a = vp56_rac_get_prob(&c, token_prob[8]);
                    int b = vp56_rac_get_prob(&c, token_prob[9+a]);
                    int cat = (a<<1) + b;
                    coeff  = 3 + (8<<cat);
                    coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            token_prob = probs[i+1][2];
        }
        block[zigzag_scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    *r = c;
    return i;
}
#endif

/**
 * @param c arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i initial coeff index, 0 unless a separate DC block is coded
 * @param zero_nhood the initial prediction context for number of surrounding
 *                   all-zero blocks (only left/top, so 0-2)
 * @param qmul array holding the dc/ac dequant factor at position 0/1
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static av_always_inline
int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
                        uint8_t probs[16][3][NUM_DCT_TOKENS-1],
                        int i, int zero_nhood, int16_t qmul[2])
{
    uint8_t *token_prob = probs[i][zero_nhood];
    if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
        return 0;
    return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
}

static av_always_inline
void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c, VP8Macroblock *mb,
                      uint8_t t_nnz[9], uint8_t l_nnz[9])
{
    int i, x, y, luma_start = 0, luma_ctx = 3;
    int nnz_pred, nnz, nnz_total = 0;
    int segment = mb->segment;
    int block_dc = 0;

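    /* For macroblock modes with a separate DC block, the 16 luma DC values
     * are coded (and inverse-transformed) as one 4x4 WHT block; the luma AC
     * scans then start at coefficient 1 and use a different token plane
     * (luma_ctx 0 instead of 3). */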
    if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
        nnz_pred = t_nnz[8] + l_nnz[8];

        // decode DC values and do hadamard
        nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0, nnz_pred,
                                  s->qmat[segment].luma_dc_qmul);
        l_nnz[8] = t_nnz[8] = !!nnz;
        if (nnz) {
            nnz_total += nnz;
            block_dc = 1;
            if (nnz == 1)
                s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
            else
                s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
        }
        luma_start = 1;
        luma_ctx = 0;
    }

    // luma blocks
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            nnz_pred = l_nnz[y] + t_nnz[x];
            nnz = decode_block_coeffs(c, td->block[y][x], s->prob->token[luma_ctx], luma_start,
                                      nnz_pred, s->qmat[segment].luma_qmul);
            // nnz+block_dc may be one more than the actual last index, but we don't care
            td->non_zero_count_cache[y][x] = nnz + block_dc;
            t_nnz[x] = l_nnz[y] = !!nnz;
            nnz_total += nnz;
        }

    // chroma blocks
    // TODO: what to do about dimensions? 2nd dim for luma is x,
    // but for chroma it's (y<<1)|x
    for (i = 4; i < 6; i++)
        for (y = 0; y < 2; y++)
            for (x = 0; x < 2; x++) {
                nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
                nnz = decode_block_coeffs(c, td->block[i][(y<<1)+x], s->prob->token[2], 0,
                                          nnz_pred, s->qmat[segment].chroma_qmul);
                td->non_zero_count_cache[i][(y<<1)+x] = nnz;
                t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
                nnz_total += nnz;
            }

    // if there were no coded coeffs despite the macroblock not being marked skip,
    // we MUST not do the inner loop filter and should not do IDCT
    // Since skip isn't used for bitstream prediction, just manually set it.
    if (!nnz_total)
        mb->skip = 1;
}

static av_always_inline
void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
                      int linesize, int uvlinesize, int simple)
{
    AV_COPY128(top_border, src_y + 15*linesize);
    if (!simple) {
        AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
        AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
    }
}

static av_always_inline
void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
                    int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
                    int simple, int xchg)
{
    uint8_t *top_border_m1 = top_border-32;     // for TL prediction
    src_y  -=   linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

#define XCHG(a,b,xchg) do {        \
        if (xchg) AV_SWAP64(b,a);  \
        else      AV_COPY64(b,a);  \
    } while (0)

    XCHG(top_border_m1+8, src_y-8, xchg);
    XCHG(top_border,      src_y,   xchg);
    XCHG(top_border+8,    src_y+8, 1);
    if (mb_x < mb_width-1)
        XCHG(top_border+32, src_y+16, 1);

    // only copy chroma for normal loop filter
    // or to initialize the top row to 127
    if (!simple || !mb_y) {
        XCHG(top_border_m1+16, src_cb-8, xchg);
        XCHG(top_border_m1+24, src_cr-8, xchg);
        XCHG(top_border+16, src_cb, 1);
        XCHG(top_border+24, src_cr, 1);
    }
}

static av_always_inline
int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
{
    if (!mb_x) {
        return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
    } else {
        return mb_y ? mode : LEFT_DC_PRED8x8;
    }
}

static av_always_inline
int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
{
    if (!mb_x) {
        return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
    } else {
        return mb_y ? mode : HOR_PRED8x8;
    }
}

static av_always_inline
int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
{
    if (mode == DC_PRED8x8) {
        return check_dc_pred8x8_mode(mode, mb_x, mb_y);
    } else {
        return mode;
    }
}

static av_always_inline
int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
{
    switch (mode) {
    case DC_PRED8x8:
        return check_dc_pred8x8_mode(mode, mb_x, mb_y);
    case VERT_PRED8x8:
        return !mb_y ? DC_127_PRED8x8 : mode;
    case HOR_PRED8x8:
        return !mb_x ? DC_129_PRED8x8 : mode;
    case PLANE_PRED8x8 /*TM*/:
        return check_tm_pred8x8_mode(mode, mb_x, mb_y);
    }
    return mode;
}

static av_always_inline
int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
{
    if (!mb_x) {
        return mb_y ? VERT_VP8_PRED : DC_129_PRED;
    } else {
        return mb_y ? mode : HOR_VP8_PRED;
    }
}

static av_always_inline
int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
{
    switch (mode) {
    case VERT_PRED:
        if (!mb_x && mb_y) {
            *copy_buf = 1;
            return mode;
        }
        /* fall-through */
    case DIAG_DOWN_LEFT_PRED:
    case VERT_LEFT_PRED:
        return !mb_y ? DC_127_PRED : mode;
    case HOR_PRED:
        if (!mb_y) {
            *copy_buf = 1;
            return mode;
        }
        /* fall-through */
    case HOR_UP_PRED:
        return !mb_x ? DC_129_PRED : mode;
    case TM_VP8_PRED:
        return check_tm_pred4x4_mode(mode, mb_x, mb_y);
    case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
    case DIAG_DOWN_RIGHT_PRED:
    case VERT_RIGHT_PRED:
    case HOR_DOWN_PRED:
        if (!mb_y || !mb_x)
            *copy_buf = 1;
        return mode;
    }
    return mode;
}

static av_always_inline
void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                   VP8Macroblock *mb, int mb_x, int mb_y)
{
    AVCodecContext *avctx = s->avctx;
    int x, y, mode, nnz;
    uint32_t tr;

    // for the first row, we need to run xchg_mb_border to init the top edge to 127
    // otherwise, skip it if we aren't going to deblock
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 1);

    if (mb->mode < MODE_I4x4) {
        if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
            mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
        } else {
            mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
        }
        s->hpc.pred16x16[mode](dst[0], s->linesize);
    } else {
        uint8_t *ptr = dst[0];
        uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
        uint8_t tr_top[4] = { 127, 127, 127, 127 };

        // all blocks on the right edge of the macroblock use the bottom edge
        // of the top macroblock for their topright edge
        uint8_t *tr_right = ptr - s->linesize + 16;

        // if we're on the right edge of the frame, said edge is extended
        // from the top macroblock
        if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
            mb_x == s->mb_width-1) {
            tr = tr_right[-1]*0x01010101u;
            tr_right = (uint8_t *)&tr;
        }

        if (mb->skip)
            AV_ZERO128(td->non_zero_count_cache);

        for (y = 0; y < 4; y++) {
            uint8_t *topright = ptr + 4 - s->linesize;
            for (x = 0; x < 4; x++) {
                int copy = 0, linesize = s->linesize;
                uint8_t *dst = ptr+4*x;
                DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];
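                /* copy_dst is a small scratch block with stride 8: the 4x4
                 * pixels go at offset 12, the top row at 4..7, the top-left
                 * pixel at 3 and the left column at 11/19/27/35. It lets the
                 * regular pred4x4 functions run safely on edge MBs. */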

                if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
                    topright = tr_top;
                } else if (x == 3)
                    topright = tr_right;

                if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
                    mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
                    if (copy) {
                        dst = copy_dst + 12;
                        linesize = 8;
                        if (!(mb_y + y)) {
                            copy_dst[3] = 127U;
                            AV_WN32A(copy_dst+4, 127U * 0x01010101U);
                        } else {
                            AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
                            if (!(mb_x + x)) {
                                copy_dst[3] = 129U;
                            } else {
                                copy_dst[3] = ptr[4*x-s->linesize-1];
                            }
                        }
                        if (!(mb_x + x)) {
                            copy_dst[11] =
                            copy_dst[19] =
                            copy_dst[27] =
                            copy_dst[35] = 129U;
                        } else {
                            copy_dst[11] = ptr[4*x              -1];
                            copy_dst[19] = ptr[4*x+s->linesize  -1];
                            copy_dst[27] = ptr[4*x+s->linesize*2-1];
                            copy_dst[35] = ptr[4*x+s->linesize*3-1];
                        }
                    }
                } else {
                    mode = intra4x4[x];
                }
                s->hpc.pred4x4[mode](dst, topright, linesize);
                if (copy) {
                    AV_COPY32(ptr+4*x              , copy_dst+12);
                    AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
                    AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
                    AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
                }

                nnz = td->non_zero_count_cache[y][x];
                if (nnz) {
                    if (nnz == 1)
                        s->vp8dsp.vp8_idct_dc_add(ptr+4*x, td->block[y][x], s->linesize);
                    else
                        s->vp8dsp.vp8_idct_add(ptr+4*x, td->block[y][x], s->linesize);
                }
                topright += 4;
            }

            ptr += 4*s->linesize;
            intra4x4 += 4;
        }
    }

    if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
        mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode, mb_x, mb_y);
    } else {
        mode = check_intra_pred8x8_mode(mb->chroma_pred_mode, mb_x, mb_y);
    }
    s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
    s->hpc.pred8x8[mode](dst[2], s->uvlinesize);

    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 0);
}

static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
                                // also function pointer index
    { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
    { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
};
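/* Reading the columns: subpel position 0 is a plain copy (no filtering), odd
 * eighth-pel positions use the 4-tap filter (1 extra pixel left, 2 right),
 * and even nonzero positions use the 6-tap filter (2 left, 3 right). */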

/**
 * luma MC function
 *
 * @param s VP8 decoding context
 * @param dst target buffer for block data at block position
 * @param ref reference picture buffer at origin (0, 0)
 * @param mv motion vector (relative to block position) to get pixel data from
 * @param x_off horizontal position of block from origin (0, 0)
 * @param y_off vertical position of block from origin (0, 0)
 * @param block_w width of block (16, 8 or 4)
 * @param block_h height of block (always same as block_w)
 * @param width width of src/dst plane data
 * @param height height of src/dst plane data
 * @param linesize size of a single line of plane data, including padding
 * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
 */
static av_always_inline
void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
                 AVFrame *ref, const VP56mv *mv,
                 int x_off, int y_off, int block_w, int block_h,
                 int width, int height, int linesize,
                 vp8_mc_func mc_func[3][3])
{
    uint8_t *src = ref->data[0];

    if (AV_RN32A(mv)) {

        int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
        int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];

        x_off += mv->x >> 2;
        y_off += mv->y >> 2;

        // edge emulation
        ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
        src += y_off * linesize + x_off;
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
            s->dsp.emulated_edge_mc(td->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                    x_off - mx_idx, y_off - my_idx, width, height);
            src = td->edge_emu_buffer + mx_idx + linesize * my_idx;
        }
        mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
    } else {
        ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
        mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
    }
}

/**
 * chroma MC function
 *
 * @param s VP8 decoding context
 * @param dst1 target buffer for block data at block position (U plane)
 * @param dst2 target buffer for block data at block position (V plane)
 * @param ref reference picture buffer at origin (0, 0)
 * @param mv motion vector (relative to block position) to get pixel data from
 * @param x_off horizontal position of block from origin (0, 0)
 * @param y_off vertical position of block from origin (0, 0)
 * @param block_w width of block (16, 8 or 4)
 * @param block_h height of block (always same as block_w)
 * @param width width of src/dst plane data
 * @param height height of src/dst plane data
 * @param linesize size of a single line of plane data, including padding
 * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
 */
static av_always_inline
void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1, uint8_t *dst2,
                   AVFrame *ref, const VP56mv *mv, int x_off, int y_off,
                   int block_w, int block_h, int width, int height, int linesize,
                   vp8_mc_func mc_func[3][3])
{
    uint8_t *src1 = ref->data[1], *src2 = ref->data[2];

    if (AV_RN32A(mv)) {
        int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
        int my = mv->y&7, my_idx = subpel_idx[0][my];

        x_off += mv->x >> 3;
        y_off += mv->y >> 3;

        // edge emulation
        src1 += y_off * linesize + x_off;
        src2 += y_off * linesize + x_off;
        ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
            s->dsp.emulated_edge_mc(td->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                    x_off - mx_idx, y_off - my_idx, width, height);
            src1 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);

            s->dsp.emulated_edge_mc(td->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                    x_off - mx_idx, y_off - my_idx, width, height);
            src2 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
        } else {
            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
        }
    } else {
        ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
        mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
        mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
    }
}

static av_always_inline
void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                 AVFrame *ref_frame, int x_off, int y_off,
                 int bx_off, int by_off,
                 int block_w, int block_h,
                 int width, int height, VP56mv *mv)
{
    VP56mv uvmv = *mv;

    /* Y */
    vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
                ref_frame, mv, x_off + bx_off, y_off + by_off,
                block_w, block_h, width, height, s->linesize,
                s->put_pixels_tab[block_w == 8]);

    /* U/V */
    if (s->profile == 3) {
        uvmv.x &= ~7;
        uvmv.y &= ~7;
    }
    x_off   >>= 1; y_off   >>= 1;
    bx_off  >>= 1; by_off  >>= 1;
    width   >>= 1; height  >>= 1;
    block_w >>= 1; block_h >>= 1;
    vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
                  dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
                  &uvmv, x_off + bx_off, y_off + by_off,
                  block_w, block_h, width, height, s->uvlinesize,
                  s->put_pixels_tab[1 + (block_w == 4)]);
}

/* Fetch pixels for estimated mv 4 macroblocks ahead.
 * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
{
    /* Don't prefetch refs that haven't been used very often this frame. */
    if (s->ref_count[ref-1] > (mb_xy >> 5)) {
        int x_off = mb_x << 4, y_off = mb_y << 4;
        int mx = (mb->mv.x>>2) + x_off + 8;
        int my = (mb->mv.y>>2) + y_off;
        uint8_t **src= s->framep[ref]->data;
        int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
        /* For threading, a ff_thread_await_progress here might be useful, but
         * it actually slows down the decoder. Since a bad prefetch doesn't
         * generate bad decoder output, we don't run it here. */
        s->dsp.prefetch(src[0]+off, s->linesize, 4);
        off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
        s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
    }
}

/**
 * Apply motion vectors to prediction buffer, chapter 18.
 */
static av_always_inline
void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                   VP8Macroblock *mb, int mb_x, int mb_y)
{
    int x_off = mb_x << 4, y_off = mb_y << 4;
    int width = 16*s->mb_width, height = 16*s->mb_height;
    AVFrame *ref = s->framep[mb->ref_frame];
    VP56mv *bmv = mb->bmv;

    switch (mb->partitioning) {
    case VP8_SPLITMVMODE_NONE:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 16, 16, width, height, &mb->mv);
        break;
    case VP8_SPLITMVMODE_4x4: {
        int x, y;
        VP56mv uvmv;

        /* Y */
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                vp8_mc_luma(s, td, dst[0] + 4*y*s->linesize + x*4,
                            ref, &bmv[4*y + x],
                            4*x + x_off, 4*y + y_off, 4, 4,
                            width, height, s->linesize,
                            s->put_pixels_tab[2]);
            }
        }

        /* U/V */
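        /* The chroma MV for each 8x8 area is the average of the four luma
         * MVs; the +2 and sign terms below make the >>2 round half away
         * from zero instead of plain truncation toward minus infinity. */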
        x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
        for (y = 0; y < 2; y++) {
            for (x = 0; x < 2; x++) {
                uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
                         mb->bmv[ 2*y    * 4 + 2*x+1].x +
                         mb->bmv[(2*y+1) * 4 + 2*x  ].x +
                         mb->bmv[(2*y+1) * 4 + 2*x+1].x;
                uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
                         mb->bmv[ 2*y    * 4 + 2*x+1].y +
                         mb->bmv[(2*y+1) * 4 + 2*x  ].y +
                         mb->bmv[(2*y+1) * 4 + 2*x+1].y;
                uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
                uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
                if (s->profile == 3) {
                    uvmv.x &= ~7;
                    uvmv.y &= ~7;
                }
                vp8_mc_chroma(s, td, dst[1] + 4*y*s->uvlinesize + x*4,
                              dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
                              4*x + x_off, 4*y + y_off, 4, 4,
                              width, height, s->uvlinesize,
                              s->put_pixels_tab[2]);
            }
        }
        break;
    }
    case VP8_SPLITMVMODE_16x8:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 16, 8, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 8, 16, 8, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x16:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 8, 16, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 0, 8, 16, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x8:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 8, 8, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 0, 8, 8, width, height, &bmv[1]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 8, 8, 8, width, height, &bmv[2]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 8, 8, 8, width, height, &bmv[3]);
        break;
    }
}

static av_always_inline void idct_mb(VP8Context *s, VP8ThreadData *td,
                                     uint8_t *dst[3], VP8Macroblock *mb)
{
    int x, y, ch;

    if (mb->mode != MODE_I4x4) {
        uint8_t *y_dst = dst[0];
        for (y = 0; y < 4; y++) {
            uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
            if (nnz4) {
                if (nnz4&~0x01010101) {
                    for (x = 0; x < 4; x++) {
                        if ((uint8_t)nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, td->block[y][x], s->linesize);
                        else if((uint8_t)nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(y_dst+4*x, td->block[y][x], s->linesize);
                        nnz4 >>= 8;
                        if (!nnz4)
                            break;
                    }
                } else {
                    s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
                }
            }
            y_dst += 4*s->linesize;
        }
    }

    for (ch = 0; ch < 2; ch++) {
        uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4+ch]);
        if (nnz4) {
            uint8_t *ch_dst = dst[1+ch];
            if (nnz4&~0x01010101) {
                for (y = 0; y < 2; y++) {
                    for (x = 0; x < 2; x++) {
                        if ((uint8_t)nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
                        else if((uint8_t)nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
                        nnz4 >>= 8;
                        if (!nnz4)
                            goto chroma_idct_end;
                    }
                    ch_dst += 4*s->uvlinesize;
                }
            } else {
                s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4+ch], s->uvlinesize);
            }
        }
chroma_idct_end: ;
    }
}

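/* Per-MB loop filter strength: the base level comes from the segment (or the
 * frame header), is adjusted by the per-reference-frame and per-mode deltas,
 * and clipped to 0..63; the interior limit is then reduced according to the
 * sharpness setting. */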
static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f)
{
    int interior_limit, filter_level;

    if (s->segmentation.enabled) {
        filter_level = s->segmentation.filter_level[mb->segment];
        if (!s->segmentation.absolute_vals)
            filter_level += s->filter.level;
    } else
        filter_level = s->filter.level;

    if (s->lf_delta.enabled) {
        filter_level += s->lf_delta.ref[mb->ref_frame];
        filter_level += s->lf_delta.mode[mb->mode];
    }

    filter_level = av_clip_uintp2(filter_level, 6);

    interior_limit = filter_level;
    if (s->filter.sharpness) {
        interior_limit >>= (s->filter.sharpness + 3) >> 2;
        interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
    }
    interior_limit = FFMAX(interior_limit, 1);

    f->filter_level = filter_level;
    f->inner_limit = interior_limit;
    f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
}

static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
{
    int mbedge_lim, bedge_lim, hev_thresh;
    int filter_level = f->filter_level;
    int inner_limit = f->inner_limit;
    int inner_filter = f->inner_filter;
    int linesize = s->linesize;
    int uvlinesize = s->uvlinesize;
    static const uint8_t hev_thresh_lut[2][64] = {
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
          3, 3, 3, 3 },
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2 }
    };

    if (!filter_level)
        return;

    bedge_lim  = 2*filter_level + inner_limit;
    mbedge_lim = bedge_lim + 4;

    hev_thresh = hev_thresh_lut[s->keyframe][filter_level];

    if (mb_x) {
        s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

    if (inner_filter) {
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
                                             uvlinesize, bedge_lim,
                                             inner_limit, hev_thresh);
    }

    if (mb_y) {
        s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

    if (inner_filter) {
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
                                             linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
                                             linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
                                             linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
                                             dst[2] + 4 * uvlinesize,
                                             uvlinesize, bedge_lim,
                                             inner_limit, hev_thresh);
    }
}

static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
{
    int mbedge_lim, bedge_lim;
    int filter_level = f->filter_level;
    int inner_limit = f->inner_limit;
    int inner_filter = f->inner_filter;
    int linesize = s->linesize;

    if (!filter_level)
        return;

    bedge_lim  = 2*filter_level + inner_limit;
    mbedge_lim = bedge_lim + 4;

    if (mb_x)
        s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
    if (inner_filter) {
        s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
        s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
        s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
    }

    if (mb_y)
        s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
    if (inner_filter) {
        s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
        s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
        s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
    }
}

static void release_queued_segmaps(VP8Context *s, int is_close)
{
    int leave_behind = is_close ? 0 : !s->maps_are_invalid;
    while (s->num_maps_to_be_freed > leave_behind)
        av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]);
    s->maps_are_invalid = 0;
}

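/* MVs are stored in quarter-pel units, so MARGIN is one macroblock (16
 * pixels) expressed in qpel; mv_min/mv_max clamp vectors to the frame
 * extended by that margin. */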
#define MARGIN (16 << 2)
static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, AVFrame *curframe,
                                   AVFrame *prev_frame)
{
    VP8Context *s = avctx->priv_data;
    int mb_x, mb_y;

    s->mv_min.y = -MARGIN;
    s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
    for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
        VP8Macroblock *mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
        int mb_xy = mb_y*s->mb_width;

        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);

        s->mv_min.x = -MARGIN;
        s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
        for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
            if (mb_y == 0)
                AV_WN32A((mb-s->mb_width-1)->intra4x4_pred_mode_top, DC_PRED*0x01010101);
            decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
                           prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL, 1);
            s->mv_min.x -= 64;
            s->mv_max.x -= 64;
        }
        s->mv_min.y -= 64;
        s->mv_max.y -= 64;
    }
}

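/* Sliced-threading synchronization: each thread publishes its progress as a
 * packed (mb_y << 16) | mb_x position in thread_mb_pos, and waits on the
 * neighbouring row's thread via a condition variable whenever it gets ahead
 * of the data it depends on. */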
1630 #if HAVE_THREADS
1631 #define check_thread_pos(td, otd, mb_x_check, mb_y_check)\
1632 do {\
1633 int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF);\
1634 if (otd->thread_mb_pos < tmp) {\
1635 pthread_mutex_lock(&otd->lock);\
1636 td->wait_mb_pos = tmp;\
1637 do {\
1638 if (otd->thread_mb_pos >= tmp)\
1639 break;\
1640 pthread_cond_wait(&otd->cond, &otd->lock);\
1641 } while (1);\
1642 td->wait_mb_pos = INT_MAX;\
1643 pthread_mutex_unlock(&otd->lock);\
1644 }\
1645 } while(0);
1646
1647 #define update_pos(td, mb_y, mb_x)\
1648 do {\
1649 int pos = (mb_y << 16) | (mb_x & 0xFFFF);\
1650 int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && (num_jobs > 1);\
1651 int is_null = (next_td == NULL) || (prev_td == NULL);\
1652 int pos_check = (is_null) ? 1 :\
1653 (next_td != td && pos >= next_td->wait_mb_pos) ||\
1654 (prev_td != td && pos >= prev_td->wait_mb_pos);\
1655 td->thread_mb_pos = pos;\
1656 if (sliced_threading && pos_check) {\
1657 pthread_mutex_lock(&td->lock);\
1658 pthread_cond_broadcast(&td->cond);\
1659 pthread_mutex_unlock(&td->lock);\
1660 }\
1661 } while(0);
1662 #else
1663 #define check_thread_pos(td, otd, mb_x_check, mb_y_check)
1664 #define update_pos(td, mb_y, mb_x)
1665 #endif
1666
1667 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
1668 int jobnr, int threadnr)
1669 {
1670 VP8Context *s = avctx->priv_data;
1671 VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
1672 int mb_y = td->thread_mb_pos>>16;
1673 int i, y, mb_x, mb_xy = mb_y*s->mb_width;
1674 int num_jobs = s->num_jobs;
1675 AVFrame *curframe = s->curframe, *prev_frame = s->prev_frame;
1676 VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1677 VP8Macroblock *mb;
1678 uint8_t *dst[3] = {
1679 curframe->data[0] + 16*mb_y*s->linesize,
1680 curframe->data[1] + 8*mb_y*s->uvlinesize,
1681 curframe->data[2] + 8*mb_y*s->uvlinesize
1682 };
1683 if (mb_y == 0) prev_td = td;
1684 else prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1685 if (mb_y == s->mb_height-1) next_td = td;
1686 else next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1687 if (s->mb_layout == 1)
1688 mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1689 else {
1690 mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1691 memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
1692 AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1693 }
1694
1695 memset(td->left_nnz, 0, sizeof(td->left_nnz));
1696 // left edge of 129 for intra prediction
1697 if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1698 for (i = 0; i < 3; i++)
1699 for (y = 0; y < 16>>!!i; y++) // 16 rows for luma (i == 0), 8 for chroma
1700 dst[i][y*curframe->linesize[i]-1] = 129;
1701 if (mb_y == 1) {
1702 s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1703 }
1704 }
1705
1706 s->mv_min.x = -MARGIN;
1707 s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
1708
1709 for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1710 // Sync with the thread decoding the row above: wait until it has
// finished MB mb_x+1 of row mb_y-1. Thread 0 instead waits on that row's
// loop-filter pass, whose progress is reported with an offset of
// s->mb_width+3.
1711 if (prev_td != td) {
1712 if (threadnr != 0) {
1713 check_thread_pos(td, prev_td, mb_x+1, mb_y-1);
1714 } else {
1715 check_thread_pos(td, prev_td, (s->mb_width+3) + (mb_x+1), mb_y-1);
1716 }
1717 }
1718
1719 s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1720 s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1721
1722 if (!s->mb_layout)
1723 decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
1724 prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL, 0);
1725
1726 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1727
1728 if (!mb->skip)
1729 decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz);
1730
1731 if (mb->mode <= MODE_I4x4)
1732 intra_predict(s, td, dst, mb, mb_x, mb_y);
1733 else
1734 inter_predict(s, td, dst, mb, mb_x, mb_y);
1735
1736 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1737
1738 if (!mb->skip) {
1739 idct_mb(s, td, dst, mb);
1740 } else {
1741 AV_ZERO64(td->left_nnz);
1742 AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned
1743
1744 // Reset the DC (Y2) block predictors that would be present had this MB
// carried coefficients; all modes except i4x4 and split MV use a separate
// luma DC block.
1745 if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1746 td->left_nnz[8] = 0;
1747 s->top_nnz[mb_x][8] = 0;
1748 }
1749 }
1750
1751 if (s->deblock_filter)
1752 filter_level_for_mb(s, mb, &td->filter_strength[mb_x]);
1753
1754 if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs-1) {
1755 if (s->filter.simple)
1756 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1757 else
1758 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1759 }
1760
1761 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1762
1763 dst[0] += 16;
1764 dst[1] += 8;
1765 dst[2] += 8;
1766 s->mv_min.x -= 64;
1767 s->mv_max.x -= 64;
1768
// publish this MB's position so neighbouring rows can proceed
1772 update_pos(td, mb_y, mb_x);
1774 }
1775 }
1776
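/* Apply the in-loop deblocking filter to one macroblock row, using the
 * per-MB filter strengths recorded by the decode pass. */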
1777 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
1778 int jobnr, int threadnr)
1779 {
1780 VP8Context *s = avctx->priv_data;
1781 VP8ThreadData *td = &s->thread_data[threadnr];
1782 int mb_x, mb_y = td->thread_mb_pos>>16, num_jobs = s->num_jobs;
1783 AVFrame *curframe = s->curframe;
1784 VP8Macroblock *mb;
1785 VP8ThreadData *prev_td, *next_td;
1786 uint8_t *dst[3] = {
1787 curframe->data[0] + 16*mb_y*s->linesize,
1788 curframe->data[1] + 8*mb_y*s->uvlinesize,
1789 curframe->data[2] + 8*mb_y*s->uvlinesize
1790 };
1791
1792 if (s->mb_layout == 1)
1793 mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1794 else
1795 mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1796
1797 if (mb_y == 0) prev_td = td;
1798 else prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1799 if (mb_y == s->mb_height-1) next_td = td;
1800 else next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1801
1802 for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
1803 VP8FilterStrength *f = &td->filter_strength[mb_x];
1804 if (prev_td != td) {
1805 check_thread_pos(td, prev_td, (mb_x+1) + (s->mb_width+3), mb_y-1);
1806 }
1807 if (next_td != td && next_td != &s->thread_data[0]) {
1809 check_thread_pos(td, next_td, mb_x+1, mb_y+1);
1810 }
1811
1812 if (num_jobs == 1) {
1813 if (s->filter.simple)
1814 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1815 else
1816 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1817 }
1818
1819 if (s->filter.simple)
1820 filter_mb_simple(s, dst[0], f, mb_x, mb_y);
1821 else
1822 filter_mb(s, dst, f, mb_x, mb_y);
1823 dst[0] += 16;
1824 dst[1] += 8;
1825 dst[2] += 8;
1826
1827 update_pos(td, mb_y, (s->mb_width+3) + mb_x);
1828 }
1829 }
1830
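/* Worker for avctx->execute2(): job n handles MB rows n, n + num_jobs,
 * n + 2*num_jobs, ... With num_jobs == 2, for instance, job 0 decodes the
 * even rows and job 1 the odd rows, with check_thread_pos()/update_pos()
 * ensuring a row never runs ahead of the row above it. */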
1831 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
1832 int jobnr, int threadnr)
1833 {
1834 VP8Context *s = avctx->priv_data;
1835 VP8ThreadData *td = &s->thread_data[jobnr];
1836 VP8ThreadData *next_td = NULL, *prev_td = NULL;
1837 AVFrame *curframe = s->curframe;
1838 int mb_y, num_jobs = s->num_jobs;
1839 td->thread_nr = threadnr;
1840 for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
1842 td->thread_mb_pos = mb_y<<16;
1843 vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
1844 if (s->deblock_filter)
1845 vp8_filter_mb_row(avctx, tdata, jobnr, threadnr);
1846 update_pos(td, mb_y, INT_MAX & 0xFFFF);
1847
1848 s->mv_min.y -= 64;
1849 s->mv_max.y -= 64;
1850
1851 if (avctx->active_thread_type == FF_THREAD_FRAME)
1852 ff_thread_report_progress(curframe, mb_y, 0);
1853 }
1854
1855 return 0;
1856 }
1857
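/* Main decoding entry point: parse the frame header, pick buffers for the
 * current/previous/golden/altref frames, run the (possibly threaded) decode
 * and filter passes, and return the finished frame unless it is marked
 * invisible. */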
1858 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
1859 AVPacket *avpkt)
1860 {
1861 VP8Context *s = avctx->priv_data;
1862 int ret, i, referenced, num_jobs;
1863 enum AVDiscard skip_thresh;
1864 AVFrame *av_uninit(curframe), *prev_frame;
1865
1866 release_queued_segmaps(s, 0);
1867
1868 if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1869 goto err;
1870
1871 prev_frame = s->framep[VP56_FRAME_CURRENT];
1872
1873 referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1874 || s->update_altref == VP56_FRAME_CURRENT;
1875
1876 skip_thresh = !referenced ? AVDISCARD_NONREF :
1877 !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1878
1879 if (avctx->skip_frame >= skip_thresh) {
1880 s->invisible = 1;
1881 memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
1882 goto skip_decode;
1883 }
1884 s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1885
1886 // release no longer referenced frames
1887 for (i = 0; i < 5; i++)
1888 if (s->frames[i].data[0] &&
1889 &s->frames[i] != prev_frame &&
1890 &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1891 &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1892 &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1893 vp8_release_frame(s, &s->frames[i], 1, 0);
1894
1895 // find a free buffer
1896 for (i = 0; i < 5; i++)
1897 if (&s->frames[i] != prev_frame &&
1898 &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1899 &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1900 &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1901 curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1902 break;
1903 }
1904 if (i == 5) {
1905 av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
1906 abort();
1907 }
1908 if (curframe->data[0])
1909 vp8_release_frame(s, curframe, 1, 0);
1910
1911 // The arithmetic-coder probabilities are updated every frame, so on a
1912 // random inter frame they are junk unless decoding started on a keyframe.
1913 // Output nothing rather than garbage.
1914 if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1915 !s->framep[VP56_FRAME_GOLDEN] ||
1916 !s->framep[VP56_FRAME_GOLDEN2])) {
1917 av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1918 ret = AVERROR_INVALIDDATA;
1919 goto err;
1920 }
1921
1922 curframe->key_frame = s->keyframe;
1923 curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1924 curframe->reference = referenced ? 3 : 0;
1925 if ((ret = vp8_alloc_frame(s, curframe))) {
1926 av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
1927 goto err;
1928 }
1929
1930 // choose the next golden/altref frames: each may be replaced by the
// current frame, swapped with the other, or left unchanged
1931 if (s->update_altref != VP56_FRAME_NONE) {
1932 s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
1933 } else {
1934 s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
1935 }
1936 if (s->update_golden != VP56_FRAME_NONE) {
1937 s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
1938 } else {
1939 s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
1940 }
1941 if (s->update_last) {
1942 s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
1943 } else {
1944 s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
1945 }
1946 s->next_framep[VP56_FRAME_CURRENT] = curframe;
1947
1948 ff_thread_finish_setup(avctx);
1949
1950 s->linesize = curframe->linesize[0];
1951 s->uvlinesize = curframe->linesize[1];
1952
1953 if (!s->thread_data[0].edge_emu_buffer)
1954 for (i = 0; i < MAX_THREADS; i++)
1955 if (!(s->thread_data[i].edge_emu_buffer = av_malloc(21*s->linesize))) {
ret = AVERROR(ENOMEM); // bail out on allocation failure
goto err;
}
1956
1957 memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1958 /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1959 if (!s->mb_layout)
1960 memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1961 if (!s->mb_layout && s->keyframe)
1962 memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1963
1964 // top edge of 127 for intra prediction
1965 if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1966 s->top_border[0][15] = s->top_border[0][23] = 127;
1967 s->top_border[0][31] = 127;
1968 memset(s->top_border[1], 127, s->mb_width*sizeof(*s->top_border));
1969 }
1970 memset(s->ref_count, 0, sizeof(s->ref_count));
1971
1973 // Make sure the previous frame has read its segmentation map,
1974 // if we re-use the same map.
1975 if (prev_frame && s->segmentation.enabled && !s->segmentation.update_map)
1976 ff_thread_await_progress(prev_frame, 1, 0);
1977
1978 if (s->mb_layout == 1)
1979 vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
1980
1981 if (avctx->active_thread_type == FF_THREAD_FRAME)
1982 num_jobs = 1;
1983 else
1984 num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
1985 s->num_jobs = num_jobs;
1986 s->curframe = curframe;
1987 s->prev_frame = prev_frame;
1988 s->mv_min.y = -MARGIN;
1989 s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1990 for (i = 0; i < MAX_THREADS; i++) {
1991 s->thread_data[i].thread_mb_pos = 0;
1992 s->thread_data[i].wait_mb_pos = INT_MAX;
1993 }
1994 avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL, num_jobs);
1995
1996 ff_thread_report_progress(curframe, INT_MAX, 0);
1997 memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
1998
1999 skip_decode:
2000 // if future frames don't use the updated probabilities,
2001 // reset them to the values we saved
2002 if (!s->update_probabilities)
2003 s->prob[0] = s->prob[1];
2004
2005 if (!s->invisible) {
2006 *(AVFrame*)data = *curframe;
2007 *data_size = sizeof(AVFrame);
2008 }
2009
2010 return avpkt->size;
2011 err:
2012 memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2013 return ret;
2014 }
2015
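/* One-time decoder setup; VP8 output is always 8-bit 4:2:0. */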
2016 static av_cold int vp8_decode_init(AVCodecContext *avctx)
2017 {
2018 VP8Context *s = avctx->priv_data;
2019
2020 s->avctx = avctx;
2021 avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2022
2023 ff_dsputil_init(&s->dsp, avctx);
2024 ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2025 ff_vp8dsp_init(&s->vp8dsp);
2026
2027 return 0;
2028 }
2029
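/* Codec close: free all frames, per-thread buffers and any queued
 * segmentation maps. */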
2030 static av_cold int vp8_decode_free(AVCodecContext *avctx)
2031 {
2032 vp8_decode_flush_impl(avctx, 0, 1, 1);
2033 release_queued_segmaps(avctx->priv_data, 1);
2034 return 0;
2035 }
2036
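/* Frame-threading copy constructor: the copy only needs its own context
 * back-pointer here; the remaining state arrives later through
 * vp8_decode_update_thread_context(). */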
2037 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2038 {
2039 VP8Context *s = avctx->priv_data;
2040
2041 s->avctx = avctx;
2042
2043 return 0;
2044 }
2045
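/* Translate a frame pointer from the source thread's context into the
 * corresponding entry of this context's frames[] array. */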
2046 #define REBASE(pic) \
2047 pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
2048
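/* Frame threading: adopt the source thread's state (coder probabilities,
 * segmentation, loop-filter deltas, sign biases, frame pointers), freeing
 * our per-thread buffers first if the frame size has changed. */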
2049 static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
2050 {
2051 VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2052
2053 if (s->macroblocks_base &&
2054 (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2055 free_buffers(s);
2056 s->maps_are_invalid = 1;
2057 s->mb_width = s_src->mb_width;
2058 s->mb_height = s_src->mb_height;
2059 }
2060
2061 s->prob[0] = s_src->prob[!s_src->update_probabilities];
2062 s->segmentation = s_src->segmentation;
2063 s->lf_delta = s_src->lf_delta;
2064 memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2065
2066 memcpy(&s->frames, &s_src->frames, sizeof(s->frames));
2067 s->framep[0] = REBASE(s_src->next_framep[0]);
2068 s->framep[1] = REBASE(s_src->next_framep[1]);
2069 s->framep[2] = REBASE(s_src->next_framep[2]);
2070 s->framep[3] = REBASE(s_src->next_framep[3]);
2071
2072 return 0;
2073 }
2074
2075 AVCodec ff_vp8_decoder = {
2076 .name = "vp8",
2077 .type = AVMEDIA_TYPE_VIDEO,
2078 .id = AV_CODEC_ID_VP8,
2079 .priv_data_size = sizeof(VP8Context),
2080 .init = vp8_decode_init,
2081 .close = vp8_decode_free,
2082 .decode = vp8_decode_frame,
2083 .capabilities = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
2084 .flush = vp8_decode_flush,
2085 .long_name = NULL_IF_CONFIG_SMALL("On2 VP8"),
2086 .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2087 .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2088 };