vp8: implement sliced threading
[libav.git] / libavcodec / vp8.c
/*
 * VP8 compatible video decoder
 *
 * Copyright (C) 2010 David Conrad
 * Copyright (C) 2010 Ronald S. Bultje
 * Copyright (C) 2010 Jason Garrett-Glaser
 * Copyright (C) 2012 Daniel Kang
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/imgutils.h"
#include "avcodec.h"
#include "internal.h"
#include "vp8.h"
#include "vp8data.h"
#include "rectangle.h"
#include "thread.h"

#if HAVE_PTHREADS
#include <pthread.h>
#elif HAVE_W32THREADS
#include "w32pthreads.h"
#endif

#if ARCH_ARM
#   include "arm/vp8.h"
#endif

static void free_buffers(VP8Context *s)
{
    int i;
    if (s->thread_data)
        for (i = 0; i < MAX_THREADS; i++) {
            av_freep(&s->thread_data[i].filter_strength);
            av_freep(&s->thread_data[i].edge_emu_buffer);
        }
    av_freep(&s->thread_data);
    av_freep(&s->macroblocks_base);
    av_freep(&s->intra4x4_pred_mode_top);
    av_freep(&s->top_nnz);
    av_freep(&s->top_border);

    s->macroblocks = NULL;
}

static int vp8_alloc_frame(VP8Context *s, AVFrame *f)
{
    int ret;
    if ((ret = ff_thread_get_buffer(s->avctx, f)) < 0)
        return ret;
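    /* ref_index[0] is repurposed to hold the per-MB segmentation map; reuse a
     * queued map from a previously released frame when possible instead of
     * allocating a fresh one. */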
    if (s->num_maps_to_be_freed && !s->maps_are_invalid) {
        f->ref_index[0] = s->segmentation_maps[--s->num_maps_to_be_freed];
    } else if (!(f->ref_index[0] = av_mallocz(s->mb_width * s->mb_height))) {
        ff_thread_release_buffer(s->avctx, f);
        return AVERROR(ENOMEM);
    }
    return 0;
}

static void vp8_release_frame(VP8Context *s, AVFrame *f, int prefer_delayed_free, int can_direct_free)
{
    if (f->ref_index[0]) {
        if (prefer_delayed_free) {
            /* Upon a size change, we want to free the maps but other threads may still
             * be using them, so queue them. Upon a seek, all threads are inactive so
             * we want to cache one to prevent re-allocation in the next decoding
             * iteration, but the rest we can free directly. */
            int max_queued_maps = can_direct_free ? 1 : FF_ARRAY_ELEMS(s->segmentation_maps);
            if (s->num_maps_to_be_freed < max_queued_maps) {
                s->segmentation_maps[s->num_maps_to_be_freed++] = f->ref_index[0];
            } else if (can_direct_free) /* vp8_decode_flush(), but our queue is full */ {
                av_free(f->ref_index[0]);
            } /* else: MEMLEAK (should never happen, but better that than crash) */
            f->ref_index[0] = NULL;
        } else /* vp8_decode_free() */ {
            av_free(f->ref_index[0]);
        }
    }
    ff_thread_release_buffer(s->avctx, f);
}

static void vp8_decode_flush_impl(AVCodecContext *avctx,
                                  int prefer_delayed_free, int can_direct_free, int free_mem)
{
    VP8Context *s = avctx->priv_data;
    int i;

    if (!avctx->internal->is_copy) {
        for (i = 0; i < 5; i++)
            if (s->frames[i].data[0])
                vp8_release_frame(s, &s->frames[i], prefer_delayed_free, can_direct_free);
    }
    memset(s->framep, 0, sizeof(s->framep));

    if (free_mem) {
        free_buffers(s);
        s->maps_are_invalid = 1;
    }
}

static void vp8_decode_flush(AVCodecContext *avctx)
{
    vp8_decode_flush_impl(avctx, 1, 1, 0);
}

static int update_dimensions(VP8Context *s, int width, int height)
{
    AVCodecContext *avctx = s->avctx;
    int i;

    if (width  != s->avctx->width ||
        height != s->avctx->height) {
        if (av_image_check_size(width, height, 0, s->avctx))
            return AVERROR_INVALIDDATA;

        vp8_decode_flush_impl(s->avctx, 1, 0, 1);

        avcodec_set_dimensions(s->avctx, width, height);
    }

    s->mb_width  = (s->avctx->coded_width  + 15) / 16;
    s->mb_height = (s->avctx->coded_height + 15) / 16;

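    /* Two macroblock layouts are used: with a single decoding thread (or frame
     * threading) only a small rolling cache of macroblocks is kept, while
     * sliced threading needs the whole frame's macroblocks at once so that
     * several rows can be decoded by different threads concurrently. */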
    s->mb_layout = (avctx->active_thread_type == FF_THREAD_SLICE) &&
                   (FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1);
    if (!s->mb_layout) { // Frame threading and one thread
        s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height*2 + 1) * sizeof(*s->macroblocks));
        s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
    } else // Sliced threading
        s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) * sizeof(*s->macroblocks));
    s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
    s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
    s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));

    for (i = 0; i < MAX_THREADS; i++) {
        s->thread_data[i].filter_strength = av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
        pthread_mutex_init(&s->thread_data[i].lock, NULL);
        pthread_cond_init(&s->thread_data[i].cond, NULL);
    }

    if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
        (!s->intra4x4_pred_mode_top && !s->mb_layout))
        return AVERROR(ENOMEM);

    s->macroblocks = s->macroblocks_base + 1;

    return 0;
}

static void parse_segment_info(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i;

    s->segmentation.update_map = vp8_rac_get(c);

    if (vp8_rac_get(c)) { // update segment feature data
        s->segmentation.absolute_vals = vp8_rac_get(c);

        for (i = 0; i < 4; i++)
            s->segmentation.base_quant[i] = vp8_rac_get_sint(c, 7);

        for (i = 0; i < 4; i++)
            s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
    }
    if (s->segmentation.update_map)
        for (i = 0; i < 3; i++)
            s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
}

static void update_lf_deltas(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i;

    for (i = 0; i < 4; i++) {
        if (vp8_rac_get(c)) {
            s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);

            if (vp8_rac_get(c))
                s->lf_delta.ref[i] = -s->lf_delta.ref[i];
        }
    }

    for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
        if (vp8_rac_get(c)) {
            s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);

            if (vp8_rac_get(c))
                s->lf_delta.mode[i] = -s->lf_delta.mode[i];
        }
    }
}

static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
{
    const uint8_t *sizes = buf;
    int i;

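    /* The token data is split into 1, 2, 4 or 8 partitions; the sizes of all
     * but the last are stored as 24-bit little-endian values in front of the
     * partition data, and the last partition takes whatever remains. */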
    s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);

    buf      += 3 * (s->num_coeff_partitions - 1);
    buf_size -= 3 * (s->num_coeff_partitions - 1);
    if (buf_size < 0)
        return -1;

    for (i = 0; i < s->num_coeff_partitions - 1; i++) {
        int size = AV_RL24(sizes + 3 * i);
        if (buf_size - size < 0)
            return -1;

        ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
        buf      += size;
        buf_size -= size;
    }
    ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);

    return 0;
}

static void get_quants(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i, base_qi;

    int yac_qi     = vp8_rac_get_uint(c, 7);
    int ydc_delta  = vp8_rac_get_sint(c, 4);
    int y2dc_delta = vp8_rac_get_sint(c, 4);
    int y2ac_delta = vp8_rac_get_sint(c, 4);
    int uvdc_delta = vp8_rac_get_sint(c, 4);
    int uvac_delta = vp8_rac_get_sint(c, 4);

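    /* The per-segment quantizer index is the AC luma index plus per-plane
     * deltas, clipped to 0..127 and mapped through the spec's DC/AC lookup
     * tables. The Y2 (Walsh-Hadamard DC) factors get the extra scaling the
     * spec mandates: x2 for DC, x155/100 (minimum 8) for AC. */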
    for (i = 0; i < 4; i++) {
        if (s->segmentation.enabled) {
            base_qi = s->segmentation.base_quant[i];
            if (!s->segmentation.absolute_vals)
                base_qi += yac_qi;
        } else
            base_qi = yac_qi;

        s->qmat[i].luma_qmul[0]    =       vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta,  7)];
        s->qmat[i].luma_qmul[1]    =       vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
        s->qmat[i].luma_dc_qmul[0] =   2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
        s->qmat[i].luma_dc_qmul[1] = 155 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] / 100;
        s->qmat[i].chroma_qmul[0]  =       vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
        s->qmat[i].chroma_qmul[1]  =       vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];

        s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
        s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
    }
}

/**
 * Determine which buffers golden and altref should be updated with after this frame.
 * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 *
 * Intra frames update all 3 references
 * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 * If the update (golden|altref) flag is set, it's updated with the current frame
 *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 * If the flag is not set, the number read means:
 *      0: no update
 *      1: VP56_FRAME_PREVIOUS
 *      2: update golden with altref, or update altref with golden
 */
static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
{
    VP56RangeCoder *c = &s->c;

    if (update)
        return VP56_FRAME_CURRENT;

    switch (vp8_rac_get_uint(c, 2)) {
    case 1:
        return VP56_FRAME_PREVIOUS;
    case 2:
        return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
    }
    return VP56_FRAME_NONE;
}

static void update_refs(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;

    int update_golden = vp8_rac_get(c);
    int update_altref = vp8_rac_get(c);

    s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
    s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
}

static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VP56RangeCoder *c = &s->c;
    int header_size, hscale, vscale, i, j, k, l, m, ret;
    int width  = s->avctx->width;
    int height = s->avctx->height;

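    /* 3-byte frame tag: bit 0 is the (inverted) keyframe flag, bits 1-3 the
     * profile, bit 4 the show_frame flag and bits 5-23 the size of the first
     * (header) partition. */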
    s->keyframe  = !(buf[0] & 1);
    s->profile   = (buf[0]>>1) & 7;
    s->invisible = !(buf[0] & 0x10);
    header_size  = AV_RL24(buf) >> 5;
    buf      += 3;
    buf_size -= 3;

    if (s->profile > 3)
        av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);

    if (!s->profile)
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
    else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));

    if (header_size > buf_size - 7*s->keyframe) {
        av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
        return AVERROR_INVALIDDATA;
    }

    if (s->keyframe) {
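        /* Keyframes carry an extra 7 bytes: the start code 0x9d 0x01 0x2a
         * (hence the little-endian compare against 0x2a019d) followed by
         * 14-bit width/height, each with a 2-bit upscaling factor. */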
        if (AV_RL24(buf) != 0x2a019d) {
            av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
            return AVERROR_INVALIDDATA;
        }
        width  = AV_RL16(buf+3) & 0x3fff;
        height = AV_RL16(buf+5) & 0x3fff;
        hscale = buf[4] >> 6;
        vscale = buf[6] >> 6;
        buf      += 7;
        buf_size -= 7;

        if (hscale || vscale)
            av_log_missing_feature(s->avctx, "Upscaling", 1);

        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
        for (i = 0; i < 4; i++)
            for (j = 0; j < 16; j++)
                memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
                       sizeof(s->prob->token[i][j]));
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c,  vp8_pred8x8c_prob_inter,  sizeof(s->prob->pred8x8c));
        memcpy(s->prob->mvc,       vp8_mv_default_prob,      sizeof(s->prob->mvc));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
    }

    ff_vp56_init_range_decoder(c, buf, header_size);
    buf      += header_size;
    buf_size -= header_size;

    if (s->keyframe) {
        if (vp8_rac_get(c))
            av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
        vp8_rac_get(c); // whether we can skip clamping in dsp functions
    }

    if ((s->segmentation.enabled = vp8_rac_get(c)))
        parse_segment_info(s);
    else
        s->segmentation.update_map = 0; // FIXME: move this to some init function?

    s->filter.simple    = vp8_rac_get(c);
    s->filter.level     = vp8_rac_get_uint(c, 6);
    s->filter.sharpness = vp8_rac_get_uint(c, 3);

    if ((s->lf_delta.enabled = vp8_rac_get(c)))
        if (vp8_rac_get(c))
            update_lf_deltas(s);

    if (setup_partitions(s, buf, buf_size)) {
        av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
        return AVERROR_INVALIDDATA;
    }

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height) {
        if ((ret = update_dimensions(s, width, height)) < 0)
            return ret;
    }

    get_quants(s);

    if (!s->keyframe) {
        update_refs(s);
        s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
        s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
    }

    // if we aren't saving this frame's probabilities for future frames,
    // make a copy of the current probabilities
    if (!(s->update_probabilities = vp8_rac_get(c)))
        s->prob[1] = s->prob[0];

    s->update_last = s->keyframe || vp8_rac_get(c);

    for (i = 0; i < 4; i++)
        for (j = 0; j < 8; j++)
            for (k = 0; k < 3; k++)
                for (l = 0; l < NUM_DCT_TOKENS-1; l++)
                    if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
                        int prob = vp8_rac_get_uint(c, 8);
                        for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
                            s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
                    }

    if ((s->mbskip_enabled = vp8_rac_get(c)))
        s->prob->mbskip = vp8_rac_get_uint(c, 8);

    if (!s->keyframe) {
        s->prob->intra  = vp8_rac_get_uint(c, 8);
        s->prob->last   = vp8_rac_get_uint(c, 8);
        s->prob->golden = vp8_rac_get_uint(c, 8);

        if (vp8_rac_get(c))
            for (i = 0; i < 4; i++)
                s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
        if (vp8_rac_get(c))
            for (i = 0; i < 3; i++)
                s->prob->pred8x8c[i] = vp8_rac_get_uint(c, 8);

        // 17.2 MV probability update
        for (i = 0; i < 2; i++)
            for (j = 0; j < 19; j++)
                if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
                    s->prob->mvc[i][j] = vp8_rac_get_nn(c);
    }

    return 0;
}

static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
{
    dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
    dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
}

/**
 * Motion vector coding, 17.1.
 */
static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
{
    int bit, x = 0;

    if (vp56_rac_get_prob_branchy(c, p[0])) {
        int i;

        for (i = 0; i < 3; i++)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        for (i = 9; i > 3; i--)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
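        /* Magnitudes 0-7 would have used the small tree below, so if no bit
         * above bit 3 is set, bit 3 is implicitly 1; otherwise it is coded. */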
        if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
            x += 8;
    } else {
        // small_mvtree
        const uint8_t *ps = p+2;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + 3*bit;
        x  += 4*bit;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + bit;
        x  += 2*bit;
        x  += vp56_rac_get_prob(c, *ps);
    }

    return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
}

static av_always_inline
const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
{
    if (left == top)
        return vp8_submv_prob[4-!!left];
    if (!top)
        return vp8_submv_prob[2];
    return vp8_submv_prob[1-!!left];
}

/**
 * Split motion vector prediction, 16.4.
 * @returns the number of motion vectors parsed (2, 4 or 16)
 */
static av_always_inline
int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int layout)
{
    int part_idx;
    int n, num;
    VP8Macroblock *top_mb;
    VP8Macroblock *left_mb = &mb[-1];
    const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
                  *mbsplits_top,
                  *mbsplits_cur, *firstidx;
    VP56mv *top_mv;
    VP56mv *left_mv = left_mb->bmv;
    VP56mv *cur_mv  = mb->bmv;
    if (!layout) // layout is inlined, s->mb_layout is not
        top_mb = &mb[2];
    else
        top_mb = &mb[-s->mb_width-1];
    mbsplits_top = vp8_mbsplits[top_mb->partitioning];
    top_mv       = top_mb->bmv;

    if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
        if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
            part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
        } else {
            part_idx = VP8_SPLITMVMODE_8x8;
        }
    } else {
        part_idx = VP8_SPLITMVMODE_4x4;
    }

    num          = vp8_mbsplit_count[part_idx];
    mbsplits_cur = vp8_mbsplits[part_idx],
    firstidx     = vp8_mbfirstidx[part_idx];
    mb->partitioning = part_idx;

    for (n = 0; n < num; n++) {
        int k = firstidx[n];
        uint32_t left, above;
        const uint8_t *submv_prob;

        if (!(k & 3))
            left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
        else
            left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
        if (k <= 3)
            above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
        else
            above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);

        submv_prob = get_submv_prob(left, above);

        if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
            if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
                if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
                    mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
                    mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
                } else {
                    AV_ZERO32(&mb->bmv[n]);
                }
            } else {
                AV_WN32A(&mb->bmv[n], above);
            }
        } else {
            AV_WN32A(&mb->bmv[n], left);
        }
    }

    return num;
}

static av_always_inline
void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[3] = { 0      /* top */,
                                  mb - 1 /* left */,
                                  0      /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    if (!layout) { // layout is inlined (s->mb_layout is not)
        mb_edge[0] = mb + 2;
        mb_edge[2] = mb + 1;
    } else {
        mb_edge[0] = mb - s->mb_width-1;
        mb_edge[2] = mb - s->mb_width-2;
    }

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
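    /* Neighbouring MVs coded against a reference with the opposite sign bias
     * must be negated. Both 16-bit halves are negated at once with SWAR
     * arithmetic: the sign bits are masked off so the +1 per halfword cannot
     * carry into the neighbouring lane, then XORed back in. */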
#define MV_EDGE_CHECK(n)\
{\
    VP8Macroblock *edge = mb_edge[n];\
    int edge_ref = edge->ref_frame;\
    if (edge_ref != VP56_FRAME_CURRENT) {\
        uint32_t mv = AV_RN32A(&edge->mv);\
        if (mv) {\
            if (cur_sign_bias != sign_bias[edge_ref]) {\
                /* SWAR negate of the values in mv. */\
                mv = ~mv;\
                mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
            }\
            if (!n || mv != AV_RN32A(&near_mv[idx]))\
                AV_WN32A(&near_mv[++idx], mv);\
            cnt[idx] += 1 + (n != 2);\
        } else\
            cnt[CNT_ZERO] += 1 + (n != 2);\
    }\
}

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {

                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout) - 1];
                } else {
                    mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}

static av_always_inline
void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
                           int mb_x, int keyframe, int layout)
{
    uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;

    if (layout == 1) {
        VP8Macroblock *mb_top = mb - s->mb_width - 1;
        memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
    }
    if (keyframe) {
        int x, y;
        uint8_t *top;
        uint8_t *const left = s->intra4x4_pred_mode_left;
        if (layout == 1)
            top = mb->intra4x4_pred_mode_top;
        else
            top = s->intra4x4_pred_mode_top + 4 * mb_x;
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                const uint8_t *ctx;
                ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
                *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
                left[y] = top[x] = *intra4x4;
                intra4x4++;
            }
        }
    } else {
        int i;
        for (i = 0; i < 16; i++)
            intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
    }
}

static av_always_inline
void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
                    uint8_t *segment, uint8_t *ref, int layout)
{
    VP56RangeCoder *c = &s->c;

    if (s->segmentation.update_map)
        *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
    else if (s->segmentation.enabled)
        *segment = ref ? *ref : *segment;
    mb->segment = *segment;

    mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;

    if (s->keyframe) {
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);

        if (mb->mode == MODE_I4x4) {
            decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
        } else {
            const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
            if (s->mb_layout == 1)
                AV_WN32A(mb->intra4x4_pred_mode_top, modes);
            else
                AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
            AV_WN32A(s->intra4x4_pred_mode_left, modes);
        }

        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
        mb->ref_frame = VP56_FRAME_CURRENT;
    } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
        // inter MB, 16.2
        if (vp56_rac_get_prob_branchy(c, s->prob->last))
            mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
                VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
        else
            mb->ref_frame = VP56_FRAME_PREVIOUS;
        s->ref_count[mb->ref_frame-1]++;

        // motion vectors, 16.3
        decode_mvs(s, mb, mb_x, mb_y, layout);
    } else {
        // intra MB, 16.1
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);

        if (mb->mode == MODE_I4x4)
            decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);

        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
        mb->ref_frame        = VP56_FRAME_CURRENT;
        mb->partitioning     = VP8_SPLITMVMODE_NONE;
        AV_ZERO32(&mb->bmv[0]);
    }
}

#ifndef decode_block_coeffs_internal
/**
 * @param c arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i initial coeff index, 0 unless a separate DC block is coded
 * @param qmul array holding the dc/ac dequant factor at position 0/1
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static int decode_block_coeffs_internal(VP56RangeCoder *r, DCTELEM block[16],
                                        uint8_t probs[16][3][NUM_DCT_TOKENS-1],
                                        int i, uint8_t *token_prob, int16_t qmul[2])
{
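    /* Work on a local copy of the range coder, which lets the compiler keep
     * its state in registers across this hot loop; it is written back on exit. */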
    VP56RangeCoder c = *r;
    goto skip_eob;
    do {
        int coeff;
        if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
            break;

skip_eob:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
            if (++i == 16)
                break; // invalid input; blocks should end with EOB
            token_prob = probs[i][0];
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
            coeff = 1;
            token_prob = probs[i+1][1];
        } else {
            if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(&c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
                        coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
                    } else {                                             // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    int a = vp56_rac_get_prob(&c, token_prob[8]);
                    int b = vp56_rac_get_prob(&c, token_prob[9+a]);
                    int cat = (a<<1) + b;
                    coeff  = 3 + (8<<cat);
                    coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            token_prob = probs[i+1][2];
        }
        block[zigzag_scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    *r = c;
    return i;
}
#endif

/**
 * @param c arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i initial coeff index, 0 unless a separate DC block is coded
 * @param zero_nhood the initial prediction context for number of surrounding
 *                   all-zero blocks (only left/top, so 0-2)
 * @param qmul array holding the dc/ac dequant factor at position 0/1
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static av_always_inline
int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
                        uint8_t probs[16][3][NUM_DCT_TOKENS-1],
                        int i, int zero_nhood, int16_t qmul[2])
{
    uint8_t *token_prob = probs[i][zero_nhood];
    if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
        return 0;
    return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
}

static av_always_inline
void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c, VP8Macroblock *mb,
                      uint8_t t_nnz[9], uint8_t l_nnz[9])
{
    int i, x, y, luma_start = 0, luma_ctx = 3;
    int nnz_pred, nnz, nnz_total = 0;
    int segment = mb->segment;
    int block_dc = 0;

    if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
        nnz_pred = t_nnz[8] + l_nnz[8];

        // decode DC values and do hadamard
        nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0, nnz_pred,
                                  s->qmat[segment].luma_dc_qmul);
        l_nnz[8] = t_nnz[8] = !!nnz;
        if (nnz) {
            nnz_total += nnz;
            block_dc = 1;
            if (nnz == 1)
                s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
            else
                s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
        }
        luma_start = 1;
        luma_ctx = 0;
    }

    // luma blocks
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            nnz_pred = l_nnz[y] + t_nnz[x];
            nnz = decode_block_coeffs(c, td->block[y][x], s->prob->token[luma_ctx], luma_start,
                                      nnz_pred, s->qmat[segment].luma_qmul);
            // nnz+block_dc may be one more than the actual last index, but we don't care
            td->non_zero_count_cache[y][x] = nnz + block_dc;
            t_nnz[x] = l_nnz[y] = !!nnz;
            nnz_total += nnz;
        }

    // chroma blocks
    // TODO: what to do about dimensions? 2nd dim for luma is x,
    // but for chroma it's (y<<1)|x
    for (i = 4; i < 6; i++)
        for (y = 0; y < 2; y++)
            for (x = 0; x < 2; x++) {
                nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
                nnz = decode_block_coeffs(c, td->block[i][(y<<1)+x], s->prob->token[2], 0,
                                          nnz_pred, s->qmat[segment].chroma_qmul);
                td->non_zero_count_cache[i][(y<<1)+x] = nnz;
                t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
                nnz_total += nnz;
            }

    // if there were no coded coeffs despite the macroblock not being marked skip,
    // we MUST not do the inner loop filter and should not do IDCT
    // Since skip isn't used for bitstream prediction, just manually set it.
    if (!nnz_total)
        mb->skip = 1;
}

static av_always_inline
void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
                      int linesize, int uvlinesize, int simple)
{
    AV_COPY128(top_border, src_y + 15*linesize);
    if (!simple) {
        AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
        AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
    }
}

static av_always_inline
void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
                    int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
                    int simple, int xchg)
{
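    /* Intra prediction must see the top/top-left neighbours as they were
     * before deblocking. The saved (unfiltered) border is swapped in before
     * prediction (xchg=1) and swapped back out afterwards (xchg=0). */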
    uint8_t *top_border_m1 = top_border-32;     // for TL prediction
    src_y  -= linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

#define XCHG(a,b,xchg) do {          \
        if (xchg) AV_SWAP64(b,a);    \
        else      AV_COPY64(b,a);    \
    } while (0)

    XCHG(top_border_m1+8, src_y-8, xchg);
    XCHG(top_border,      src_y,   xchg);
    XCHG(top_border+8,    src_y+8, 1);
    if (mb_x < mb_width-1)
        XCHG(top_border+32, src_y+16, 1);

    // only copy chroma for normal loop filter
    // or to initialize the top row to 127
    if (!simple || !mb_y) {
        XCHG(top_border_m1+16, src_cb-8, xchg);
        XCHG(top_border_m1+24, src_cr-8, xchg);
        XCHG(top_border+16, src_cb, 1);
        XCHG(top_border+24, src_cr, 1);
    }
}

static av_always_inline
int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
{
    if (!mb_x) {
        return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
    } else {
        return mb_y ? mode : LEFT_DC_PRED8x8;
    }
}

static av_always_inline
int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
{
    if (!mb_x) {
        return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
    } else {
        return mb_y ? mode : HOR_PRED8x8;
    }
}

static av_always_inline
int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
{
    if (mode == DC_PRED8x8) {
        return check_dc_pred8x8_mode(mode, mb_x, mb_y);
    } else {
        return mode;
    }
}

static av_always_inline
int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
{
    switch (mode) {
    case DC_PRED8x8:
        return check_dc_pred8x8_mode(mode, mb_x, mb_y);
    case VERT_PRED8x8:
        return !mb_y ? DC_127_PRED8x8 : mode;
    case HOR_PRED8x8:
        return !mb_x ? DC_129_PRED8x8 : mode;
    case PLANE_PRED8x8 /*TM*/:
        return check_tm_pred8x8_mode(mode, mb_x, mb_y);
    }
    return mode;
}

static av_always_inline
int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
{
    if (!mb_x) {
        return mb_y ? VERT_VP8_PRED : DC_129_PRED;
    } else {
        return mb_y ? mode : HOR_VP8_PRED;
    }
}

static av_always_inline
int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
{
    switch (mode) {
    case VERT_PRED:
        if (!mb_x && mb_y) {
            *copy_buf = 1;
            return mode;
        }
        /* fall-through */
    case DIAG_DOWN_LEFT_PRED:
    case VERT_LEFT_PRED:
        return !mb_y ? DC_127_PRED : mode;
    case HOR_PRED:
        if (!mb_y) {
            *copy_buf = 1;
            return mode;
        }
        /* fall-through */
    case HOR_UP_PRED:
        return !mb_x ? DC_129_PRED : mode;
    case TM_VP8_PRED:
        return check_tm_pred4x4_mode(mode, mb_x, mb_y);
    case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
    case DIAG_DOWN_RIGHT_PRED:
    case VERT_RIGHT_PRED:
    case HOR_DOWN_PRED:
        if (!mb_y || !mb_x)
            *copy_buf = 1;
        return mode;
    }
    return mode;
}

static av_always_inline
void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                   VP8Macroblock *mb, int mb_x, int mb_y)
{
    AVCodecContext *avctx = s->avctx;
    int x, y, mode, nnz;
    uint32_t tr;

    // for the first row, we need to run xchg_mb_border to init the top edge to 127
    // otherwise, skip it if we aren't going to deblock
    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 1);

    if (mb->mode < MODE_I4x4) {
        if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
            mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
        } else {
            mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
        }
        s->hpc.pred16x16[mode](dst[0], s->linesize);
    } else {
        uint8_t *ptr = dst[0];
        uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
        uint8_t tr_top[4] = { 127, 127, 127, 127 };

        // all blocks on the right edge of the macroblock use the bottom edge of
        // the top macroblock for their topright edge
        uint8_t *tr_right = ptr - s->linesize + 16;

        // if we're on the right edge of the frame, said edge is extended
        // from the top macroblock
        if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
            mb_x == s->mb_width-1) {
            tr = tr_right[-1]*0x01010101u;
            tr_right = (uint8_t *)&tr;
        }

        if (mb->skip)
            AV_ZERO128(td->non_zero_count_cache);

        for (y = 0; y < 4; y++) {
            uint8_t *topright = ptr + 4 - s->linesize;
            for (x = 0; x < 4; x++) {
                int copy = 0, linesize = s->linesize;
                uint8_t *dst = ptr+4*x;
                DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];

                if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
                    topright = tr_top;
                } else if (x == 3)
                    topright = tr_right;

                if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
                    mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
                    if (copy) {
                        dst = copy_dst + 12;
                        linesize = 8;
                        if (!(mb_y + y)) {
                            copy_dst[3] = 127U;
                            AV_WN32A(copy_dst+4, 127U * 0x01010101U);
                        } else {
                            AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
                            if (!(mb_x + x)) {
                                copy_dst[3] = 129U;
                            } else {
                                copy_dst[3] = ptr[4*x-s->linesize-1];
                            }
                        }
                        if (!(mb_x + x)) {
                            copy_dst[11] =
                            copy_dst[19] =
                            copy_dst[27] =
                            copy_dst[35] = 129U;
                        } else {
                            copy_dst[11] = ptr[4*x              -1];
                            copy_dst[19] = ptr[4*x+s->linesize  -1];
                            copy_dst[27] = ptr[4*x+s->linesize*2-1];
                            copy_dst[35] = ptr[4*x+s->linesize*3-1];
                        }
                    }
                } else {
                    mode = intra4x4[x];
                }
                s->hpc.pred4x4[mode](dst, topright, linesize);
                if (copy) {
                    AV_COPY32(ptr+4*x              , copy_dst+12);
                    AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
                    AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
                    AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
                }

                nnz = td->non_zero_count_cache[y][x];
                if (nnz) {
                    if (nnz == 1)
                        s->vp8dsp.vp8_idct_dc_add(ptr+4*x, td->block[y][x], s->linesize);
                    else
                        s->vp8dsp.vp8_idct_add(ptr+4*x, td->block[y][x], s->linesize);
                }
                topright += 4;
            }

            ptr += 4*s->linesize;
            intra4x4 += 4;
        }
    }

    if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
        mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode, mb_x, mb_y);
    } else {
        mode = check_intra_pred8x8_mode(mb->chroma_pred_mode, mb_x, mb_y);
    }
    s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
    s->hpc.pred8x8[mode](dst[2], s->uvlinesize);

    if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 0);
}

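/* Indexed by the 3-bit fractional MV position: odd positions use the 4-tap
 * variants of the six-tap filters, even nonzero positions the full 6-tap
 * filters, hence the different edge margins. */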
static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
                                // also function pointer index
    { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
    { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
};

/**
 * luma MC function
 *
 * @param s VP8 decoding context
 * @param dst target buffer for block data at block position
 * @param ref reference picture buffer at origin (0, 0)
 * @param mv motion vector (relative to block position) to get pixel data from
 * @param x_off horizontal position of block from origin (0, 0)
 * @param y_off vertical position of block from origin (0, 0)
 * @param block_w width of block (16, 8 or 4)
 * @param block_h height of block (always same as block_w)
 * @param width width of src/dst plane data
 * @param height height of src/dst plane data
 * @param linesize size of a single line of plane data, including padding
 * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
 */
static av_always_inline
void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
                 AVFrame *ref, const VP56mv *mv,
                 int x_off, int y_off, int block_w, int block_h,
                 int width, int height, int linesize,
                 vp8_mc_func mc_func[3][3])
{
    uint8_t *src = ref->data[0];

    if (AV_RN32A(mv)) {

        int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
        int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];

        x_off += mv->x >> 2;
        y_off += mv->y >> 2;

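        /* With frame threading, wait until the reference frame has completed
         * every macroblock row the interpolation can touch; the extra 3 lines
         * pad for pixels that deblocking of the following row may still
         * modify. */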
        // edge emulation
        ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
        src += y_off * linesize + x_off;
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
            s->dsp.emulated_edge_mc(td->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                    x_off - mx_idx, y_off - my_idx, width, height);
            src = td->edge_emu_buffer + mx_idx + linesize * my_idx;
        }
        mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
    } else {
        ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
        mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
    }
}

/**
 * chroma MC function
 *
 * @param s VP8 decoding context
 * @param dst1 target buffer for block data at block position (U plane)
 * @param dst2 target buffer for block data at block position (V plane)
 * @param ref reference picture buffer at origin (0, 0)
 * @param mv motion vector (relative to block position) to get pixel data from
 * @param x_off horizontal position of block from origin (0, 0)
 * @param y_off vertical position of block from origin (0, 0)
 * @param block_w width of block (16, 8 or 4)
 * @param block_h height of block (always same as block_w)
 * @param width width of src/dst plane data
 * @param height height of src/dst plane data
 * @param linesize size of a single line of plane data, including padding
 * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
 */
static av_always_inline
void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1, uint8_t *dst2,
                   AVFrame *ref, const VP56mv *mv, int x_off, int y_off,
                   int block_w, int block_h, int width, int height, int linesize,
                   vp8_mc_func mc_func[3][3])
{
    uint8_t *src1 = ref->data[1], *src2 = ref->data[2];

    if (AV_RN32A(mv)) {
        int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
        int my = mv->y&7, my_idx = subpel_idx[0][my];

        x_off += mv->x >> 3;
        y_off += mv->y >> 3;

        // edge emulation
        src1 += y_off * linesize + x_off;
        src2 += y_off * linesize + x_off;
        ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
            s->dsp.emulated_edge_mc(td->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                    x_off - mx_idx, y_off - my_idx, width, height);
            src1 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);

            s->dsp.emulated_edge_mc(td->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
                                    block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                    x_off - mx_idx, y_off - my_idx, width, height);
            src2 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
        } else {
            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
        }
    } else {
        ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
        mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
        mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
    }
}

static av_always_inline
void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                 AVFrame *ref_frame, int x_off, int y_off,
                 int bx_off, int by_off,
                 int block_w, int block_h,
                 int width, int height, VP56mv *mv)
{
    VP56mv uvmv = *mv;

    /* Y */
    vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
                ref_frame, mv, x_off + bx_off, y_off + by_off,
                block_w, block_h, width, height, s->linesize,
                s->put_pixels_tab[block_w == 8]);

    /* U/V */
    if (s->profile == 3) {
        uvmv.x &= ~7;
        uvmv.y &= ~7;
    }
    x_off   >>= 1; y_off   >>= 1;
    bx_off  >>= 1; by_off  >>= 1;
    width   >>= 1; height  >>= 1;
    block_w >>= 1; block_h >>= 1;
    vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
                  dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
                  &uvmv, x_off + bx_off, y_off + by_off,
                  block_w, block_h, width, height, s->uvlinesize,
                  s->put_pixels_tab[1 + (block_w == 4)]);
}

/* Fetch pixels for estimated mv 4 macroblocks ahead.
 * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
{
    /* Don't prefetch refs that haven't been used very often this frame. */
    if (s->ref_count[ref-1] > (mb_xy >> 5)) {
        int x_off = mb_x << 4, y_off = mb_y << 4;
        int mx = (mb->mv.x>>2) + x_off + 8;
        int my = (mb->mv.y>>2) + y_off;
        uint8_t **src = s->framep[ref]->data;
        int off = mx + (my + (mb_x&3)*4)*s->linesize + 64;
        /* For threading, a ff_thread_await_progress here might be useful, but
         * it actually slows down the decoder. Since a bad prefetch doesn't
         * generate bad decoder output, we don't run it here. */
        s->dsp.prefetch(src[0]+off, s->linesize, 4);
        off = (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
        s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
    }
}

/**
 * Apply motion vectors to prediction buffer, chapter 18.
 */
static av_always_inline
void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                   VP8Macroblock *mb, int mb_x, int mb_y)
{
    int x_off = mb_x << 4, y_off = mb_y << 4;
    int width = 16*s->mb_width, height = 16*s->mb_height;
    AVFrame *ref = s->framep[mb->ref_frame];
    VP56mv *bmv = mb->bmv;

    switch (mb->partitioning) {
    case VP8_SPLITMVMODE_NONE:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 16, 16, width, height, &mb->mv);
        break;
    case VP8_SPLITMVMODE_4x4: {
        int x, y;
        VP56mv uvmv;

        /* Y */
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                vp8_mc_luma(s, td, dst[0] + 4*y*s->linesize + x*4,
                            ref, &bmv[4*y + x],
                            4*x + x_off, 4*y + y_off, 4, 4,
                            width, height, s->linesize,
                            s->put_pixels_tab[2]);
            }
        }

        /* U/V */
        x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
        for (y = 0; y < 2; y++) {
            for (x = 0; x < 2; x++) {
                uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
                         mb->bmv[ 2*y    * 4 + 2*x+1].x +
                         mb->bmv[(2*y+1) * 4 + 2*x  ].x +
                         mb->bmv[(2*y+1) * 4 + 2*x+1].x;
                uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
                         mb->bmv[ 2*y    * 4 + 2*x+1].y +
                         mb->bmv[(2*y+1) * 4 + 2*x  ].y +
                         mb->bmv[(2*y+1) * 4 + 2*x+1].y;
                uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
                uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
                if (s->profile == 3) {
                    uvmv.x &= ~7;
                    uvmv.y &= ~7;
                }
                vp8_mc_chroma(s, td, dst[1] + 4*y*s->uvlinesize + x*4,
                              dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
                              4*x + x_off, 4*y + y_off, 4, 4,
                              width, height, s->uvlinesize,
                              s->put_pixels_tab[2]);
            }
        }
        break;
    }
    case VP8_SPLITMVMODE_16x8:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 16, 8, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 8, 16, 8, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x16:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 8, 16, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 0, 8, 16, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x8:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 8, 8, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 0, 8, 8, width, height, &bmv[1]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 8, 8, 8, width, height, &bmv[2]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 8, 8, 8, width, height, &bmv[3]);
        break;
    }
}

static av_always_inline void idct_mb(VP8Context *s, VP8ThreadData *td,
                                     uint8_t *dst[3], VP8Macroblock *mb)
{
    int x, y, ch;

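    /* Each row of non_zero_count_cache is read as a packed 32-bit word: if
     * every byte is <= 1, only DC coefficients are present and the dc_add4y/
     * dc_add4uv fast paths can handle four blocks at once. */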
    if (mb->mode != MODE_I4x4) {
        uint8_t *y_dst = dst[0];
        for (y = 0; y < 4; y++) {
            uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
            if (nnz4) {
                if (nnz4&~0x01010101) {
                    for (x = 0; x < 4; x++) {
                        if ((uint8_t)nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, td->block[y][x], s->linesize);
                        else if((uint8_t)nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(y_dst+4*x, td->block[y][x], s->linesize);
                        nnz4 >>= 8;
                        if (!nnz4)
                            break;
                    }
                } else {
                    s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
                }
            }
            y_dst += 4*s->linesize;
        }
    }

    for (ch = 0; ch < 2; ch++) {
        uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4+ch]);
        if (nnz4) {
            uint8_t *ch_dst = dst[1+ch];
            if (nnz4&~0x01010101) {
                for (y = 0; y < 2; y++) {
                    for (x = 0; x < 2; x++) {
                        if ((uint8_t)nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
                        else if((uint8_t)nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
                        nnz4 >>= 8;
                        if (!nnz4)
                            goto chroma_idct_end;
                    }
                    ch_dst += 4*s->uvlinesize;
                }
            } else {
                s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4+ch], s->uvlinesize);
            }
        }
chroma_idct_end: ;
    }
}

static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f)
{
    int interior_limit, filter_level;

    if (s->segmentation.enabled) {
        filter_level = s->segmentation.filter_level[mb->segment];
        if (!s->segmentation.absolute_vals)
            filter_level += s->filter.level;
    } else
        filter_level = s->filter.level;

    if (s->lf_delta.enabled) {
        filter_level += s->lf_delta.ref[mb->ref_frame];
        filter_level += s->lf_delta.mode[mb->mode];
    }

    filter_level = av_clip_uintp2(filter_level, 6);

    interior_limit = filter_level;
    if (s->filter.sharpness) {
        interior_limit >>= (s->filter.sharpness + 3) >> 2;
        interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
    }
    interior_limit = FFMAX(interior_limit, 1);

    f->filter_level = filter_level;
    f->inner_limit  = interior_limit;
    f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
}

static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
{
    int mbedge_lim, bedge_lim, hev_thresh;
    int filter_level = f->filter_level;
    int inner_limit  = f->inner_limit;
    int inner_filter = f->inner_filter;
    int linesize     = s->linesize;
    int uvlinesize   = s->uvlinesize;
    static const uint8_t hev_thresh_lut[2][64] = {
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
          3, 3, 3, 3 },
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2 }
    };

    if (!filter_level)
        return;

    bedge_lim  = 2*filter_level + inner_limit;
    mbedge_lim = bedge_lim + 4;

    hev_thresh = hev_thresh_lut[s->keyframe][filter_level];

    if (mb_x) {
        s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

    if (inner_filter) {
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
                                             uvlinesize, bedge_lim,
                                             inner_limit, hev_thresh);
    }

    if (mb_y) {
        s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

    if (inner_filter) {
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
                                             linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
                                             linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
                                             linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
                                             dst[2] + 4 * uvlinesize,
                                             uvlinesize, bedge_lim,
                                             inner_limit, hev_thresh);
    }
}

static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
{
    int mbedge_lim, bedge_lim;
    int filter_level = f->filter_level;
    int inner_limit  = f->inner_limit;
    int inner_filter = f->inner_filter;
    int linesize     = s->linesize;

    if (!filter_level)
        return;

    bedge_lim  = 2*filter_level + inner_limit;
    mbedge_lim = bedge_lim + 4;

    if (mb_x)
        s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
    if (inner_filter) {
        s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
        s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
        s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
    }

    if (mb_y)
        s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
    if (inner_filter) {
        s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
        s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
        s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
    }
}

static void release_queued_segmaps(VP8Context *s, int is_close)
{
    int leave_behind = is_close ? 0 : !s->maps_are_invalid;
    while (s->num_maps_to_be_freed > leave_behind)
        av_freep(&s->segmentation_maps[--s->num_maps_to_be_freed]);
    s->maps_are_invalid = 0;
}

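/* MVs are in quarter-pel units; MARGIN is one macroblock (16 pixels) in qpel,
 * i.e. how far a clamped MV may point outside the visible frame. */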
#define MARGIN (16 << 2)
static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, AVFrame *curframe,
                                   AVFrame *prev_frame)
{
    VP8Context *s = avctx->priv_data;
    int mb_x, mb_y;

    s->mv_min.y = -MARGIN;
    s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
    for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
        VP8Macroblock *mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
        int mb_xy = mb_y*s->mb_width;

        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);

        s->mv_min.x = -MARGIN;
        s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
        for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
            if (mb_y == 0)
                AV_WN32A((mb-s->mb_width-1)->intra4x4_pred_mode_top, DC_PRED*0x01010101);
            decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
                           prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL, 1);
            s->mv_min.x -= 64;
            s->mv_max.x -= 64;
        }
        s->mv_min.y -= 64;
        s->mv_max.y -= 64;
    }
}

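/* Sliced-threading synchronization: each worker publishes its progress as
 * (mb_y << 16) | mb_x in thread_mb_pos. check_thread_pos blocks on the other
 * worker's condition variable until that worker has passed the requested
 * position; update_pos advances our own position and wakes any waiter. */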
1632 #define check_thread_pos(td, otd, mb_x_check, mb_y_check)\
1633 do {\
1634 int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF);\
1635 if (otd->thread_mb_pos < tmp) {\
1636 pthread_mutex_lock(&otd->lock);\
1637 td->wait_mb_pos = tmp;\
1638 do {\
1639 if (otd->thread_mb_pos >= tmp)\
1640 break;\
1641 pthread_cond_wait(&otd->cond, &otd->lock);\
1642 } while (1);\
1643 td->wait_mb_pos = INT_MAX;\
1644 pthread_mutex_unlock(&otd->lock);\
1645 }\
1646 } while(0);
1647
1648 #define update_pos(td, mb_y, mb_x)\
1649 do {\
1650 int pos = (mb_y << 16) | (mb_x & 0xFFFF);\
1651 int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && (num_jobs > 1);\
1652 int is_null = (next_td == NULL) || (prev_td == NULL);\
1653 int pos_check = (is_null) ? 1 :\
1654 (next_td != td && pos >= next_td->wait_mb_pos) ||\
1655 (prev_td != td && pos >= prev_td->wait_mb_pos);\
1656 td->thread_mb_pos = pos;\
1657 if (sliced_threading && pos_check) {\
1658 pthread_mutex_lock(&td->lock);\
1659 pthread_cond_broadcast(&td->cond);\
1660 pthread_mutex_unlock(&td->lock);\
1661 }\
1662 } while(0);
1663
1664 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
1665 int jobnr, int threadnr)
1666 {
1667 VP8Context *s = avctx->priv_data;
1668 VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
1669 int mb_y = td->thread_mb_pos>>16;
1670 int i, y, mb_x, mb_xy = mb_y*s->mb_width;
1671 int num_jobs = s->num_jobs;
1672 AVFrame *curframe = s->curframe, *prev_frame = s->prev_frame;
1673 VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1674 VP8Macroblock *mb;
1675 uint8_t *dst[3] = {
1676 curframe->data[0] + 16*mb_y*s->linesize,
1677 curframe->data[1] + 8*mb_y*s->uvlinesize,
1678 curframe->data[2] + 8*mb_y*s->uvlinesize
1679 };
1680 if (mb_y == 0) prev_td = td;
1681 else prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1682 if (mb_y == s->mb_height-1) next_td = td;
1683 else next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1684 if (s->mb_layout == 1)
1685 mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1686 else {
1687 mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1688 memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
1689 AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1690 }
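/* With mb_layout == 1 (frame threading) the modes/MVs were already decoded up
* front by vp8_decode_mv_mb_modes() and mb indexes a full-frame array;
* otherwise they are decoded on the fly below (the !s->mb_layout branch) and
* mb points into a more compact arrangement of macroblock structs. */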
1691
1692 memset(td->left_nnz, 0, sizeof(td->left_nnz));
1693 // left edge of 129 for intra prediction
1694 if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1695 for (i = 0; i < 3; i++)
1696 for (y = 0; y < 16>>!!i; y++) // 16 rows for luma (i == 0), 8 per chroma plane
1697 dst[i][y*curframe->linesize[i]-1] = 129;
1698 if (mb_y == 1) { // for later rows the top-left sample lies off the left frame edge and reads as 129
1699 s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1700 }
1701 }
1702
1703 s->mv_min.x = -MARGIN;
1704 s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
1705
1706 for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1707 // Wait for the thread handling row mb_y-1 to have finished mb_x+1 (i.e. to be reading mb_x+2).
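/* Thread 0 starts each new stripe of num_jobs rows, so the row above it is
* already being deblocked by the last job; it therefore waits on that job's
* filter progress (positions offset by s->mb_width+3), while all other
* threads stay ahead of the deblocking of the row above them and wait on
* plain decode progress. */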
1708 if (prev_td != td) {
1709 if (threadnr != 0) {
1710 check_thread_pos(td, prev_td, mb_x+1, mb_y-1);
1711 } else {
1712 check_thread_pos(td, prev_td, (s->mb_width+3) + (mb_x+1), mb_y-1);
1713 }
1714 }
1715
1716 s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1717 s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1718
1719 if (!s->mb_layout)
1720 decode_mb_mode(s, mb, mb_x, mb_y, curframe->ref_index[0] + mb_xy,
1721 prev_frame && prev_frame->ref_index[0] ? prev_frame->ref_index[0] + mb_xy : NULL, 0);
1722
1723 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1724
1725 if (!mb->skip)
1726 decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz);
1727
1728 if (mb->mode <= MODE_I4x4)
1729 intra_predict(s, td, dst, mb, mb_x, mb_y);
1730 else
1731 inter_predict(s, td, dst, mb, mb_x, mb_y);
1732
1733 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1734
1735 if (!mb->skip) {
1736 idct_mb(s, td, dst, mb);
1737 } else {
1738 AV_ZERO64(td->left_nnz);
1739 AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned
1740
1741 // Reset the DC block predictors; a separate DC (WHT) block exists for every mode except I4x4 and MV split.
1742 if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1743 td->left_nnz[8] = 0;
1744 s->top_nnz[mb_x][8] = 0;
1745 }
1746 }
1747
1748 if (s->deblock_filter)
1749 filter_level_for_mb(s, mb, &td->filter_strength[mb_x]);
1750
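/* With several jobs, only the last one saves the unfiltered bottom edge of
* its row into top_border, and does so already during decode: thread 0
* decodes the first row of the next stripe behind this row's deblocking and
* needs the pre-filter samples for intra prediction. With a single job the
* backup happens in the filter pass instead (see vp8_filter_mb_row). */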
1751 if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs-1) {
1752 if (s->filter.simple)
1753 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1754 else
1755 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1756 }
1757
1758 prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1759
1760 dst[0] += 16;
1761 dst[1] += 8;
1762 dst[2] += 8;
1763 s->mv_min.x -= 64;
1764 s->mv_max.x -= 64;
1765
1769 update_pos(td, mb_y, mb_x);
1771 }
1772 }
1773
1774 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
1775 int jobnr, int threadnr)
1776 {
1777 VP8Context *s = avctx->priv_data;
1778 VP8ThreadData *td = &s->thread_data[threadnr];
1779 int mb_x, mb_y = td->thread_mb_pos>>16, num_jobs = s->num_jobs;
1780 AVFrame *curframe = s->curframe;
1781 VP8Macroblock *mb;
1782 VP8ThreadData *prev_td, *next_td;
1783 uint8_t *dst[3] = {
1784 curframe->data[0] + 16*mb_y*s->linesize,
1785 curframe->data[1] + 8*mb_y*s->uvlinesize,
1786 curframe->data[2] + 8*mb_y*s->uvlinesize
1787 };
1788
1789 if (s->mb_layout == 1)
1790 mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1791 else
1792 mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1793
1794 if (mb_y == 0) prev_td = td;
1795 else prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1796 if (mb_y == s->mb_height-1) next_td = td;
1797 else next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1798
1799 for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
1800 VP8FilterStrength *f = &td->filter_strength[mb_x];
1801 if (prev_td != td) {
1802 check_thread_pos(td, prev_td, (mb_x+1) + (s->mb_width+3), mb_y-1);
1803 }
// Never wait on thread 0 here: its decode waits on our filter progress instead, so the ordering is already guaranteed and a mutual wait could deadlock.
1804 if (next_td != td && next_td != &s->thread_data[0]) {
1805 check_thread_pos(td, next_td, mb_x+1, mb_y+1);
1806 }
1808
1809 if (num_jobs == 1) {
1810 if (s->filter.simple)
1811 backup_mb_border(s->top_border[mb_x+1], dst[0], NULL, NULL, s->linesize, 0, 1);
1812 else
1813 backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1814 }
1815
1816 if (s->filter.simple)
1817 filter_mb_simple(s, dst[0], f, mb_x, mb_y);
1818 else
1819 filter_mb(s, dst, f, mb_x, mb_y);
1820 dst[0] += 16;
1821 dst[1] += 8;
1822 dst[2] += 8;
1823
1824 update_pos(td, mb_y, (s->mb_width+3) + mb_x);
1825 }
1826 }
1827
1828 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
1829 int jobnr, int threadnr)
1830 {
1831 VP8Context *s = avctx->priv_data;
1832 VP8ThreadData *td = &s->thread_data[jobnr];
1833 VP8ThreadData *next_td = NULL, *prev_td = NULL;
1834 AVFrame *curframe = s->curframe;
1835 int mb_y, num_jobs = s->num_jobs;
1836 td->thread_nr = threadnr;
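/* Rows are swept round-robin: with num_jobs == 2, for example, job 0 handles
* rows 0, 2, 4, ... and job 1 rows 1, 3, 5, ..., so vertically adjacent rows
* are always owned by adjacent jobs. */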
1837 for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
1839 td->thread_mb_pos = mb_y<<16;
1840 vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
1841 if (s->deblock_filter)
1842 vp8_filter_mb_row(avctx, tdata, jobnr, threadnr);
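/* Publish the row as completely finished; 0xFFFF is larger than any real
* position in a row (even with the filter offset), so every waiter on this
* row can proceed. */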
1843 update_pos(td, mb_y, INT_MAX & 0xFFFF);
1844
1845 s->mv_min.y -= 64;
1846 s->mv_max.y -= 64;
1847
1848 if (avctx->active_thread_type == FF_THREAD_FRAME)
1849 ff_thread_report_progress(curframe, mb_y, 0);
1850 }
1851
1852 return 0;
1853 }
1854
1855 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
1856 AVPacket *avpkt)
1857 {
1858 VP8Context *s = avctx->priv_data;
1859 int ret, i, referenced, num_jobs;
1860 enum AVDiscard skip_thresh;
1861 AVFrame *av_uninit(curframe), *prev_frame;
1862
1863 release_queued_segmaps(s, 0);
1864
1865 if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1866 goto err;
1867
1868 prev_frame = s->framep[VP56_FRAME_CURRENT];
1869
1870 referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1871 || s->update_altref == VP56_FRAME_CURRENT;
1872
1873 skip_thresh = !referenced ? AVDISCARD_NONREF :
1874 !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1875
1876 if (avctx->skip_frame >= skip_thresh) {
1877 s->invisible = 1;
1878 memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
1879 goto skip_decode;
1880 }
1881 s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1882
1883 // release no longer referenced frames
1884 for (i = 0; i < 5; i++)
1885 if (s->frames[i].data[0] &&
1886 &s->frames[i] != prev_frame &&
1887 &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1888 &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1889 &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1890 vp8_release_frame(s, &s->frames[i], 1, 0);
1891
1892 // find a free buffer
1893 for (i = 0; i < 5; i++)
1894 if (&s->frames[i] != prev_frame &&
1895 &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1896 &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1897 &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1898 curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1899 break;
1900 }
1901 if (i == 5) {
1902 av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
1903 abort();
1904 }
1905 if (curframe->data[0])
1906 vp8_release_frame(s, curframe, 1, 0);
1907
1908 // Given that arithmetic probabilities are updated every frame, it's quite likely
1909 // that the values we have on a random interframe are complete junk if we didn't
1910 // start decode on a keyframe. So just don't display anything rather than junk.
1911 if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1912 !s->framep[VP56_FRAME_GOLDEN] ||
1913 !s->framep[VP56_FRAME_GOLDEN2])) {
1914 av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1915 ret = AVERROR_INVALIDDATA;
1916 goto err;
1917 }
1918
1919 curframe->key_frame = s->keyframe;
1920 curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1921 curframe->reference = referenced ? 3 : 0;
1922 if ((ret = vp8_alloc_frame(s, curframe))) {
1923 av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
1924 goto err;
1925 }
1926
1927 // Rotate the reference set: golden and altref may keep their current frame or take over another reference (including the frame just decoded).
1928 if (s->update_altref != VP56_FRAME_NONE) {
1929 s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
1930 } else {
1931 s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
1932 }
1933 if (s->update_golden != VP56_FRAME_NONE) {
1934 s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
1935 } else {
1936 s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
1937 }
1938 if (s->update_last) {
1939 s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
1940 } else {
1941 s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
1942 }
1943 s->next_framep[VP56_FRAME_CURRENT] = curframe;
1944
1945 ff_thread_finish_setup(avctx);
1946
1947 s->linesize = curframe->linesize[0];
1948 s->uvlinesize = curframe->linesize[1];
1949
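/* 21 lines per buffer: presumably 16 for the macroblock itself plus the
* extra context lines the 6-tap subpel filters read past the block edges. */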
1950 if (!s->thread_data[0].edge_emu_buffer)
1951 for (i = 0; i < MAX_THREADS; i++)
1952 s->thread_data[i].edge_emu_buffer = av_malloc(21*s->linesize);
1953
1954 memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1955 /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1956 if (!s->mb_layout)
1957 memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1958 if (!s->mb_layout && s->keyframe)
1959 memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1960
1961 // top edge of 127 for intra prediction
1962 if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1963 s->top_border[0][15] = s->top_border[0][23] = 127;
1964 memset(s->top_border[1]-1, 127, s->mb_width*sizeof(*s->top_border)+1);
1965 }
1966 memset(s->ref_count, 0, sizeof(s->ref_count));
1967
1969 // Make sure the previous frame has read its segmentation map,
1970 // if we re-use the same map.
1971 if (prev_frame && s->segmentation.enabled && !s->segmentation.update_map)
1972 ff_thread_await_progress(prev_frame, 1, 0);
1973
1974 if (s->mb_layout == 1)
1975 vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
1976
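/* Cap the job count at the number of coefficient partitions: row mb_y reads
* its coefficients from partition mb_y & (num_coeff_partitions-1), and rows
* decoded concurrently must not share a range coder. */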
1977 if (avctx->active_thread_type == FF_THREAD_FRAME)
1978 num_jobs = 1;
1979 else
1980 num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
1981 s->num_jobs = num_jobs;
1982 s->curframe = curframe;
1983 s->prev_frame = prev_frame;
1984 s->mv_min.y = -MARGIN;
1985 s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1986 for (i = 0; i < MAX_THREADS; i++) {
1987 s->thread_data[i].thread_mb_pos = 0;
1988 s->thread_data[i].wait_mb_pos = INT_MAX;
1989 }
1990 avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL, num_jobs);
1991
1992 ff_thread_report_progress(curframe, INT_MAX, 0);
1993 memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
1994
1995 skip_decode:
1996 // if future frames don't use the updated probabilities,
1997 // reset them to the values we saved
1998 if (!s->update_probabilities)
1999 s->prob[0] = s->prob[1];
2000
2001 if (!s->invisible) {
2002 *(AVFrame*)data = *curframe;
2003 *data_size = sizeof(AVFrame);
2004 }
2005
2006 return avpkt->size;
2007 err:
2008 memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2009 return ret;
2010 }
2011
2012 static av_cold int vp8_decode_init(AVCodecContext *avctx)
2013 {
2014 VP8Context *s = avctx->priv_data;
2015
2016 s->avctx = avctx;
2017 avctx->pix_fmt = PIX_FMT_YUV420P;
2018
2019 ff_dsputil_init(&s->dsp, avctx);
2020 ff_h264_pred_init(&s->hpc, CODEC_ID_VP8, 8, 1);
2021 ff_vp8dsp_init(&s->vp8dsp);
2022
2023 return 0;
2024 }
2025
2026 static av_cold int vp8_decode_free(AVCodecContext *avctx)
2027 {
2028 vp8_decode_flush_impl(avctx, 0, 1, 1);
2029 release_queued_segmaps(avctx->priv_data, 1);
2030 return 0;
2031 }
2032
2033 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2034 {
2035 VP8Context *s = avctx->priv_data;
2036
2037 s->avctx = avctx;
2038
2039 return 0;
2040 }
2041
2042 #define REBASE(pic) \
2043 pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
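/* REBASE maps a frame pointer from the source context's frames[] array onto
* the same slot in the destination context. Illustrative example (hypothetical
* values): if pic == &s_src->frames[2] then REBASE(pic) == &s->frames[2];
* a NULL pic stays NULL. */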
2044
2045 static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
2046 {
2047 VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2048
2049 if (s->macroblocks_base &&
2050 (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2051 free_buffers(s);
2052 s->maps_are_invalid = 1;
2053 s->mb_width = s_src->mb_width;
2054 s->mb_height = s_src->mb_height;
2055 }
2056
2057 s->prob[0] = s_src->prob[!s_src->update_probabilities];
2058 s->segmentation = s_src->segmentation;
2059 s->lf_delta = s_src->lf_delta;
2060 memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2061
2062 memcpy(&s->frames, &s_src->frames, sizeof(s->frames));
2063 s->framep[0] = REBASE(s_src->next_framep[0]);
2064 s->framep[1] = REBASE(s_src->next_framep[1]);
2065 s->framep[2] = REBASE(s_src->next_framep[2]);
2066 s->framep[3] = REBASE(s_src->next_framep[3]);
2067
2068 return 0;
2069 }
2070
2071 AVCodec ff_vp8_decoder = {
2072 .name = "vp8",
2073 .type = AVMEDIA_TYPE_VIDEO,
2074 .id = CODEC_ID_VP8,
2075 .priv_data_size = sizeof(VP8Context),
2076 .init = vp8_decode_init,
2077 .close = vp8_decode_free,
2078 .decode = vp8_decode_frame,
2079 .capabilities = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
2080 .flush = vp8_decode_flush,
2081 .long_name = NULL_IF_CONFIG_SMALL("On2 VP8"),
2082 .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2083 .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2084 };