vp8: drop support for real (non-emulated) edges
/*
 * VP8 compatible video decoder
 *
 * Copyright (C) 2010 David Conrad
 * Copyright (C) 2010 Ronald S. Bultje
 * Copyright (C) 2010 Jason Garrett-Glaser
 * Copyright (C) 2012 Daniel Kang
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/imgutils.h"
#include "avcodec.h"
#include "internal.h"
#include "vp8.h"
#include "vp8data.h"
#include "rectangle.h"
#include "thread.h"

#if ARCH_ARM
#   include "arm/vp8.h"
#endif

static void free_buffers(VP8Context *s)
{
    int i;
    if (s->thread_data)
        for (i = 0; i < MAX_THREADS; i++) {
#if HAVE_THREADS
            pthread_cond_destroy(&s->thread_data[i].cond);
            pthread_mutex_destroy(&s->thread_data[i].lock);
#endif
            av_freep(&s->thread_data[i].filter_strength);
            av_freep(&s->thread_data[i].edge_emu_buffer);
        }
    av_freep(&s->thread_data);
    av_freep(&s->macroblocks_base);
    av_freep(&s->intra4x4_pred_mode_top);
    av_freep(&s->top_nnz);
    av_freep(&s->top_border);

    s->macroblocks = NULL;
}

static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
{
    int ret;
    if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
                                    ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
        return ret;
    if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
        ff_thread_release_buffer(s->avctx, &f->tf);
        return AVERROR(ENOMEM);
    }
    return 0;
}

static void vp8_release_frame(VP8Context *s, VP8Frame *f)
{
    av_buffer_unref(&f->seg_map);
    ff_thread_release_buffer(s->avctx, &f->tf);
}

static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
{
    int ret;

    vp8_release_frame(s, dst);

    if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
        return ret;
    if (src->seg_map &&
        !(dst->seg_map = av_buffer_ref(src->seg_map))) {
        vp8_release_frame(s, dst);
        return AVERROR(ENOMEM);
    }

    return 0;
}


static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
{
    VP8Context *s = avctx->priv_data;
    int i;

    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
        vp8_release_frame(s, &s->frames[i]);
    memset(s->framep, 0, sizeof(s->framep));

    if (free_mem)
        free_buffers(s);
}

static void vp8_decode_flush(AVCodecContext *avctx)
{
    vp8_decode_flush_impl(avctx, 0);
}

static int update_dimensions(VP8Context *s, int width, int height)
{
    AVCodecContext *avctx = s->avctx;
    int i, ret;

    if (width  != s->avctx->width ||
        height != s->avctx->height) {
        vp8_decode_flush_impl(s->avctx, 1);

        ret = ff_set_dimensions(s->avctx, width, height);
        if (ret < 0)
            return ret;
    }

    s->mb_width  = (s->avctx->coded_width  + 15) / 16;
    s->mb_height = (s->avctx->coded_height + 15) / 16;

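    /* With sliced threading, macroblock state for the whole frame is kept so
     * that a thread can read its top neighbours from the row decoded by
     * another thread; in the single-thread / frame-threading layout only the
     * current row is kept, and top-row prediction modes are cached in the
     * separate intra4x4_pred_mode_top array instead. */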
    s->mb_layout = (avctx->active_thread_type == FF_THREAD_SLICE) && (FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1);
    if (!s->mb_layout) { // Frame threading and one thread
        s->macroblocks_base       = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
        s->intra4x4_pred_mode_top = av_mallocz(s->mb_width*4);
    }
    else // Sliced threading
        s->macroblocks_base = av_mallocz((s->mb_width+2)*(s->mb_height+2)*sizeof(*s->macroblocks));
    s->top_nnz     = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
    s->top_border  = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
    s->thread_data = av_mallocz(MAX_THREADS*sizeof(VP8ThreadData));

    for (i = 0; i < MAX_THREADS; i++) {
        s->thread_data[i].filter_strength = av_mallocz(s->mb_width*sizeof(*s->thread_data[0].filter_strength));
#if HAVE_THREADS
        pthread_mutex_init(&s->thread_data[i].lock, NULL);
        pthread_cond_init(&s->thread_data[i].cond, NULL);
#endif
    }

    if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
        (!s->intra4x4_pred_mode_top && !s->mb_layout))
        return AVERROR(ENOMEM);

    s->macroblocks = s->macroblocks_base + 1;

    return 0;
}

static void parse_segment_info(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i;

    s->segmentation.update_map = vp8_rac_get(c);

    if (vp8_rac_get(c)) { // update segment feature data
        s->segmentation.absolute_vals = vp8_rac_get(c);

        for (i = 0; i < 4; i++)
            s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);

        for (i = 0; i < 4; i++)
            s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
    }
    if (s->segmentation.update_map)
        for (i = 0; i < 3; i++)
            s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
}

static void update_lf_deltas(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i;

    for (i = 0; i < 4; i++) {
        if (vp8_rac_get(c)) {
            s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);

            if (vp8_rac_get(c))
                s->lf_delta.ref[i] = -s->lf_delta.ref[i];
        }
    }

    for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
        if (vp8_rac_get(c)) {
            s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);

            if (vp8_rac_get(c))
                s->lf_delta.mode[i] = -s->lf_delta.mode[i];
        }
    }
}

static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
{
    const uint8_t *sizes = buf;
    int i;

    s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
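    /* The DCT token data is split into 1, 2, 4 or 8 partitions. The sizes of
     * all partitions except the last are stored as 3-byte little-endian
     * values right after the frame header; the last partition gets whatever
     * input is left over. */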

    buf      += 3*(s->num_coeff_partitions-1);
    buf_size -= 3*(s->num_coeff_partitions-1);
    if (buf_size < 0)
        return -1;

    for (i = 0; i < s->num_coeff_partitions-1; i++) {
        int size = AV_RL24(sizes + 3*i);
        if (buf_size - size < 0)
            return -1;

        ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
        buf      += size;
        buf_size -= size;
    }
    ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);

    return 0;
}

static void get_quants(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;
    int i, base_qi;

    int yac_qi     = vp8_rac_get_uint(c, 7);
    int ydc_delta  = vp8_rac_get_sint(c, 4);
    int y2dc_delta = vp8_rac_get_sint(c, 4);
    int y2ac_delta = vp8_rac_get_sint(c, 4);
    int uvdc_delta = vp8_rac_get_sint(c, 4);
    int uvac_delta = vp8_rac_get_sint(c, 4);

    for (i = 0; i < 4; i++) {
        if (s->segmentation.enabled) {
            base_qi = s->segmentation.base_quant[i];
            if (!s->segmentation.absolute_vals)
                base_qi += yac_qi;
        } else
            base_qi = yac_qi;

        s->qmat[i].luma_qmul[0]    =           vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
        s->qmat[i].luma_qmul[1]    =           vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
        s->qmat[i].luma_dc_qmul[0] =       2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
        /* 101581>>16 is equivalent to 155/100 */
        s->qmat[i].luma_dc_qmul[1] = (101581 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)]) >> 16;
        s->qmat[i].chroma_qmul[0]  =           vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
        s->qmat[i].chroma_qmul[1]  =           vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];

        s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
        s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
    }
}

/**
 * Determine which buffers golden and altref should be updated with after this frame.
 * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 *
 * Intra frames update all 3 references
 * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 * If the update (golden|altref) flag is set, it's updated with the current frame
 *     if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 * If the flag is not set, the number read means:
 *     0: no update
 *     1: VP56_FRAME_PREVIOUS
 *     2: update golden with altref, or update altref with golden
 */
static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
{
    VP56RangeCoder *c = &s->c;

    if (update)
        return VP56_FRAME_CURRENT;

    switch (vp8_rac_get_uint(c, 2)) {
    case 1:
        return VP56_FRAME_PREVIOUS;
    case 2:
        return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
    }
    return VP56_FRAME_NONE;
}

static void update_refs(VP8Context *s)
{
    VP56RangeCoder *c = &s->c;

    int update_golden = vp8_rac_get(c);
    int update_altref = vp8_rac_get(c);

    s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
    s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
}

static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VP56RangeCoder *c = &s->c;
    int header_size, hscale, vscale, i, j, k, l, m, ret;
    int width  = s->avctx->width;
    int height = s->avctx->height;

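    /* 3-byte frame tag: bit 0 is the inverted keyframe flag, bits 1-3 the
     * profile, bit 4 the show_frame flag and bits 5-23 the size of the
     * first (header) partition. */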
    s->keyframe  = !(buf[0] & 1);
    s->profile   =  (buf[0]>>1) & 7;
    s->invisible = !(buf[0] & 0x10);
    header_size  = AV_RL24(buf) >> 5;
    buf      += 3;
    buf_size -= 3;

    if (s->profile > 3)
        av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);

    if (!s->profile)
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
    else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));

    if (header_size > buf_size - 7*s->keyframe) {
        av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
        return AVERROR_INVALIDDATA;
    }

    if (s->keyframe) {
        if (AV_RL24(buf) != 0x2a019d) {
            av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
            return AVERROR_INVALIDDATA;
        }
        width  = AV_RL16(buf+3) & 0x3fff;
        height = AV_RL16(buf+5) & 0x3fff;
        hscale = buf[4] >> 6;
        vscale = buf[6] >> 6;
        buf      += 7;
        buf_size -= 7;

        if (hscale || vscale)
            avpriv_request_sample(s->avctx, "Upscaling");

        s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
        for (i = 0; i < 4; i++)
            for (j = 0; j < 16; j++)
                memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
                       sizeof(s->prob->token[i][j]));
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
        memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
    }

    ff_vp56_init_range_decoder(c, buf, header_size);
    buf      += header_size;
    buf_size -= header_size;

    if (s->keyframe) {
        if (vp8_rac_get(c))
            av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
        vp8_rac_get(c); // whether we can skip clamping in dsp functions
    }

    if ((s->segmentation.enabled = vp8_rac_get(c)))
        parse_segment_info(s);
    else
        s->segmentation.update_map = 0; // FIXME: move this to some init function?

    s->filter.simple    = vp8_rac_get(c);
    s->filter.level     = vp8_rac_get_uint(c, 6);
    s->filter.sharpness = vp8_rac_get_uint(c, 3);

    if ((s->lf_delta.enabled = vp8_rac_get(c)))
        if (vp8_rac_get(c))
            update_lf_deltas(s);

    if (setup_partitions(s, buf, buf_size)) {
        av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
        return AVERROR_INVALIDDATA;
    }

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height) {
        if ((ret = update_dimensions(s, width, height)) < 0)
            return ret;
    }

    get_quants(s);

    if (!s->keyframe) {
        update_refs(s);
        s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
        s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
    }

    // if we aren't saving this frame's probabilities for future frames,
    // make a copy of the current probabilities
    if (!(s->update_probabilities = vp8_rac_get(c)))
        s->prob[1] = s->prob[0];

    s->update_last = s->keyframe || vp8_rac_get(c);

    for (i = 0; i < 4; i++)
        for (j = 0; j < 8; j++)
            for (k = 0; k < 3; k++)
                for (l = 0; l < NUM_DCT_TOKENS-1; l++)
                    if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
                        int prob = vp8_rac_get_uint(c, 8);
                        for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
                            s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
                    }

    if ((s->mbskip_enabled = vp8_rac_get(c)))
        s->prob->mbskip = vp8_rac_get_uint(c, 8);

    if (!s->keyframe) {
        s->prob->intra  = vp8_rac_get_uint(c, 8);
        s->prob->last   = vp8_rac_get_uint(c, 8);
        s->prob->golden = vp8_rac_get_uint(c, 8);

        if (vp8_rac_get(c))
            for (i = 0; i < 4; i++)
                s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
        if (vp8_rac_get(c))
            for (i = 0; i < 3; i++)
                s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);

        // 17.2 MV probability update
        for (i = 0; i < 2; i++)
            for (j = 0; j < 19; j++)
                if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
                    s->prob->mvc[i][j] = vp8_rac_get_nn(c);
    }

    return 0;
}

static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
{
    dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
    dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
}

/**
 * Motion vector coding, 17.1.
 */
static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
{
    int bit, x = 0;

    if (vp56_rac_get_prob_branchy(c, p[0])) {
        int i;

        for (i = 0; i < 3; i++)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
        for (i = 9; i > 3; i--)
            x += vp56_rac_get_prob(c, p[9 + i]) << i;
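        /* Bit 3 is only coded when one of bits 4-9 is set; otherwise the
         * magnitude would be below 8, which the small tree below handles,
         * so it must be 1. */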
        if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
            x += 8;
    } else {
        // small_mvtree
        const uint8_t *ps = p+2;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + 3*bit;
        x  += 4*bit;
        bit = vp56_rac_get_prob(c, *ps);
        ps += 1 + bit;
        x  += 2*bit;
        x  += vp56_rac_get_prob(c, *ps);
    }

    return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
}

static av_always_inline
const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
{
    if (left == top)
        return vp8_submv_prob[4-!!left];
    if (!top)
        return vp8_submv_prob[2];
    return vp8_submv_prob[1-!!left];
}

/**
 * Split motion vector prediction, 16.4.
 * @returns the number of motion vectors parsed (2, 4 or 16)
 */
static av_always_inline
int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb, int layout)
{
    int part_idx;
    int n, num;
    VP8Macroblock *top_mb;
    VP8Macroblock *left_mb = &mb[-1];
    const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
                  *mbsplits_top,
                  *mbsplits_cur, *firstidx;
    VP56mv *top_mv;
    VP56mv *left_mv = left_mb->bmv;
    VP56mv *cur_mv  = mb->bmv;

    if (!layout) // layout is inlined, s->mb_layout is not
        top_mb = &mb[2];
    else
        top_mb = &mb[-s->mb_width-1];
    mbsplits_top = vp8_mbsplits[top_mb->partitioning];
    top_mv       = top_mb->bmv;

    if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
        if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
            part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
        } else {
            part_idx = VP8_SPLITMVMODE_8x8;
        }
    } else {
        part_idx = VP8_SPLITMVMODE_4x4;
    }

    num = vp8_mbsplit_count[part_idx];
    mbsplits_cur = vp8_mbsplits[part_idx],
    firstidx     = vp8_mbfirstidx[part_idx];
    mb->partitioning = part_idx;

    for (n = 0; n < num; n++) {
        int k = firstidx[n];
        uint32_t left, above;
        const uint8_t *submv_prob;

        if (!(k & 3))
            left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
        else
            left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
        if (k <= 3)
            above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
        else
            above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);

        submv_prob = get_submv_prob(left, above);

        if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
            if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
                if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
                    mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
                    mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
                } else {
                    AV_ZERO32(&mb->bmv[n]);
                }
            } else {
                AV_WN32A(&mb->bmv[n], above);
            }
        } else {
            AV_WN32A(&mb->bmv[n], left);
        }
    }

    return num;
}

static av_always_inline
void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[3] = { 0      /* top */,
                                  mb - 1 /* left */,
                                  0      /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    int8_t *sign_bias = s->sign_bias;
    VP56mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VP56RangeCoder *c = &s->c;

    if (!layout) { // layout is inlined (s->mb_layout is not)
        mb_edge[0] = mb + 2;
        mb_edge[2] = mb + 1;
    }
    else {
        mb_edge[0] = mb - s->mb_width-1;
        mb_edge[2] = mb - s->mb_width-2;
    }

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
#define MV_EDGE_CHECK(n)\
{\
    VP8Macroblock *edge = mb_edge[n];\
    int edge_ref = edge->ref_frame;\
    if (edge_ref != VP56_FRAME_CURRENT) {\
        uint32_t mv = AV_RN32A(&edge->mv);\
        if (mv) {\
            if (cur_sign_bias != sign_bias[edge_ref]) {\
                /* SWAR negate of the values in mv. */\
                mv = ~mv;\
                mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
            }\
            if (!n || mv != AV_RN32A(&near_mv[idx]))\
                AV_WN32A(&near_mv[++idx], mv);\
            cnt[idx]      += 1 + (n != 2);\
        } else\
            cnt[CNT_ZERO] += 1 + (n != 2);\
    }\
}
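/* The SWAR negate above flips both packed 16-bit MV components at once:
 * ~mv + 0x00010001 would be a per-half two's-complement negate, but the
 * carry from the low half could leak into the high half, so the top bit of
 * each half is masked off before the add and XORed back in afterwards. */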

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {

                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout) - 1];
                } else {
                    mb->mv.y  += read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x  += read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}

static av_always_inline
void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
                           int mb_x, int keyframe, int layout)
{
    uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;

    if (layout == 1) {
        VP8Macroblock *mb_top = mb - s->mb_width - 1;
        memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
    }
    if (keyframe) {
        int x, y;
        uint8_t* top;
        uint8_t* const left = s->intra4x4_pred_mode_left;
        if (layout == 1)
            top = mb->intra4x4_pred_mode_top;
        else
            top = s->intra4x4_pred_mode_top + 4 * mb_x;
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                const uint8_t *ctx;
                ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
                *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
                left[y] = top[x] = *intra4x4;
                intra4x4++;
            }
        }
    } else {
        int i;
        for (i = 0; i < 16; i++)
            intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
    }
}

static av_always_inline
void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
                    uint8_t *segment, uint8_t *ref, int layout)
{
    VP56RangeCoder *c = &s->c;

    if (s->segmentation.update_map)
        *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
    else if (s->segmentation.enabled)
        *segment = ref ? *ref : *segment;
    mb->segment = *segment;

    mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;

    if (s->keyframe) {
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);

        if (mb->mode == MODE_I4x4) {
            decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
        } else {
            const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
            if (s->mb_layout == 1)
                AV_WN32A(mb->intra4x4_pred_mode_top, modes);
            else
                AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
            AV_WN32A(s->intra4x4_pred_mode_left, modes);
        }

        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
        mb->ref_frame = VP56_FRAME_CURRENT;
    } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
        // inter MB, 16.2
        if (vp56_rac_get_prob_branchy(c, s->prob->last))
            mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
                VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
        else
            mb->ref_frame = VP56_FRAME_PREVIOUS;
        s->ref_count[mb->ref_frame-1]++;

        // motion vectors, 16.3
        decode_mvs(s, mb, mb_x, mb_y, layout);
    } else {
        // intra MB, 16.1
        mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);

        if (mb->mode == MODE_I4x4)
            decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);

        mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
        mb->ref_frame        = VP56_FRAME_CURRENT;
        mb->partitioning     = VP8_SPLITMVMODE_NONE;
        AV_ZERO32(&mb->bmv[0]);
    }
}

#ifndef decode_block_coeffs_internal
/**
 * @param r arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i initial coeff index, 0 unless a separate DC block is coded
 * @param qmul array holding the dc/ac dequant factor at position 0/1
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
                                        uint8_t probs[16][3][NUM_DCT_TOKENS-1],
                                        int i, uint8_t *token_prob, int16_t qmul[2])
{
    VP56RangeCoder c = *r;
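    /* c is a local copy of the range coder so its state can stay in
     * registers through the hot token loop below; it is written back
     * to *r before returning. */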
    goto skip_eob;
    do {
        int coeff;
        if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
            break;

skip_eob:
        if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
            if (++i == 16)
                break; // invalid input; blocks should end with EOB
            token_prob = probs[i][0];
            goto skip_eob;
        }

        if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
            coeff = 1;
            token_prob = probs[i+1][1];
        } else {
            if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
                coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
                if (coeff)
                    coeff += vp56_rac_get_prob(&c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
                    if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
                        coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
                    } else {                                             // DCT_CAT2
                        coeff  = 7;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    int a = vp56_rac_get_prob(&c, token_prob[8]);
                    int b = vp56_rac_get_prob(&c, token_prob[9+a]);
                    int cat = (a<<1) + b;
                    coeff  = 3 + (8<<cat);
                    coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            token_prob = probs[i+1][2];
        }
        block[zigzag_scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    *r = c;
    return i;
}
#endif

/**
 * @param c arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i initial coeff index, 0 unless a separate DC block is coded
 * @param zero_nhood the initial prediction context for number of surrounding
 *                   all-zero blocks (only left/top, so 0-2)
 * @param qmul array holding the dc/ac dequant factor at position 0/1
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static av_always_inline
int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
                        uint8_t probs[16][3][NUM_DCT_TOKENS-1],
                        int i, int zero_nhood, int16_t qmul[2])
{
    uint8_t *token_prob = probs[i][zero_nhood];
    if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
        return 0;
    return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
}

static av_always_inline
void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c, VP8Macroblock *mb,
                      uint8_t t_nnz[9], uint8_t l_nnz[9])
{
    int i, x, y, luma_start = 0, luma_ctx = 3;
    int nnz_pred, nnz, nnz_total = 0;
    int segment = mb->segment;
    int block_dc = 0;

    if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
        nnz_pred = t_nnz[8] + l_nnz[8];

        // decode DC values and do hadamard
        nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0, nnz_pred,
                                  s->qmat[segment].luma_dc_qmul);
        l_nnz[8] = t_nnz[8] = !!nnz;
        if (nnz) {
            nnz_total += nnz;
            block_dc   = 1;
            if (nnz == 1)
                s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
            else
                s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
        }
        luma_start = 1;
        luma_ctx   = 0;
    }

    // luma blocks
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            nnz_pred = l_nnz[y] + t_nnz[x];
            nnz = decode_block_coeffs(c, td->block[y][x], s->prob->token[luma_ctx], luma_start,
                                      nnz_pred, s->qmat[segment].luma_qmul);
            // nnz+block_dc may be one more than the actual last index, but we don't care
            td->non_zero_count_cache[y][x] = nnz + block_dc;
            t_nnz[x] = l_nnz[y] = !!nnz;
            nnz_total += nnz;
        }

    // chroma blocks
    // TODO: what to do about dimensions? 2nd dim for luma is x,
    // but for chroma it's (y<<1)|x
    for (i = 4; i < 6; i++)
        for (y = 0; y < 2; y++)
            for (x = 0; x < 2; x++) {
                nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
                nnz = decode_block_coeffs(c, td->block[i][(y<<1)+x], s->prob->token[2], 0,
                                          nnz_pred, s->qmat[segment].chroma_qmul);
                td->non_zero_count_cache[i][(y<<1)+x] = nnz;
                t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
                nnz_total += nnz;
            }

    // if there were no coded coeffs despite the macroblock not being marked skip,
    // we MUST not do the inner loop filter and should not do IDCT
    // Since skip isn't used for bitstream prediction, just manually set it.
    if (!nnz_total)
        mb->skip = 1;
}

static av_always_inline
void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
                      int linesize, int uvlinesize, int simple)
{
    AV_COPY128(top_border, src_y + 15*linesize);
    if (!simple) {
        AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
        AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
    }
}

static av_always_inline
void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
                    int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
                    int simple, int xchg)
{
    uint8_t *top_border_m1 = top_border-32;     // for TL prediction
    src_y  -= linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

#define XCHG(a,b,xchg) do {                     \
        if (xchg) AV_SWAP64(b,a);               \
        else      AV_COPY64(b,a);               \
    } while (0)

    XCHG(top_border_m1+8, src_y-8, xchg);
    XCHG(top_border,      src_y,   xchg);
    XCHG(top_border+8,    src_y+8, 1);
    if (mb_x < mb_width-1)
        XCHG(top_border+32, src_y+16, 1);

    // only copy chroma for normal loop filter
    // or to initialize the top row to 127
    if (!simple || !mb_y) {
        XCHG(top_border_m1+16, src_cb-8, xchg);
        XCHG(top_border_m1+24, src_cr-8, xchg);
        XCHG(top_border+16, src_cb, 1);
        XCHG(top_border+24, src_cr, 1);
    }
}

static av_always_inline
int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
{
    if (!mb_x) {
        return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
    } else {
        return mb_y ? mode : LEFT_DC_PRED8x8;
    }
}

static av_always_inline
int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
{
    if (!mb_x) {
        return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
    } else {
        return mb_y ? mode : HOR_PRED8x8;
    }
}

static av_always_inline
int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
{
    switch (mode) {
    case DC_PRED8x8:
        return check_dc_pred8x8_mode(mode, mb_x, mb_y);
    case VERT_PRED8x8:
        return !mb_y ? DC_127_PRED8x8 : mode;
    case HOR_PRED8x8:
        return !mb_x ? DC_129_PRED8x8 : mode;
    case PLANE_PRED8x8 /*TM*/:
        return check_tm_pred8x8_mode(mode, mb_x, mb_y);
    }
    return mode;
}

static av_always_inline
int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
{
    if (!mb_x) {
        return mb_y ? VERT_VP8_PRED : DC_129_PRED;
    } else {
        return mb_y ? mode : HOR_VP8_PRED;
    }
}

static av_always_inline
int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
{
    switch (mode) {
    case VERT_PRED:
        if (!mb_x && mb_y) {
            *copy_buf = 1;
            return mode;
        }
        /* fall-through */
    case DIAG_DOWN_LEFT_PRED:
    case VERT_LEFT_PRED:
        return !mb_y ? DC_127_PRED : mode;
    case HOR_PRED:
        if (!mb_y) {
            *copy_buf = 1;
            return mode;
        }
        /* fall-through */
    case HOR_UP_PRED:
        return !mb_x ? DC_129_PRED : mode;
    case TM_VP8_PRED:
        return check_tm_pred4x4_mode(mode, mb_x, mb_y);
    case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
    case DIAG_DOWN_RIGHT_PRED:
    case VERT_RIGHT_PRED:
    case HOR_DOWN_PRED:
        if (!mb_y || !mb_x)
            *copy_buf = 1;
        return mode;
    }
    return mode;
}

static av_always_inline
void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                   VP8Macroblock *mb, int mb_x, int mb_y)
{
    int x, y, mode, nnz;
    uint32_t tr;

    // for the first row, we need to run xchg_mb_border to init the top edge to 127
    // otherwise, skip it if we aren't going to deblock
    if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 1);

    if (mb->mode < MODE_I4x4) {
        mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
        s->hpc.pred16x16[mode](dst[0], s->linesize);
    } else {
        uint8_t *ptr = dst[0];
        uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
        uint8_t tr_top[4] = { 127, 127, 127, 127 };

        // all blocks on the right edge of the macroblock use the bottom edge
        // of the top macroblock for their topright edge
        uint8_t *tr_right = ptr - s->linesize + 16;

        // if we're on the right edge of the frame, said edge is extended
        // from the top macroblock
        if (mb_y &&
            mb_x == s->mb_width-1) {
            tr = tr_right[-1]*0x01010101u;
            tr_right = (uint8_t *)&tr;
        }

        if (mb->skip)
            AV_ZERO128(td->non_zero_count_cache);

        for (y = 0; y < 4; y++) {
            uint8_t *topright = ptr + 4 - s->linesize;
            for (x = 0; x < 4; x++) {
                int copy = 0, linesize = s->linesize;
                uint8_t *dst = ptr+4*x;
                DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];
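                /* copy_dst is a 5x8 scratch area used when border pixels
                 * must be faked: byte 3 holds the top-left sample, bytes
                 * 4-7 the top row, bytes 11/19/27/35 the left column, and
                 * the 4x4 block itself is predicted at offset 12 with a
                 * linesize of 8, then copied back into the frame. */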

                if ((y == 0 || x == 3) && mb_y == 0) {
                    topright = tr_top;
                } else if (x == 3)
                    topright = tr_right;

                mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
                if (copy) {
                    dst = copy_dst + 12;
                    linesize = 8;
                    if (!(mb_y + y)) {
                        copy_dst[3] = 127U;
                        AV_WN32A(copy_dst+4, 127U * 0x01010101U);
                    } else {
                        AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
                        if (!(mb_x + x)) {
                            copy_dst[3] = 129U;
                        } else {
                            copy_dst[3] = ptr[4*x-s->linesize-1];
                        }
                    }
                    if (!(mb_x + x)) {
                        copy_dst[11] =
                        copy_dst[19] =
                        copy_dst[27] =
                        copy_dst[35] = 129U;
                    } else {
                        copy_dst[11] = ptr[4*x              -1];
                        copy_dst[19] = ptr[4*x+s->linesize  -1];
                        copy_dst[27] = ptr[4*x+s->linesize*2-1];
                        copy_dst[35] = ptr[4*x+s->linesize*3-1];
                    }
                }
                s->hpc.pred4x4[mode](dst, topright, linesize);
                if (copy) {
                    AV_COPY32(ptr+4*x              , copy_dst+12);
                    AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
                    AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
                    AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
                }

                nnz = td->non_zero_count_cache[y][x];
                if (nnz) {
                    if (nnz == 1)
                        s->vp8dsp.vp8_idct_dc_add(ptr+4*x, td->block[y][x], s->linesize);
                    else
                        s->vp8dsp.vp8_idct_add(ptr+4*x, td->block[y][x], s->linesize);
                }
                topright += 4;
            }

            ptr += 4*s->linesize;
            intra4x4 += 4;
        }
    }

    mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode, mb_x, mb_y);
    s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
    s->hpc.pred8x8[mode](dst[2], s->uvlinesize);

    if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
        xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
                       s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
                       s->filter.simple, 0);
}

static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 },     // nr. of left extra pixels,
                                    // also function pointer index
    { 0, 3, 5, 3, 5, 3, 5, 3 },     // nr. of extra pixels required
    { 0, 2, 3, 2, 3, 2, 3, 2 },     // nr. of right extra pixels
};
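/* Row 0 doubles as the MC function index for each eighth-pel position
 * (0 = direct copy; for the sixtap code paths, odd positions use the 4-tap
 * kernels and even positions the full 6-tap ones). Luma MVs are quarter-pel
 * and get doubled before indexing, so luma only hits even positions; odd
 * positions occur only for chroma. */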

/**
 * luma MC function
 *
 * @param s VP8 decoding context
 * @param dst target buffer for block data at block position
 * @param ref reference picture buffer at origin (0, 0)
 * @param mv motion vector (relative to block position) to get pixel data from
 * @param x_off horizontal position of block from origin (0, 0)
 * @param y_off vertical position of block from origin (0, 0)
 * @param block_w width of block (16, 8 or 4)
 * @param block_h height of block (always same as block_w)
 * @param width width of src/dst plane data
 * @param height height of src/dst plane data
 * @param linesize size of a single line of plane data, including padding
 * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
 */
static av_always_inline
void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
                 ThreadFrame *ref, const VP56mv *mv,
                 int x_off, int y_off, int block_w, int block_h,
                 int width, int height, ptrdiff_t linesize,
                 vp8_mc_func mc_func[3][3])
{
    uint8_t *src = ref->f->data[0];

    if (AV_RN32A(mv)) {

        int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
        int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];

        x_off += mv->x >> 2;
        y_off += mv->y >> 2;

        // edge emulation
        ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
        src += y_off * linesize + x_off;
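        /* Since real (non-emulated) edges are no longer supported, any
         * access that would read outside the reference picture (including
         * the extra pixels the subpel filter needs) goes through
         * emulated_edge_mc(), which replicates the border pixels into
         * edge_emu_buffer. */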
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
            s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                     src - my_idx * linesize - mx_idx,
                                     linesize, linesize,
                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                     x_off - mx_idx, y_off - my_idx, width, height);
            src = td->edge_emu_buffer + mx_idx + linesize * my_idx;
        }
        mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
    } else {
        ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
        mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
    }
}

/**
 * chroma MC function
 *
 * @param s VP8 decoding context
 * @param dst1 target buffer for block data at block position (U plane)
 * @param dst2 target buffer for block data at block position (V plane)
 * @param ref reference picture buffer at origin (0, 0)
 * @param mv motion vector (relative to block position) to get pixel data from
 * @param x_off horizontal position of block from origin (0, 0)
 * @param y_off vertical position of block from origin (0, 0)
 * @param block_w width of block (16, 8 or 4)
 * @param block_h height of block (always same as block_w)
 * @param width width of src/dst plane data
 * @param height height of src/dst plane data
 * @param linesize size of a single line of plane data, including padding
 * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
 */
static av_always_inline
void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1, uint8_t *dst2,
                   ThreadFrame *ref, const VP56mv *mv, int x_off, int y_off,
                   int block_w, int block_h, int width, int height, ptrdiff_t linesize,
                   vp8_mc_func mc_func[3][3])
{
    uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];

    if (AV_RN32A(mv)) {
        int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
        int my = mv->y&7, my_idx = subpel_idx[0][my];

        x_off += mv->x >> 3;
        y_off += mv->y >> 3;

        // edge emulation
        src1 += y_off * linesize + x_off;
        src2 += y_off * linesize + x_off;
        ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
        if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
            y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
            s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                     src1 - my_idx * linesize - mx_idx,
                                     linesize, linesize,
                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                     x_off - mx_idx, y_off - my_idx, width, height);
            src1 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);

            s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
                                     src2 - my_idx * linesize - mx_idx,
                                     linesize, linesize,
                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
                                     x_off - mx_idx, y_off - my_idx, width, height);
            src2 = td->edge_emu_buffer + mx_idx + linesize * my_idx;
            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
        } else {
            mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
            mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
        }
    } else {
        ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
        mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
        mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
    }
}

static av_always_inline
void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                 ThreadFrame *ref_frame, int x_off, int y_off,
                 int bx_off, int by_off,
                 int block_w, int block_h,
                 int width, int height, VP56mv *mv)
{
    VP56mv uvmv = *mv;

    /* Y */
    vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
                ref_frame, mv, x_off + bx_off, y_off + by_off,
                block_w, block_h, width, height, s->linesize,
                s->put_pixels_tab[block_w == 8]);

    /* U/V */
    if (s->profile == 3) {
        uvmv.x &= ~7;
        uvmv.y &= ~7;
    }
    x_off   >>= 1; y_off   >>= 1;
    bx_off  >>= 1; by_off  >>= 1;
    width   >>= 1; height  >>= 1;
    block_w >>= 1; block_h >>= 1;
    vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
                  dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
                  &uvmv, x_off + bx_off, y_off + by_off,
                  block_w, block_h, width, height, s->uvlinesize,
                  s->put_pixels_tab[1 + (block_w == 4)]);
}

/* Fetch pixels for estimated mv 4 macroblocks ahead.
 * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
{
    /* Don't prefetch refs that haven't been used very often this frame. */
    if (s->ref_count[ref-1] > (mb_xy >> 5)) {
        int x_off = mb_x << 4, y_off = mb_y << 4;
        int mx = (mb->mv.x>>2) + x_off + 8;
        int my = (mb->mv.y>>2) + y_off;
        uint8_t **src = s->framep[ref]->tf.f->data;
        int off = mx + (my + (mb_x&3)*4)*s->linesize + 64;
        /* For threading, a ff_thread_await_progress here might be useful, but
         * it actually slows down the decoder. Since a bad prefetch doesn't
         * generate bad decoder output, we don't run it here. */
        s->vdsp.prefetch(src[0]+off, s->linesize, 4);
        off = (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
        s->vdsp.prefetch(src[1]+off, src[2]-src[1], 2);
    }
}

/**
 * Apply motion vectors to prediction buffer, chapter 18.
 */
static av_always_inline
void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
                   VP8Macroblock *mb, int mb_x, int mb_y)
{
    int x_off = mb_x << 4, y_off = mb_y << 4;
    int width = 16*s->mb_width, height = 16*s->mb_height;
    ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
    VP56mv *bmv = mb->bmv;

    switch (mb->partitioning) {
    case VP8_SPLITMVMODE_NONE:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 16, 16, width, height, &mb->mv);
        break;
    case VP8_SPLITMVMODE_4x4: {
        int x, y;
        VP56mv uvmv;

        /* Y */
        for (y = 0; y < 4; y++) {
            for (x = 0; x < 4; x++) {
                vp8_mc_luma(s, td, dst[0] + 4*y*s->linesize + x*4,
                            ref, &bmv[4*y + x],
                            4*x + x_off, 4*y + y_off, 4, 4,
                            width, height, s->linesize,
                            s->put_pixels_tab[2]);
            }
        }

        /* U/V */
        x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
        for (y = 0; y < 2; y++) {
            for (x = 0; x < 2; x++) {
                uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
                         mb->bmv[ 2*y    * 4 + 2*x+1].x +
                         mb->bmv[(2*y+1) * 4 + 2*x  ].x +
                         mb->bmv[(2*y+1) * 4 + 2*x+1].x;
                uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
                         mb->bmv[ 2*y    * 4 + 2*x+1].y +
                         mb->bmv[(2*y+1) * 4 + 2*x  ].y +
                         mb->bmv[(2*y+1) * 4 + 2*x+1].y;
                uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
                uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
                if (s->profile == 3) {
                    uvmv.x &= ~7;
                    uvmv.y &= ~7;
                }
                vp8_mc_chroma(s, td, dst[1] + 4*y*s->uvlinesize + x*4,
                              dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
                              4*x + x_off, 4*y + y_off, 4, 4,
                              width, height, s->uvlinesize,
                              s->put_pixels_tab[2]);
            }
        }
        break;
    }
    case VP8_SPLITMVMODE_16x8:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 16, 8, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 8, 16, 8, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x16:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 8, 16, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 0, 8, 16, width, height, &bmv[1]);
        break;
    case VP8_SPLITMVMODE_8x8:
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 0, 8, 8, width, height, &bmv[0]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 0, 8, 8, width, height, &bmv[1]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    0, 8, 8, 8, width, height, &bmv[2]);
        vp8_mc_part(s, td, dst, ref, x_off, y_off,
                    8, 8, 8, 8, width, height, &bmv[3]);
        break;
    }
}

static av_always_inline void idct_mb(VP8Context *s, VP8ThreadData *td,
                                     uint8_t *dst[3], VP8Macroblock *mb)
{
    int x, y, ch;

    if (mb->mode != MODE_I4x4) {
        uint8_t *y_dst = dst[0];
        for (y = 0; y < 4; y++) {
            uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
            if (nnz4) {
                if (nnz4&~0x01010101) {
                    for (x = 0; x < 4; x++) {
                        if ((uint8_t)nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, td->block[y][x], s->linesize);
                        else if ((uint8_t)nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(y_dst+4*x, td->block[y][x], s->linesize);
                        nnz4 >>= 8;
                        if (!nnz4)
                            break;
                    }
                } else {
                    s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
                }
            }
            y_dst += 4*s->linesize;
        }
    }

    for (ch = 0; ch < 2; ch++) {
        uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4+ch]);
        if (nnz4) {
            uint8_t *ch_dst = dst[1+ch];
            if (nnz4&~0x01010101) {
                for (y = 0; y < 2; y++) {
                    for (x = 0; x < 2; x++) {
                        if ((uint8_t)nnz4 == 1)
                            s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
                        else if ((uint8_t)nnz4 > 1)
                            s->vp8dsp.vp8_idct_add(ch_dst+4*x, td->block[4+ch][(y<<1)+x], s->uvlinesize);
                        nnz4 >>= 8;
                        if (!nnz4)
                            goto chroma_idct_end;
                    }
                    ch_dst += 4*s->uvlinesize;
                }
            } else {
                s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4+ch], s->uvlinesize);
            }
        }
chroma_idct_end: ;
    }
}

static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f)
{
    int interior_limit, filter_level;

    if (s->segmentation.enabled) {
        filter_level = s->segmentation.filter_level[mb->segment];
        if (!s->segmentation.absolute_vals)
            filter_level += s->filter.level;
    } else
        filter_level = s->filter.level;

    if (s->lf_delta.enabled) {
        filter_level += s->lf_delta.ref[mb->ref_frame];
        filter_level += s->lf_delta.mode[mb->mode];
    }

    filter_level = av_clip_uintp2(filter_level, 6);

    interior_limit = filter_level;
    if (s->filter.sharpness) {
        interior_limit >>= (s->filter.sharpness + 3) >> 2;
        interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
    }
    interior_limit = FFMAX(interior_limit, 1);

    f->filter_level = filter_level;
    f->inner_limit = interior_limit;
    f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
}

static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
{
    int mbedge_lim, bedge_lim, hev_thresh;
    int filter_level = f->filter_level;
    int inner_limit = f->inner_limit;
    int inner_filter = f->inner_filter;
    int linesize = s->linesize;
    int uvlinesize = s->uvlinesize;
    static const uint8_t hev_thresh_lut[2][64] = {
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
          3, 3, 3, 3 },
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
          2, 2, 2, 2 }
    };

    if (!filter_level)
        return;

    bedge_lim  = 2*filter_level + inner_limit;
    mbedge_lim = bedge_lim + 4;

    hev_thresh = hev_thresh_lut[s->keyframe][filter_level];

    if (mb_x) {
        s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

    if (inner_filter) {
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
                                             uvlinesize, bedge_lim,
                                             inner_limit, hev_thresh);
    }

    if (mb_y) {
        s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
                                       mbedge_lim, inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
                                       mbedge_lim, inner_limit, hev_thresh);
    }

    if (inner_filter) {
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
                                             linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
                                             linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
                                             linesize, bedge_lim,
                                             inner_limit, hev_thresh);
        s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
                                             dst[2] + 4 * uvlinesize,
                                             uvlinesize, bedge_lim,
                                             inner_limit, hev_thresh);
    }
}

static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
{
    int mbedge_lim, bedge_lim;
    int filter_level = f->filter_level;
    int inner_limit = f->inner_limit;
    int inner_filter = f->inner_filter;
    int linesize = s->linesize;

    if (!filter_level)
        return;

    bedge_lim  = 2*filter_level + inner_limit;
    mbedge_lim = bedge_lim + 4;

    if (mb_x)
        s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
    if (inner_filter) {
        s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
        s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
        s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
    }

    if (mb_y)
        s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
    if (inner_filter) {
        s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
        s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
        s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
    }
}

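/* MVs are in quarter-pel units; MARGIN lets predictors point up to 16 pixels
 * (64 quarter-pels) outside the visible frame, which motion compensation
 * then handles via edge emulation. */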
#define MARGIN (16 << 2)
static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
                                   VP8Frame *prev_frame)
{
    VP8Context *s = avctx->priv_data;
    int mb_x, mb_y;

    s->mv_min.y = -MARGIN;
    s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
    for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
        VP8Macroblock *mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
        int mb_xy = mb_y*s->mb_width;

        AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);

        s->mv_min.x = -MARGIN;
        s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
        for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
            if (mb_y == 0)
                AV_WN32A((mb-s->mb_width-1)->intra4x4_pred_mode_top, DC_PRED*0x01010101);
            decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
                           prev_frame && prev_frame->seg_map ?
                           prev_frame->seg_map->data + mb_xy : NULL, 1);
            s->mv_min.x -= 64;
            s->mv_max.x -= 64;
        }
        s->mv_min.y -= 64;
        s->mv_max.y -= 64;
    }
}

#if HAVE_THREADS
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)\
    do {\
        int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF);\
        if (otd->thread_mb_pos < tmp) {\
            pthread_mutex_lock(&otd->lock);\
            td->wait_mb_pos = tmp;\
            do {\
                if (otd->thread_mb_pos >= tmp)\
                    break;\
                pthread_cond_wait(&otd->cond, &otd->lock);\
            } while (1);\
            td->wait_mb_pos = INT_MAX;\
            pthread_mutex_unlock(&otd->lock);\
        }\
    } while(0);

#define update_pos(td, mb_y, mb_x)\
    do {\
        int pos              = (mb_y << 16) | (mb_x & 0xFFFF);\
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && (num_jobs > 1);\
        int is_null          = (next_td == NULL) || (prev_td == NULL);\
        int pos_check        = (is_null) ? 1 :\
                               (next_td != td && pos >= next_td->wait_mb_pos) ||\
                               (prev_td != td && pos >= prev_td->wait_mb_pos);\
        td->thread_mb_pos = pos;\
        if (sliced_threading && pos_check) {\
            pthread_mutex_lock(&td->lock);\
            pthread_cond_broadcast(&td->cond);\
            pthread_mutex_unlock(&td->lock);\
        }\
    } while(0);
#else
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)
#define update_pos(td, mb_y, mb_x)
#endif
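
/* Sliced-threading synchronisation: each thread publishes its progress as
 * (mb_y << 16) | mb_x in thread_mb_pos. check_thread_pos() blocks until the
 * other thread (otd) has passed the given position, and update_pos() wakes
 * any thread that is waiting on a position we have now reached. */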
1640
1641 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
1642 int jobnr, int threadnr)
1643 {
1644 VP8Context *s = avctx->priv_data;
1645 VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
1646 int mb_y = td->thread_mb_pos>>16;
1647 int mb_x, mb_xy = mb_y*s->mb_width;
1648 int num_jobs = s->num_jobs;
1649 VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
1650 VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1651 VP8Macroblock *mb;
1652 uint8_t *dst[3] = {
1653 curframe->tf.f->data[0] + 16*mb_y*s->linesize,
1654 curframe->tf.f->data[1] + 8*mb_y*s->uvlinesize,
1655 curframe->tf.f->data[2] + 8*mb_y*s->uvlinesize
1656 };
1657 if (mb_y == 0) prev_td = td;
1658 else prev_td = &s->thread_data[(jobnr + num_jobs - 1)%num_jobs];
1659 if (mb_y == s->mb_height-1) next_td = td;
1660 else next_td = &s->thread_data[(jobnr + 1)%num_jobs];
1661 if (s->mb_layout == 1)
1662 mb = s->macroblocks_base + ((s->mb_width+1)*(mb_y + 1) + 1);
1663 else {
1664 // Make sure the previous frame has read its segmentation map,
1665 // if we re-use the same map.
1666 if (prev_frame && s->segmentation.enabled &&
1667 !s->segmentation.update_map)
1668 ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
1669 mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1670 memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
1671 AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1672 }
1673
1674 memset(td->left_nnz, 0, sizeof(td->left_nnz));
1675
1676 s->mv_min.x = -MARGIN;
1677 s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
1678
1679 for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1680 // Wait for previous thread to read mb_x+2, and reach mb_y-1.
1681 if (prev_td != td) {
1682 if (threadnr != 0) {
1683 check_thread_pos(td, prev_td, mb_x+1, mb_y-1);
1684 } else {
1685 check_thread_pos(td, prev_td, (s->mb_width+3) + (mb_x+1), mb_y-1);
1686 }
1687 }
1688
        s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
                         s->linesize, 4);
        s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
                         dst[2] - dst[1], 2);

        if (!s->mb_layout)
            decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
                           prev_frame && prev_frame->seg_map ?
                           prev_frame->seg_map->data + mb_xy : NULL, 0);

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);

        if (!mb->skip)
            decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz);

        if (mb->mode <= MODE_I4x4)
            intra_predict(s, td, dst, mb, mb_x, mb_y);
        else
            inter_predict(s, td, dst, mb, mb_x, mb_y);

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);

        if (!mb->skip) {
            idct_mb(s, td, dst, mb);
        } else {
            AV_ZERO64(td->left_nnz);
            AV_WN64(s->top_nnz[mb_x], 0); // array of 9, so unaligned

            // Reset the DC block predictors, which would exist had the
            // mb carried coefficients.
            if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
                td->left_nnz[8]     = 0;
                s->top_nnz[mb_x][8] = 0;
            }
        }

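        /* Deblocking runs later, in vp8_filter_mb_row(); here we only record
         * the per-macroblock filter strength. */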
        if (s->deblock_filter)
            filter_level_for_mb(s, mb, &td->filter_strength[mb_x]);

        if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
            if (s->filter.simple)
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 NULL, NULL, s->linesize, 0, 1);
            else
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 dst[1], dst[2], s->linesize, s->uvlinesize, 0);
        }

        prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);

        dst[0] += 16;
        dst[1] += 8;
        dst[2] += 8;
        s->mv_min.x -= 64;
        s->mv_max.x -= 64;

        if (mb_x == s->mb_width + 1) {
            update_pos(td, mb_y, s->mb_width + 3);
        } else {
            update_pos(td, mb_y, mb_x);
        }
    }
}

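/* Deblock one macroblock row; the per-MB filter strengths were already
 * computed during the decode pass. */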
static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
                              int jobnr, int threadnr)
{
    VP8Context *s = avctx->priv_data;
    VP8ThreadData *td = &s->thread_data[threadnr];
    int mb_x, mb_y = td->thread_mb_pos >> 16, num_jobs = s->num_jobs;
    AVFrame *curframe = s->curframe->tf.f;
    VP8Macroblock *mb;
    VP8ThreadData *prev_td, *next_td;
    uint8_t *dst[3] = {
        curframe->data[0] + 16 * mb_y * s->linesize,
        curframe->data[1] +  8 * mb_y * s->uvlinesize,
        curframe->data[2] +  8 * mb_y * s->uvlinesize
    };

    if (s->mb_layout == 1)
        mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
    else
        mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;

    if (mb_y == 0)
        prev_td = td;
    else
        prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
    if (mb_y == s->mb_height - 1)
        next_td = td;
    else
        next_td = &s->thread_data[(jobnr + 1) % num_jobs];

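    /* Stay behind the previous job's filter pass on the row above, and do
     * not filter a column until the next job has decoded past it on the row
     * below. The exception is thread 0, whose decode pass synchronizes
     * against our filter progress instead (see the decode loop above). */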
    for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
        VP8FilterStrength *f = &td->filter_strength[mb_x];
        if (prev_td != td)
            check_thread_pos(td, prev_td,
                             (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
        if (next_td != td && next_td != &s->thread_data[0])
            check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);

        if (num_jobs == 1) {
            if (s->filter.simple)
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 NULL, NULL, s->linesize, 0, 1);
            else
                backup_mb_border(s->top_border[mb_x + 1], dst[0],
                                 dst[1], dst[2], s->linesize, s->uvlinesize, 0);
        }

        if (s->filter.simple)
            filter_mb_simple(s, dst[0], f, mb_x, mb_y);
        else
            filter_mb(s, dst, f, mb_x, mb_y);
        dst[0] += 16;
        dst[1] += 8;
        dst[2] += 8;

        update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
    }
}

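/* Slice-threading worker: job j takes rows j, j + num_jobs, j + 2*num_jobs,
 * ...; each row is fully decoded and then, if enabled, deblocked before the
 * job advances to its next row. */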
static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
                                    int jobnr, int threadnr)
{
    VP8Context *s = avctx->priv_data;
    VP8ThreadData *td = &s->thread_data[jobnr];
    VP8ThreadData *next_td = NULL, *prev_td = NULL;
    VP8Frame *curframe = s->curframe;
    int mb_y, num_jobs = s->num_jobs;
    td->thread_nr = threadnr;
    for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
        td->thread_mb_pos = mb_y << 16;
        vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
        if (s->deblock_filter)
            vp8_filter_mb_row(avctx, tdata, jobnr, threadnr);
        update_pos(td, mb_y, INT_MAX & 0xFFFF);

        s->mv_min.y -= 64;
        s->mv_max.y -= 64;

        if (avctx->active_thread_type == FF_THREAD_FRAME)
            ff_thread_report_progress(&curframe->tf, mb_y, 0);
    }

    return 0;
}

int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
                        AVPacket *avpkt)
{
    VP8Context *s = avctx->priv_data;
    int ret, i, referenced, num_jobs;
    enum AVDiscard skip_thresh;
    VP8Frame *av_uninit(curframe), *prev_frame;

    if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
        goto err;

    prev_frame = s->framep[VP56_FRAME_CURRENT];

    referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
                 s->update_altref == VP56_FRAME_CURRENT;

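    /* A frame that updates no reference slot may be dropped once skip_frame
     * reaches AVDISCARD_NONREF; non-keyframes once it reaches
     * AVDISCARD_NONKEY. */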
    skip_thresh = !referenced ? AVDISCARD_NONREF :
                  !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;

    if (avctx->skip_frame >= skip_thresh) {
        s->invisible = 1;
        memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
        goto skip_decode;
    }
    s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;

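    /* s->frames[] holds five entries: the frame being decoded plus up to
     * four distinct frames reachable through the VP56_FRAME_* pointers;
     * at least one entry is therefore always free. */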
    // release no longer referenced frames
    for (i = 0; i < 5; i++)
        if (s->frames[i].tf.f->data[0] &&
            &s->frames[i] != prev_frame &&
            &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
            vp8_release_frame(s, &s->frames[i]);

    // find a free buffer
    for (i = 0; i < 5; i++)
        if (&s->frames[i] != prev_frame &&
            &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
            &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
            curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
            break;
        }
    if (i == 5) {
        av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
        abort();
    }
    if (curframe->tf.f->data[0])
        vp8_release_frame(s, curframe);

    // Given that arithmetic probabilities are updated every frame, it's quite
    // likely that the values we have on a random interframe are complete
    // junk if we didn't start decode on a keyframe. So just don't display
    // anything rather than junk.
    if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
                         !s->framep[VP56_FRAME_GOLDEN]   ||
                         !s->framep[VP56_FRAME_GOLDEN2])) {
        av_log(avctx, AV_LOG_WARNING,
               "Discarding interframe without a prior keyframe!\n");
        ret = AVERROR_INVALIDDATA;
        goto err;
    }

    curframe->tf.f->key_frame = s->keyframe;
    curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
                                            : AV_PICTURE_TYPE_P;
    if ((ret = vp8_alloc_frame(s, curframe, referenced))) {
        av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
        goto err;
    }
    // Rotate the reference slots: update_golden/update_altref name the slot
    // (current, previous, golden or altref) whose frame becomes the new
    // golden/altref frame.
    if (s->update_altref != VP56_FRAME_NONE)
        s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
    else
        s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
    if (s->update_golden != VP56_FRAME_NONE)
        s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
    else
        s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
    if (s->update_last)
        s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
    else
        s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
    s->next_framep[VP56_FRAME_CURRENT] = curframe;

    ff_thread_finish_setup(avctx);

    s->linesize   = curframe->tf.f->linesize[0];
    s->uvlinesize = curframe->tf.f->linesize[1];

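    /* Per-thread scratch buffer for motion compensation blocks that overlap
     * a frame edge: 21 lines of linesize bytes covers a 16-line macroblock
     * plus the margin needed by the subpel interpolation filters. */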
    if (!s->thread_data[0].edge_emu_buffer)
        for (i = 0; i < MAX_THREADS; i++) {
            s->thread_data[i].edge_emu_buffer = av_malloc(21 * s->linesize);
            if (!s->thread_data[i].edge_emu_buffer) {
                ret = AVERROR(ENOMEM);
                goto err;
            }
        }

    memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
    /* Zero macroblock structures for top/top-left prediction
     * from outside the frame. */
    if (!s->mb_layout)
        memset(s->macroblocks + s->mb_height * 2 - 1, 0,
               (s->mb_width + 1) * sizeof(*s->macroblocks));
    if (!s->mb_layout && s->keyframe)
        memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);

    memset(s->ref_count, 0, sizeof(s->ref_count));

    if (s->mb_layout == 1) {
        // Make sure the previous frame has written its segmentation map
        // before we re-use it.
        if (prev_frame && s->segmentation.enabled &&
            !s->segmentation.update_map)
            ff_thread_await_progress(&prev_frame->tf, 1, 0);
        vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
    }

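    /* Frame threading decodes the whole frame on one thread; slice threading
     * uses at most one job per coefficient partition, since each row reads
     * its coefficients from partition (mb_y mod num_coeff_partitions). */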
    if (avctx->active_thread_type == FF_THREAD_FRAME)
        num_jobs = 1;
    else
        num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
    s->num_jobs   = num_jobs;
    s->curframe   = curframe;
    s->prev_frame = prev_frame;
    s->mv_min.y   = -MARGIN;
    s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
    for (i = 0; i < MAX_THREADS; i++) {
        s->thread_data[i].thread_mb_pos = 0;
        s->thread_data[i].wait_mb_pos   = INT_MAX;
    }
    avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
                    num_jobs);

    ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
    memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);

skip_decode:
    // if future frames don't use the updated probabilities,
    // reset them to the values we saved
    if (!s->update_probabilities)
        s->prob[0] = s->prob[1];

    if (!s->invisible) {
        if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
            return ret;
        *got_frame = 1;
    }

    return avpkt->size;
err:
    memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
    return ret;
}

av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
{
    VP8Context *s = avctx->priv_data;
    int i;

    vp8_decode_flush_impl(avctx, 1);
    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
        av_frame_free(&s->frames[i].tf.f);

    return 0;
}

static av_cold int vp8_init_frames(VP8Context *s)
{
    int i;
    for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
        s->frames[i].tf.f = av_frame_alloc();
        if (!s->frames[i].tf.f)
            return AVERROR(ENOMEM);
    }
    return 0;
}

av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
{
    VP8Context *s = avctx->priv_data;
    int ret;

    s->avctx       = avctx;
    avctx->pix_fmt = AV_PIX_FMT_YUV420P;
    avctx->internal->allocate_progress = 1;

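    /* VP8's spatial prediction modes are close cousins of H.264's, so the
     * decoder reuses the shared h264pred DSP, parameterized for VP8. */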
    ff_videodsp_init(&s->vdsp, 8);
    ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
    ff_vp8dsp_init(&s->vp8dsp);

    if ((ret = vp8_init_frames(s)) < 0) {
        ff_vp8_decode_free(avctx);
        return ret;
    }

    return 0;
}

static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
{
    VP8Context *s = avctx->priv_data;
    int ret;

    s->avctx = avctx;

    if ((ret = vp8_init_frames(s)) < 0) {
        ff_vp8_decode_free(avctx);
        return ret;
    }

    return 0;
}

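/* Translate a frame pointer from the source thread's context into the
 * corresponding entry of our own frames[] array. */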
#define REBASE(pic) \
    ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)

static int vp8_decode_update_thread_context(AVCodecContext *dst,
                                            const AVCodecContext *src)
{
    VP8Context *s = dst->priv_data, *s_src = src->priv_data;
    int i;

    if (s->macroblocks_base &&
        (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
        free_buffers(s);
        s->mb_width  = s_src->mb_width;
        s->mb_height = s_src->mb_height;
    }

    s->prob[0]      = s_src->prob[!s_src->update_probabilities];
    s->segmentation = s_src->segmentation;
    s->lf_delta     = s_src->lf_delta;
    memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));

    for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
        if (s_src->frames[i].tf.f->data[0]) {
            int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
            if (ret < 0)
                return ret;
        }
    }

    s->framep[0] = REBASE(s_src->next_framep[0]);
    s->framep[1] = REBASE(s_src->next_framep[1]);
    s->framep[2] = REBASE(s_src->next_framep[2]);
    s->framep[3] = REBASE(s_src->next_framep[3]);

    return 0;
}

AVCodec ff_vp8_decoder = {
    .name                  = "vp8",
    .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
    .type                  = AVMEDIA_TYPE_VIDEO,
    .id                    = AV_CODEC_ID_VP8,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = ff_vp8_decode_init,
    .close                 = ff_vp8_decode_free,
    .decode                = ff_vp8_decode_frame,
    .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS |
                             CODEC_CAP_SLICE_THREADS,
    .flush                 = vp8_decode_flush,
    .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
};