Commit | Line | Data |
---|---|---|
d592f67f MN |
1 | /* |
2 | * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder | |
3 | * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> | |
4 | * | |
b78e7197 DB |
5 | * This file is part of FFmpeg. |
6 | * | |
7 | * FFmpeg is free software; you can redistribute it and/or | |
d592f67f MN |
8 | * modify it under the terms of the GNU Lesser General Public |
9 | * License as published by the Free Software Foundation; either | |
b78e7197 | 10 | * version 2.1 of the License, or (at your option) any later version. |
d592f67f | 11 | * |
b78e7197 | 12 | * FFmpeg is distributed in the hope that it will be useful, |
d592f67f MN |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
15 | * Lesser General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU Lesser General Public | |
b78e7197 | 18 | * License along with FFmpeg; if not, write to the Free Software |
5509bffa | 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
d592f67f MN |
20 | * |
21 | */ | |
115329f1 | 22 | |
d592f67f MN |
23 | /** |
24 | * @file cabac.h | |
25 | * Context Adaptive Binary Arithmetic Coder. | |
26 | */ | |
27 | ||
28 | ||
2848ce84 | 29 | //#undef NDEBUG |
d592f67f | 30 | #include <assert.h> |
755073fe RD |
31 | #ifdef ARCH_X86_64 |
32 | #define ARCH_X86 | |
33 | #endif | |
34 | #ifdef ARCH_X86 | |
35 | #include "x86_cpu.h" | |
36 | #endif | |
d592f67f | 37 | |
5659b509 | 38 | #define CABAC_BITS 16 |
ec7eb896 | 39 | #define CABAC_MASK ((1<<CABAC_BITS)-1) |
0bc2e7f0 | 40 | #define BRANCHLESS_CABAC_DECODER 1 |
a0f2c6ba | 41 | //#define ARCH_X86_DISABLED 1 |
ec7eb896 | 42 | |
d592f67f MN |
/**
 * State shared by the CABAC encoder and decoder routines in this file.
 *
 * The x86 inline asm further down addresses members by hard-coded byte
 * offsets (LOW "0", RANGE "4", and BYTESTART/BYTE/BYTEEND "12"/"16"/"20",
 * or "16"/"24"/"32" under ARCH_X86_64), so the order and size of the members
 * must stay in sync with those macros.
 * NOTE(review): those offsets look like they assume STRICT_LIMITS is not
 * defined, since symCount would shift the bytestream pointers -- confirm.
 */
43 | typedef struct CABACContext{ |
/* low, range: arithmetic coder interval state, read and written by every put_/get_ routine. */
44 | int low; | |
45 | int range; | |
/* Number of pending bits that put_cabac_bit() emits as the complement of the next resolved bit. */
46 | int outstanding_count; | |
47 | #ifdef STRICT_LIMITS | |
/* Incremented once per symbol by the put_cabac*() functions. */
48 | int symCount; | |
49 | #endif | |
e96682e6 MN |
/* bytestream_start: base used by get_cabac_terminate() to compute the byte count. */
50 | const uint8_t *bytestream_start; |
/* bytestream: current read position, advanced by CABAC_BITS/8 bytes on each refill. */
51 | const uint8_t *bytestream; | |
/* bytestream_end: not referenced by the code in this file; presumably the end of the input buffer -- confirm. */
bba83349 | 52 | const uint8_t *bytestream_end; |
d592f67f MN |
/* Bit writer used by the encoder side (put_bits / flush_put_bits). */
53 | PutBitContext pb; |
54 | }CABACContext; | |
55 | ||
68a205ed | 56 | extern uint8_t ff_h264_mlps_state[4*64]; |
a0f2c6ba | 57 | extern uint8_t ff_h264_lps_range[4*2*64]; ///< rangeTabLPS |
d61c4e73 MN |
58 | extern uint8_t ff_h264_mps_state[2*64]; ///< transIdxMPS |
59 | extern uint8_t ff_h264_lps_state[2*64]; ///< transIdxLPS | |
f24a5159 | 60 | extern const uint8_t ff_h264_norm_shift[512]; |
ec7eb896 | 61 | |
d592f67f MN |
62 | |
63 | void ff_init_cabac_encoder(CABACContext *c, uint8_t *buf, int buf_size); | |
e96682e6 | 64 | void ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size); |
d61c4e73 | 65 | void ff_init_cabac_states(CABACContext *c); |
d592f67f MN |
66 | |
67 | ||
68 | static inline void put_cabac_bit(CABACContext *c, int b){ | |
115329f1 DB |
69 | put_bits(&c->pb, 1, b); |
70 | for(;c->outstanding_count; c->outstanding_count--){ | |
d592f67f MN |
71 | put_bits(&c->pb, 1, 1-b); |
72 | } | |
73 | } | |
74 | ||
75 | static inline void renorm_cabac_encoder(CABACContext *c){ | |
76 | while(c->range < 0x100){ | |
77 | //FIXME optimize | |
78 | if(c->low<0x100){ | |
79 | put_cabac_bit(c, 0); | |
80 | }else if(c->low<0x200){ | |
81 | c->outstanding_count++; | |
82 | c->low -= 0x100; | |
83 | }else{ | |
84 | put_cabac_bit(c, 1); | |
85 | c->low -= 0x200; | |
86 | } | |
115329f1 | 87 | |
d592f67f MN |
88 | c->range+= c->range; |
89 | c->low += c->low; | |
90 | } | |
91 | } | |
92 | ||
938dd846 | 93 | static void put_cabac(CABACContext *c, uint8_t * const state, int bit){ |
f24a5159 | 94 | int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + *state]; |
115329f1 | 95 | |
d592f67f MN |
96 | if(bit == ((*state)&1)){ |
97 | c->range -= RangeLPS; | |
d61c4e73 | 98 | *state= ff_h264_mps_state[*state]; |
d592f67f MN |
99 | }else{ |
100 | c->low += c->range - RangeLPS; | |
101 | c->range = RangeLPS; | |
d61c4e73 | 102 | *state= ff_h264_lps_state[*state]; |
d592f67f | 103 | } |
115329f1 | 104 | |
d592f67f MN |
105 | renorm_cabac_encoder(c); |
106 | ||
107 | #ifdef STRICT_LIMITS | |
108 | c->symCount++; | |
109 | #endif | |
110 | } | |
111 | ||
938dd846 | 112 | static void put_cabac_static(CABACContext *c, int RangeLPS, int bit){ |
d592f67f MN |
113 | assert(c->range > RangeLPS); |
114 | ||
115 | if(!bit){ | |
116 | c->range -= RangeLPS; | |
117 | }else{ | |
118 | c->low += c->range - RangeLPS; | |
119 | c->range = RangeLPS; | |
120 | } | |
121 | ||
122 | renorm_cabac_encoder(c); | |
123 | ||
124 | #ifdef STRICT_LIMITS | |
125 | c->symCount++; | |
126 | #endif | |
127 | } | |
128 | ||
61ccfcc0 MN |
129 | /** |
130 | * @param bit 0 -> write zero bit, !=0 write one bit | |
131 | */ | |
938dd846 | 132 | static void put_cabac_bypass(CABACContext *c, int bit){ |
d592f67f MN |
133 | c->low += c->low; |
134 | ||
135 | if(bit){ | |
136 | c->low += c->range; | |
137 | } | |
138 | //FIXME optimize | |
139 | if(c->low<0x200){ | |
140 | put_cabac_bit(c, 0); | |
141 | }else if(c->low<0x400){ | |
142 | c->outstanding_count++; | |
143 | c->low -= 0x200; | |
144 | }else{ | |
145 | put_cabac_bit(c, 1); | |
146 | c->low -= 0x400; | |
147 | } | |
115329f1 | 148 | |
d592f67f MN |
149 | #ifdef STRICT_LIMITS |
150 | c->symCount++; | |
151 | #endif | |
152 | } | |
153 | ||
5e20f836 MN |
154 | /** |
155 | * | |
156 | * @return the number of bytes written | |
157 | */ | |
938dd846 | 158 | static int put_cabac_terminate(CABACContext *c, int bit){ |
d592f67f MN |
159 | c->range -= 2; |
160 | ||
161 | if(!bit){ | |
162 | renorm_cabac_encoder(c); | |
163 | }else{ | |
164 | c->low += c->range; | |
165 | c->range= 2; | |
115329f1 | 166 | |
d592f67f MN |
167 | renorm_cabac_encoder(c); |
168 | ||
169 | assert(c->low <= 0x1FF); | |
170 | put_cabac_bit(c, c->low>>9); | |
171 | put_bits(&c->pb, 2, ((c->low>>7)&3)|1); | |
115329f1 | 172 | |
d592f67f MN |
173 | flush_put_bits(&c->pb); //FIXME FIXME FIXME XXX wrong |
174 | } | |
115329f1 | 175 | |
d592f67f MN |
176 | #ifdef STRICT_LIMITS |
177 | c->symCount++; | |
178 | #endif | |
5e20f836 | 179 | |
b46243ed | 180 | return (put_bits_count(&c->pb)+7)>>3; |
d592f67f MN |
181 | } |
182 | ||
61ccfcc0 MN |
183 | /** |
184 | * put (truncated) unary binarization. | |
185 | */ | |
938dd846 | 186 | static void put_cabac_u(CABACContext *c, uint8_t * state, int v, int max, int max_index, int truncated){ |
61ccfcc0 | 187 | int i; |
115329f1 | 188 | |
61ccfcc0 | 189 | assert(v <= max); |
115329f1 | 190 | |
61ccfcc0 MN |
191 | #if 1 |
192 | for(i=0; i<v; i++){ | |
193 | put_cabac(c, state, 1); | |
194 | if(i < max_index) state++; | |
195 | } | |
196 | if(truncated==0 || v<max) | |
197 | put_cabac(c, state, 0); | |
198 | #else | |
199 | if(v <= max_index){ | |
200 | for(i=0; i<v; i++){ | |
201 | put_cabac(c, state+i, 1); | |
202 | } | |
203 | if(truncated==0 || v<max) | |
204 | put_cabac(c, state+i, 0); | |
205 | }else{ | |
206 | for(i=0; i<=max_index; i++){ | |
207 | put_cabac(c, state+i, 1); | |
208 | } | |
209 | for(; i<v; i++){ | |
210 | put_cabac(c, state+max_index, 1); | |
211 | } | |
212 | if(truncated==0 || v<max) | |
213 | put_cabac(c, state+max_index, 0); | |
214 | } | |
215 | #endif | |
216 | } | |
217 | ||
218 | /** | |
219 | * put unary exp golomb k-th order binarization. | |
220 | */ | |
938dd846 | 221 | static void put_cabac_ueg(CABACContext *c, uint8_t * state, int v, int max, int is_signed, int k, int max_index){ |
61ccfcc0 | 222 | int i; |
115329f1 | 223 | |
61ccfcc0 MN |
224 | if(v==0) |
225 | put_cabac(c, state, 0); | |
226 | else{ | |
8f8c0800 | 227 | const int sign= v < 0; |
115329f1 | 228 | |
c26abfa5 | 229 | if(is_signed) v= FFABS(v); |
115329f1 | 230 | |
61ccfcc0 MN |
231 | if(v<max){ |
232 | for(i=0; i<v; i++){ | |
233 | put_cabac(c, state, 1); | |
234 | if(i < max_index) state++; | |
235 | } | |
236 | ||
237 | put_cabac(c, state, 0); | |
238 | }else{ | |
239 | int m= 1<<k; | |
240 | ||
241 | for(i=0; i<max; i++){ | |
242 | put_cabac(c, state, 1); | |
243 | if(i < max_index) state++; | |
244 | } | |
245 | ||
246 | v -= max; | |
247 | while(v >= m){ //FIXME optimize | |
248 | put_cabac_bypass(c, 1); | |
249 | v-= m; | |
250 | m+= m; | |
251 | } | |
252 | put_cabac_bypass(c, 0); | |
253 | while(m>>=1){ | |
254 | put_cabac_bypass(c, v&m); | |
255 | } | |
256 | } | |
257 | ||
258 | if(is_signed) | |
259 | put_cabac_bypass(c, sign); | |
260 | } | |
261 | } | |
262 | ||
ec7eb896 | 263 | static void refill(CABACContext *c){ |
ec7eb896 | 264 | #if CABAC_BITS == 16 |
2ae7569d | 265 | c->low+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); |
ec7eb896 MN |
266 | #else |
267 | c->low+= c->bytestream[0]<<1; | |
268 | #endif | |
269 | c->low -= CABAC_MASK; | |
270 | c->bytestream+= CABAC_BITS/8; | |
271 | } | |
272 | ||
273 | static void refill2(CABACContext *c){ | |
274 | int i, x; | |
275 | ||
276 | x= c->low ^ (c->low-1); | |
f24a5159 | 277 | i= 7 - ff_h264_norm_shift[x>>(CABAC_BITS-1)]; |
ec7eb896 MN |
278 | |
279 | x= -CABAC_MASK; | |
115329f1 | 280 | |
ec7eb896 MN |
281 | #if CABAC_BITS == 16 |
282 | x+= (c->bytestream[0]<<9) + (c->bytestream[1]<<1); | |
283 | #else | |
284 | x+= c->bytestream[0]<<1; | |
285 | #endif | |
115329f1 | 286 | |
ec7eb896 MN |
287 | c->low += x<<i; |
288 | c->bytestream+= CABAC_BITS/8; | |
289 | } | |
ec7eb896 | 290 | |
d592f67f | 291 | static inline void renorm_cabac_decoder(CABACContext *c){ |
f24a5159 | 292 | while(c->range < 0x100){ |
d592f67f MN |
293 | c->range+= c->range; |
294 | c->low+= c->low; | |
ec7eb896 MN |
295 | if(!(c->low & CABAC_MASK)) |
296 | refill(c); | |
d592f67f MN |
297 | } |
298 | } | |
299 | ||
/**
 * Renormalize the decoder by at most one step: if range is below 0x100,
 * range and low are both doubled; afterwards low is refilled when its low
 * CABAC_BITS bits are all zero.
 *
 * The inline-asm variants are only compiled when ARCH_X86_DISABLED is
 * defined; the #define near the top of this file is commented out, so the
 * portable C path at the end is the one normally built. The "P3:/athlon:"
 * comments are presumably the author's benchmark figures for each variant.
 */
ec7eb896 | 300 | static inline void renorm_cabac_decoder_once(CABACContext *c){ |
400d0f8e | 301 | #ifdef ARCH_X86_DISABLED |
ec8f483a MN |
302 | int temp; |
303 | #if 0 | |
4310580d | 304 | //P3:683 athlon:475 |
ec8f483a | 305 | asm( |
f24a5159 | 306 | "lea -0x100(%0), %2 \n\t" |
ec8f483a MN |
307 | "shr $31, %2 \n\t" //FIXME 31->63 for x86-64 |
308 | "shl %%cl, %0 \n\t" | |
309 | "shl %%cl, %1 \n\t" | |
310 | : "+r"(c->range), "+r"(c->low), "+c"(temp) | |
311 | ); | |
312 | #elif 0 | |
4310580d | 313 | //P3:680 athlon:474 |
ec8f483a | 314 | asm( |
f24a5159 | 315 | "cmp $0x100, %0 \n\t" |
ec8f483a MN |
316 | "setb %%cl \n\t" //FIXME 31->63 for x86-64 |
317 | "shl %%cl, %0 \n\t" | |
318 | "shl %%cl, %1 \n\t" | |
319 | : "+r"(c->range), "+r"(c->low), "+c"(temp) | |
320 | ); | |
321 | #elif 1 | |
322 | int temp2; | |
4310580d | 323 | //P3:665 athlon:517 |
ec8f483a | 324 | asm( |
f24a5159 | 325 | "lea -0x100(%0), %%eax \n\t" |
ec8f483a MN |
326 | "cdq \n\t" |
327 | "mov %0, %%eax \n\t" | |
328 | "and %%edx, %0 \n\t" | |
329 | "and %1, %%edx \n\t" | |
330 | "add %%eax, %0 \n\t" | |
331 | "add %%edx, %1 \n\t" | |
332 | : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) | |
333 | ); | |
334 | #elif 0 | |
335 | int temp2; | |
4310580d | 336 | //P3:673 athlon:509 |
ec8f483a | 337 | asm( |
f24a5159 | 338 | "cmp $0x100, %0 \n\t" |
ec8f483a MN |
339 | "sbb %%edx, %%edx \n\t" |
340 | "mov %0, %%eax \n\t" | |
341 | "and %%edx, %0 \n\t" | |
342 | "and %1, %%edx \n\t" | |
343 | "add %%eax, %0 \n\t" | |
344 | "add %%edx, %1 \n\t" | |
345 | : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) | |
346 | ); | |
347 | #else | |
348 | int temp2; | |
4310580d | 349 | //P3:677 athlon:511 |
ec8f483a | 350 | asm( |
f24a5159 | 351 | "cmp $0x100, %0 \n\t" |
ec8f483a MN |
352 | "lea (%0, %0), %%eax \n\t" |
353 | "lea (%1, %1), %%edx \n\t" | |
354 | "cmovb %%eax, %0 \n\t" | |
355 | "cmovb %%edx, %1 \n\t" | |
356 | : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) | |
357 | ); | |
358 | #endif | |
359 | #else | |
/* Portable path: shift is 1 when range < 0x100 (the subtraction goes
   negative, so bit 31 is set) and 0 otherwise. */
4310580d | 360 | //P3:675 athlon:476 |
f24a5159 | 361 | int shift= (uint32_t)(c->range - 0x100)>>31; |
bfe328ca LM |
362 | c->range<<= shift; |
363 | c->low <<= shift; | |
ec8f483a | 364 | #endif |
ec7eb896 MN |
365 | if(!(c->low & CABAC_MASK)) |
366 | refill(c); | |
367 | } | |
368 | ||
/**
 * Decode one context-coded bin.
 *
 * @param c     decoder context; low, range and the bytestream pointer are updated
 * @param state context state byte, replaced by its successor state
 * @return the decoded bin (0 or 1)
 *
 * Three things in this function are also used by later code in the file:
 *  - LOW/RANGE/BYTESTART/BYTE/BYTEEND are byte offsets of CABACContext
 *    members, written as string literals for pasting into inline asm.
 *  - BRANCHLESS_GET_CABAC_UPDATE and BRANCHLESS_GET_CABAC are asm fragments,
 *    defined only in the ARCH_X86_32 non-PIC branchless build, and reused by
 *    the decode_significance*_x86 functions.
 *  - BRANCHLESS_CABAC_DECODER is defined to 1 near the top of this file, so
 *    the "#ifndef BRANCHLESS_CABAC_DECODER" branches are not compiled unless
 *    that define is removed.
 *
 * The asm implementation is selected only for ARCH_X86_32 when not building
 * PIC code with GCC (it clobbers ebx); all other builds use the C code at
 * the end of the function.
 */
851ded89 | 369 | static int always_inline get_cabac_inline(CABACContext *c, uint8_t * const state){ |
bfe328ca | 370 | //FIXME gcc generates duplicate load/stores for c->low and c->range |
f7d0b683 MN |
371 | #define LOW "0" |
372 | #define RANGE "4" | |
755073fe RD |
373 | #ifdef ARCH_X86_64 |
374 | #define BYTESTART "16" | |
375 | #define BYTE "24" | |
376 | #define BYTEEND "32" | |
377 | #else | |
d61c4e73 MN |
378 | #define BYTESTART "12" |
379 | #define BYTE "16" | |
380 | #define BYTEEND "20" | |
755073fe | 381 | #endif |
419b8784 | 382 | #if defined(ARCH_X86_32) && !(defined(PIC) && defined(__GNUC__)) |
ba9fb5da BR |
383 | int bit; |
384 | ||
0bc2e7f0 | 385 | #ifndef BRANCHLESS_CABAC_DECODER |
f7d0b683 | 386 | asm volatile( |
4041a495 | 387 | "movzbl (%1), %0 \n\t" |
f7d0b683 MN |
388 | "movl "RANGE "(%2), %%ebx \n\t" |
389 | "movl "RANGE "(%2), %%edx \n\t" | |
f24a5159 | 390 | "andl $0xC0, %%ebx \n\t" |
4041a495 | 391 | "movzbl "MANGLE(ff_h264_lps_range)"(%0, %%ebx, 2), %%esi\n\t" |
f7d0b683 MN |
392 | "movl "LOW "(%2), %%ebx \n\t" |
393 | //eax:state ebx:low, edx:range, esi:RangeLPS | |
394 | "subl %%esi, %%edx \n\t" | |
f24a5159 MN |
395 | "movl %%edx, %%ecx \n\t" |
396 | "shll $17, %%ecx \n\t" | |
397 | "cmpl %%ecx, %%ebx \n\t" | |
f7d0b683 | 398 | " ja 1f \n\t" |
1f4d5e9f MN |
399 | |
400 | #if 1 | |
401 | //athlon:4067 P3:4110 | |
f24a5159 | 402 | "lea -0x100(%%edx), %%ecx \n\t" |
1f4d5e9f MN |
403 | "shr $31, %%ecx \n\t" |
404 | "shl %%cl, %%edx \n\t" | |
405 | "shl %%cl, %%ebx \n\t" | |
406 | #else | |
407 | //athlon:4057 P3:4130 | |
f24a5159 | 408 | "cmp $0x100, %%edx \n\t" //FIXME avoidable |
f7d0b683 MN |
409 | "setb %%cl \n\t" |
410 | "shl %%cl, %%edx \n\t" | |
411 | "shl %%cl, %%ebx \n\t" | |
1f4d5e9f | 412 | #endif |
4041a495 | 413 | "movzbl "MANGLE(ff_h264_mps_state)"(%0), %%ecx \n\t" |
f7d0b683 MN |
414 | "movb %%cl, (%1) \n\t" |
415 | //eax:state ebx:low, edx:range, esi:RangeLPS | |
416 | "test %%bx, %%bx \n\t" | |
417 | " jnz 2f \n\t" | |
755073fe | 418 | "mov "BYTE "(%2), %%"REG_S" \n\t" |
f7d0b683 | 419 | "subl $0xFFFF, %%ebx \n\t" |
755073fe | 420 | "movzwl (%%"REG_S"), %%ecx \n\t" |
f7d0b683 MN |
421 | "bswap %%ecx \n\t" |
422 | "shrl $15, %%ecx \n\t" | |
755073fe | 423 | "add $2, %%"REG_S" \n\t" |
f7d0b683 | 424 | "addl %%ecx, %%ebx \n\t" |
755073fe | 425 | "mov %%"REG_S", "BYTE "(%2) \n\t" |
f7d0b683 MN |
426 | "jmp 2f \n\t" |
427 | "1: \n\t" | |
428 | //eax:state ebx:low, edx:range, esi:RangeLPS | |
f24a5159 | 429 | "subl %%ecx, %%ebx \n\t" |
f7d0b683 | 430 | "movl %%esi, %%edx \n\t" |
a6672acf | 431 | "movzbl " MANGLE(ff_h264_norm_shift) "(%%esi), %%ecx \n\t" |
f7d0b683 MN |
432 | "shll %%cl, %%ebx \n\t" |
433 | "shll %%cl, %%edx \n\t" | |
4041a495 | 434 | "movzbl "MANGLE(ff_h264_lps_state)"(%0), %%ecx \n\t" |
f24a5159 | 435 | "movb %%cl, (%1) \n\t" |
755073fe | 436 | "add $1, %0 \n\t" |
f7d0b683 MN |
437 | "test %%bx, %%bx \n\t" |
438 | " jnz 2f \n\t" | |
439 | ||
755073fe RD |
440 | "mov "BYTE "(%2), %%"REG_c" \n\t" |
441 | "movzwl (%%"REG_c"), %%esi \n\t" | |
f7d0b683 MN |
442 | "bswap %%esi \n\t" |
443 | "shrl $15, %%esi \n\t" | |
444 | "subl $0xFFFF, %%esi \n\t" | |
755073fe RD |
445 | "add $2, %%"REG_c" \n\t" |
446 | "mov %%"REG_c", "BYTE "(%2) \n\t" | |
f7d0b683 MN |
447 | |
448 | "leal -1(%%ebx), %%ecx \n\t" | |
449 | "xorl %%ebx, %%ecx \n\t" | |
f24a5159 | 450 | "shrl $15, %%ecx \n\t" |
a6672acf | 451 | "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t" |
d17faef0 MN |
452 | "neg %%ecx \n\t" |
453 | "add $7, %%ecx \n\t" | |
f7d0b683 MN |
454 | |
455 | "shll %%cl , %%esi \n\t" | |
456 | "addl %%esi, %%ebx \n\t" | |
457 | "2: \n\t" | |
458 | "movl %%edx, "RANGE "(%2) \n\t" | |
459 | "movl %%ebx, "LOW "(%2) \n\t" | |
f7d0b683 MN |
460 | :"=&a"(bit) //FIXME this is fragile gcc either runs out of registers or miscompiles it (for example if "+a"(bit) or "+m"(*state) is used |
461 | :"r"(state), "r"(c) | |
755073fe | 462 | : "%"REG_c, "%ebx", "%edx", "%"REG_S, "memory" |
f7d0b683 | 463 | ); |
9ed92c65 | 464 | bit&=1; |
a0490b32 | 465 | #else /* BRANCHLESS_CABAC_DECODER */ |
13404b2e MN |
466 | |
467 | ||
94e4c3a3 | 468 | #if defined CMOV_IS_FAST |
13404b2e MN |
469 | #define BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\ |
470 | "mov "tmp" , %%ecx \n\t"\ | |
471 | "shl $17 , "tmp" \n\t"\ | |
472 | "cmp "low" , "tmp" \n\t"\ | |
473 | "cmova %%ecx , "range" \n\t"\ | |
474 | "sbb %%ecx , %%ecx \n\t"\ | |
475 | "and %%ecx , "tmp" \n\t"\ | |
476 | "sub "tmp" , "low" \n\t"\ | |
477 | "xor %%ecx , "ret" \n\t" | |
a0490b32 | 478 | #else /* CMOV_IS_FAST */ |
13404b2e MN |
479 | #define BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\ |
480 | "mov "tmp" , %%ecx \n\t"\ | |
481 | "shl $17 , "tmp" \n\t"\ | |
482 | "sub "low" , "tmp" \n\t"\ | |
483 | "sar $31 , "tmp" \n\t" /*lps_mask*/\ | |
484 | "sub %%ecx , "range" \n\t" /*RangeLPS - range*/\ | |
485 | "and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\ | |
486 | "add %%ecx , "range" \n\t" /*new range*/\ | |
487 | "shl $17 , %%ecx \n\t"\ | |
488 | "and "tmp" , %%ecx \n\t"\ | |
489 | "sub %%ecx , "low" \n\t"\ | |
490 | "xor "tmp" , "ret" \n\t" | |
a0490b32 | 491 | #endif /* CMOV_IS_FAST */ |
ef0090a9 | 492 | |
ef0090a9 | 493 | |
13404b2e MN |
494 | #define BRANCHLESS_GET_CABAC(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\ |
495 | "movzbl "statep" , "ret" \n\t"\ | |
496 | "mov "range" , "tmp" \n\t"\ | |
497 | "and $0xC0 , "range" \n\t"\ | |
498 | "movzbl "MANGLE(ff_h264_lps_range)"("ret", "range", 2), "range" \n\t"\ | |
499 | "sub "range" , "tmp" \n\t"\ | |
500 | BRANCHLESS_GET_CABAC_UPDATE(ret, cabac, statep, low, lowword, range, tmp, tmpbyte)\ | |
501 | "movzbl " MANGLE(ff_h264_norm_shift) "("range"), %%ecx \n\t"\ | |
502 | "shl %%cl , "range" \n\t"\ | |
503 | "movzbl "MANGLE(ff_h264_mlps_state)"+128("ret"), "tmp" \n\t"\ | |
504 | "mov "tmpbyte" , "statep" \n\t"\ | |
505 | "shl %%cl , "low" \n\t"\ | |
506 | "test "lowword" , "lowword" \n\t"\ | |
507 | " jnz 1f \n\t"\ | |
755073fe RD |
508 | "mov "BYTE"("cabac"), %%"REG_c" \n\t"\ |
509 | "movzwl (%%"REG_c") , "tmp" \n\t"\ | |
13404b2e MN |
510 | "bswap "tmp" \n\t"\ |
511 | "shr $15 , "tmp" \n\t"\ | |
512 | "sub $0xFFFF , "tmp" \n\t"\ | |
755073fe RD |
513 | "add $2 , %%"REG_c" \n\t"\ |
514 | "mov %%"REG_c" , "BYTE "("cabac") \n\t"\ | |
13404b2e MN |
515 | "lea -1("low") , %%ecx \n\t"\ |
516 | "xor "low" , %%ecx \n\t"\ | |
517 | "shr $15 , %%ecx \n\t"\ | |
518 | "movzbl " MANGLE(ff_h264_norm_shift) "(%%ecx), %%ecx \n\t"\ | |
519 | "neg %%ecx \n\t"\ | |
520 | "add $7 , %%ecx \n\t"\ | |
521 | "shl %%cl , "tmp" \n\t"\ | |
522 | "add "tmp" , "low" \n\t"\ | |
523 | "1: \n\t" | |
ef0090a9 | 524 | |
13404b2e MN |
525 | asm volatile( |
526 | "movl "RANGE "(%2), %%esi \n\t" | |
527 | "movl "LOW "(%2), %%ebx \n\t" | |
528 | BRANCHLESS_GET_CABAC("%0", "%2", "(%1)", "%%ebx", "%%bx", "%%esi", "%%edx", "%%dl") | |
f24a5159 | 529 | "movl %%esi, "RANGE "(%2) \n\t" |
b99f3cab | 530 | "movl %%ebx, "LOW "(%2) \n\t" |
ef0090a9 | 531 | |
ef0090a9 MN |
532 | :"=&a"(bit) |
533 | :"r"(state), "r"(c) | |
755073fe | 534 | : "%"REG_c, "%ebx", "%edx", "%esi", "memory" |
ef0090a9 | 535 | ); |
f1b37db4 | 536 | bit&=1; |
a0490b32 | 537 | #endif /* BRANCHLESS_CABAC_DECODER */ |
419b8784 | 538 | #else /* defined(ARCH_X86_32) && !(defined(PIC) && defined(__GNUC__)) */ |
/* Portable C implementation. */
bfe328ca | 539 | int s = *state; |
a0f2c6ba | 540 | int RangeLPS= ff_h264_lps_range[2*(c->range&0xC0) + s]; |
88730be6 | 541 | int bit, lps_mask attribute_unused; |
115329f1 | 542 | |
d592f67f | 543 | c->range -= RangeLPS; |
0bc2e7f0 | 544 | #ifndef BRANCHLESS_CABAC_DECODER |
f24a5159 | 545 | if(c->low < (c->range<<17)){ |
bfe328ca | 546 | bit= s&1; |
d61c4e73 | 547 | *state= ff_h264_mps_state[s]; |
ec7eb896 | 548 | renorm_cabac_decoder_once(c); |
d592f67f | 549 | }else{ |
f24a5159 MN |
550 | bit= ff_h264_norm_shift[RangeLPS]; |
551 | c->low -= (c->range<<17); | |
d61c4e73 | 552 | *state= ff_h264_lps_state[s]; |
260ceb63 MN |
553 | c->range = RangeLPS<<bit; |
554 | c->low <<= bit; | |
555 | bit= (s&1)^1; | |
556 | ||
ec7eb896 MN |
557 | if(!(c->low & 0xFFFF)){ |
558 | refill2(c); | |
260ceb63 | 559 | } |
d592f67f | 560 | } |
a0490b32 | 561 | #else /* BRANCHLESS_CABAC_DECODER */ |
/* lps_mask is all ones when (range<<17) - low is negative, otherwise 0;
   it selects the LPS updates below without branching. */
f24a5159 | 562 | lps_mask= ((c->range<<17) - c->low)>>31; |
115329f1 | 563 | |
f24a5159 | 564 | c->low -= (c->range<<17) & lps_mask; |
ec7eb896 | 565 | c->range += (RangeLPS - c->range) & lps_mask; |
115329f1 | 566 | |
/* On the LPS path s becomes ~s (negative); the table is indexed from its
   middle (+128) so one lookup serves both the MPS and LPS transitions. */
2e1aee80 | 567 | s^=lps_mask; |
68a205ed | 568 | *state= (ff_h264_mlps_state+128)[s]; |
2e1aee80 | 569 | bit= s&1; |
115329f1 | 570 | |
/* lps_mask is reused here as the renormalization shift count. */
f24a5159 | 571 | lps_mask= ff_h264_norm_shift[c->range]; |
ec7eb896 MN |
572 | c->range<<= lps_mask; |
573 | c->low <<= lps_mask; | |
574 | if(!(c->low & CABAC_MASK)) | |
575 | refill2(c); | |
a0490b32 | 576 | #endif /* BRANCHLESS_CABAC_DECODER */ |
419b8784 | 577 | #endif /* defined(ARCH_X86_32) && !(defined(PIC) && defined(__GNUC__)) */ |
115329f1 | 578 | return bit; |
d592f67f MN |
579 | } |
580 | ||
851ded89 MN |
581 | static int __attribute((noinline)) get_cabac_noinline(CABACContext *c, uint8_t * const state){ |
582 | return get_cabac_inline(c,state); | |
583 | } | |
584 | ||
585 | static int get_cabac(CABACContext *c, uint8_t * const state){ | |
586 | return get_cabac_inline(c,state); | |
587 | } | |
588 | ||
938dd846 | 589 | static int get_cabac_bypass(CABACContext *c){ |
ebd624b6 MN |
590 | #if 0 //not faster |
591 | int bit; | |
592 | asm volatile( | |
593 | "movl "RANGE "(%1), %%ebx \n\t" | |
594 | "movl "LOW "(%1), %%eax \n\t" | |
595 | "shl $17, %%ebx \n\t" | |
596 | "add %%eax, %%eax \n\t" | |
597 | "sub %%ebx, %%eax \n\t" | |
598 | "cdq \n\t" | |
599 | "and %%edx, %%ebx \n\t" | |
600 | "add %%ebx, %%eax \n\t" | |
601 | "test %%ax, %%ax \n\t" | |
602 | " jnz 1f \n\t" | |
755073fe | 603 | "movl "BYTE "(%1), %%"REG_b" \n\t" |
ebd624b6 | 604 | "subl $0xFFFF, %%eax \n\t" |
755073fe | 605 | "movzwl (%%"REG_b"), %%ecx \n\t" |
ebd624b6 MN |
606 | "bswap %%ecx \n\t" |
607 | "shrl $15, %%ecx \n\t" | |
755073fe | 608 | "addl $2, %%"REG_b" \n\t" |
ebd624b6 | 609 | "addl %%ecx, %%eax \n\t" |
755073fe | 610 | "movl %%"REG_b", "BYTE "(%1) \n\t" |
ebd624b6 MN |
611 | "1: \n\t" |
612 | "movl %%eax, "LOW "(%1) \n\t" | |
613 | ||
614 | :"=&d"(bit) | |
615 | :"r"(c) | |
755073fe | 616 | : "%eax", "%"REG_b, "%ecx", "memory" |
ebd624b6 MN |
617 | ); |
618 | return bit+1; | |
619 | #else | |
f24a5159 | 620 | int range; |
d592f67f MN |
621 | c->low += c->low; |
622 | ||
ec7eb896 MN |
623 | if(!(c->low & CABAC_MASK)) |
624 | refill(c); | |
115329f1 | 625 | |
f24a5159 MN |
626 | range= c->range<<17; |
627 | if(c->low < range){ | |
d592f67f MN |
628 | return 0; |
629 | }else{ | |
f24a5159 | 630 | c->low -= range; |
d592f67f MN |
631 | return 1; |
632 | } | |
ebd624b6 MN |
633 | #endif |
634 | } | |
635 | ||
636 | ||
637 | static always_inline int get_cabac_bypass_sign(CABACContext *c, int val){ | |
638 | #ifdef ARCH_X86 | |
ebd624b6 MN |
639 | asm volatile( |
640 | "movl "RANGE "(%1), %%ebx \n\t" | |
641 | "movl "LOW "(%1), %%eax \n\t" | |
642 | "shl $17, %%ebx \n\t" | |
643 | "add %%eax, %%eax \n\t" | |
644 | "sub %%ebx, %%eax \n\t" | |
645 | "cdq \n\t" | |
646 | "and %%edx, %%ebx \n\t" | |
647 | "add %%ebx, %%eax \n\t" | |
648 | "xor %%edx, %%ecx \n\t" | |
649 | "sub %%edx, %%ecx \n\t" | |
650 | "test %%ax, %%ax \n\t" | |
651 | " jnz 1f \n\t" | |
755073fe | 652 | "mov "BYTE "(%1), %%"REG_b" \n\t" |
ebd624b6 | 653 | "subl $0xFFFF, %%eax \n\t" |
755073fe | 654 | "movzwl (%%"REG_b"), %%edx \n\t" |
ebd624b6 MN |
655 | "bswap %%edx \n\t" |
656 | "shrl $15, %%edx \n\t" | |
755073fe | 657 | "add $2, %%"REG_b" \n\t" |
ebd624b6 | 658 | "addl %%edx, %%eax \n\t" |
755073fe | 659 | "mov %%"REG_b", "BYTE "(%1) \n\t" |
ebd624b6 MN |
660 | "1: \n\t" |
661 | "movl %%eax, "LOW "(%1) \n\t" | |
662 | ||
663 | :"+c"(val) | |
664 | :"r"(c) | |
755073fe | 665 | : "%eax", "%"REG_b, "%edx", "memory" |
ebd624b6 MN |
666 | ); |
667 | return val; | |
668 | #else | |
669 | int range, mask; | |
670 | c->low += c->low; | |
671 | ||
672 | if(!(c->low & CABAC_MASK)) | |
673 | refill(c); | |
674 | ||
675 | range= c->range<<17; | |
676 | c->low -= range; | |
677 | mask= c->low >> 31; | |
678 | range &= mask; | |
679 | c->low += range; | |
680 | return (val^mask)-mask; | |
681 | #endif | |
d592f67f | 682 | } |
ebd624b6 | 683 | |
eb73bf72 MN |
684 | //FIXME the x86 code from this file should be moved into i386/h264 or cabac something.c/h (note I'll kill you if you move my code away from under my fingers before I am finished with it!) |
685 | //FIXME use some macros to avoid duplicating get_cabac (can't be done yet as that would make optimization work hard) | |
419b8784 | 686 | #if defined(ARCH_X86_32) && !(defined(PIC) && defined(__GNUC__)) |
eb73bf72 MN |
/**
 * x86-32 asm loop that decodes a block's significance map.
 *
 * For each context from significant_coeff_ctx_base up to (but excluding)
 * significant_coeff_ctx_base + max_coeff - 1, one bin is decoded with the
 * context at that address. If it is set, a second bin is decoded with the
 * context 61 bytes further on (presumably the matching "last significant"
 * context -- confirm against the caller), the position
 * (address - significant_coeff_ctx_base) is stored through index, and the
 * loop ends if the second bin is set; otherwise index advances by one entry.
 * If the loop runs to completion, the final position is stored as well.
 *
 * Pointers are cast to int (minusstart, minusindex), which is only valid on
 * 32-bit targets, matching the surrounding ARCH_X86_32 guard.
 *
 * @param c      decoder context; range/low are kept in esi/ebx during the loop
 * @param index  output array receiving the positions
 * @return the number of positions written to index
 */
687 | static int decode_significance_x86(CABACContext *c, int max_coeff, uint8_t *significant_coeff_ctx_base, int *index){ |
688 | void *end= significant_coeff_ctx_base + max_coeff - 1; | |
689 | int minusstart= -(int)significant_coeff_ctx_base; | |
849a5004 | 690 | int minusindex= 4-(int)index; |
eb73bf72 MN |
691 | int coeff_count; |
692 | asm volatile( | |
693 | "movl "RANGE "(%3), %%esi \n\t" | |
694 | "movl "LOW "(%3), %%ebx \n\t" | |
695 | ||
696 | "2: \n\t" | |
697 | ||
a616db28 | 698 | BRANCHLESS_GET_CABAC("%%edx", "%3", "(%1)", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al") |
eb73bf72 | 699 | |
a616db28 | 700 | "test $1, %%edx \n\t" |
eb73bf72 MN |
701 | " jz 3f \n\t" |
702 | ||
a616db28 MN |
703 | BRANCHLESS_GET_CABAC("%%edx", "%3", "61(%1)", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al") |
704 | ||
755073fe | 705 | "mov %2, %%"REG_a" \n\t" |
eb73bf72 | 706 | "movl %4, %%ecx \n\t" |
755073fe RD |
707 | "add %1, %%"REG_c" \n\t" |
708 | "movl %%ecx, (%%"REG_a") \n\t" | |
eb73bf72 | 709 | |
a616db28 | 710 | "test $1, %%edx \n\t" |
eb73bf72 MN |
711 | " jnz 4f \n\t" |
712 | ||
755073fe RD |
713 | "add $4, %%"REG_a" \n\t" |
714 | "mov %%"REG_a", %2 \n\t" | |
d3e7c5c3 | 715 | |
eb73bf72 | 716 | "3: \n\t" |
755073fe RD |
717 | "add $1, %1 \n\t" |
718 | "cmp %5, %1 \n\t" | |
eb73bf72 | 719 | " jb 2b \n\t" |
755073fe | 720 | "mov %2, %%"REG_a" \n\t" |
eb73bf72 | 721 | "movl %4, %%ecx \n\t" |
755073fe RD |
722 | "add %1, %%"REG_c" \n\t" |
723 | "movl %%ecx, (%%"REG_a") \n\t" | |
eb73bf72 | 724 | "4: \n\t" |
755073fe | 725 | "add %6, %%eax \n\t" |
eb73bf72 MN |
726 | "shr $2, %%eax \n\t" |
727 | ||
728 | "movl %%esi, "RANGE "(%3) \n\t" | |
729 | "movl %%ebx, "LOW "(%3) \n\t" | |
730 | :"=&a"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index)\ | |
731 | :"r"(c), "m"(minusstart), "m"(end), "m"(minusindex)\ | |
755073fe | 732 | : "%"REG_c, "%ebx", "%edx", "%esi", "memory"\ |
eb73bf72 MN |
733 | ); |
734 | return coeff_count; | |
735 | } | |
e08f5806 MN |
736 | |
/**
 * x86-32 asm loop that decodes the significance map of an 8x8 block.
 *
 * The loop counter ("last") runs from 0 to 62. For each position, a bin is
 * decoded using the context at significant_coeff_ctx_base + sig_off[position].
 * If it is set, a second bin is decoded using the context at
 * significant_coeff_ctx_base + last_coeff_flag_offset_8x8[position] + 15
 * (last_coeff_flag_offset_8x8 is a table defined outside this file), the
 * position is stored through index, and the loop ends if the second bin is
 * set; otherwise index advances by one entry. If the loop runs to completion,
 * position 63 is stored as well.
 *
 * minusindex casts a pointer to int, which is only valid on 32-bit targets,
 * matching the surrounding ARCH_X86_32 guard.
 *
 * @param c       decoder context; range/low are kept in esi/ebx during the loop
 * @param index   output array receiving the positions
 * @param sig_off per-position context offset table for the first bin
 * @return the number of positions written to index
 */
737 | static int decode_significance_8x8_x86(CABACContext *c, uint8_t *significant_coeff_ctx_base, int *index, uint8_t *sig_off){ |
738 | int minusindex= 4-(int)index; | |
739 | int coeff_count; | |
755073fe | 740 | long last=0; |
e08f5806 MN |
741 | asm volatile( |
742 | "movl "RANGE "(%3), %%esi \n\t" | |
743 | "movl "LOW "(%3), %%ebx \n\t" | |
744 | ||
755073fe | 745 | "mov %1, %%"REG_D" \n\t" |
e08f5806 MN |
746 | "2: \n\t" |
747 | ||
755073fe RD |
748 | "mov %6, %%"REG_a" \n\t" |
749 | "movzbl (%%"REG_a", %%"REG_D"), %%edi \n\t" | |
750 | "add %5, %%"REG_D" \n\t" | |
e08f5806 | 751 | |
755073fe | 752 | BRANCHLESS_GET_CABAC("%%edx", "%3", "(%%"REG_D")", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al") |
e08f5806 MN |
753 | |
754 | "mov %1, %%edi \n\t" | |
755 | "test $1, %%edx \n\t" | |
756 | " jz 3f \n\t" | |
757 | ||
758 | "movzbl "MANGLE(last_coeff_flag_offset_8x8)"(%%edi), %%edi\n\t" | |
755073fe | 759 | "add %5, %%"REG_D" \n\t" |
e08f5806 | 760 | |
755073fe | 761 | BRANCHLESS_GET_CABAC("%%edx", "%3", "15(%%"REG_D")", "%%ebx", "%%bx", "%%esi", "%%eax", "%%al") |
e08f5806 | 762 | |
755073fe | 763 | "mov %2, %%"REG_a" \n\t" |
e08f5806 | 764 | "mov %1, %%edi \n\t" |
755073fe | 765 | "movl %%edi, (%%"REG_a") \n\t" |
e08f5806 MN |
766 | |
767 | "test $1, %%edx \n\t" | |
768 | " jnz 4f \n\t" | |
769 | ||
755073fe RD |
770 | "add $4, %%"REG_a" \n\t" |
771 | "mov %%"REG_a", %2 \n\t" | |
e08f5806 MN |
772 | |
773 | "3: \n\t" | |
774 | "addl $1, %%edi \n\t" | |
775 | "mov %%edi, %1 \n\t" | |
776 | "cmpl $63, %%edi \n\t" | |
777 | " jb 2b \n\t" | |
755073fe RD |
778 | "mov %2, %%"REG_a" \n\t" |
779 | "movl %%edi, (%%"REG_a") \n\t" | |
e08f5806 MN |
780 | "4: \n\t" |
781 | "addl %4, %%eax \n\t" | |
782 | "shr $2, %%eax \n\t" | |
783 | ||
784 | "movl %%esi, "RANGE "(%3) \n\t" | |
785 | "movl %%ebx, "LOW "(%3) \n\t" | |
786 | :"=&a"(coeff_count),"+m"(last), "+m"(index)\ | |
787 | :"r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), "m"(sig_off)\ | |
755073fe | 788 | : "%"REG_c, "%ebx", "%edx", "%esi", "%"REG_D, "memory"\ |
e08f5806 MN |
789 | ); |
790 | return coeff_count; | |
791 | } | |
419b8784 | 792 | #endif /* defined(ARCH_X86_32) && !(defined(PIC) && defined(__GNUC__)) */ |
d592f67f | 793 | |
5e20f836 MN |
794 | /** |
795 | * | |
796 | * @return the number of bytes read or 0 if no end | |
797 | */ | |
938dd846 | 798 | static int get_cabac_terminate(CABACContext *c){ |
f24a5159 MN |
799 | c->range -= 2; |
800 | if(c->low < c->range<<17){ | |
ec7eb896 | 801 | renorm_cabac_decoder_once(c); |
d592f67f MN |
802 | return 0; |
803 | }else{ | |
5e20f836 | 804 | return c->bytestream - c->bytestream_start; |
115329f1 | 805 | } |
d592f67f MN |
806 | } |
807 | ||
61ccfcc0 MN |
808 | /** |
809 | * get (truncated) unnary binarization. | |
810 | */ | |
938dd846 | 811 | static int get_cabac_u(CABACContext *c, uint8_t * state, int max, int max_index, int truncated){ |
61ccfcc0 | 812 | int i; |
115329f1 DB |
813 | |
814 | for(i=0; i<max; i++){ | |
61ccfcc0 MN |
815 | if(get_cabac(c, state)==0) |
816 | return i; | |
115329f1 | 817 | |
61ccfcc0 MN |
818 | if(i< max_index) state++; |
819 | } | |
820 | ||
821 | return truncated ? max : -1; | |
822 | } | |
823 | ||
824 | /** | |
825 | * get unary exp golomb k-th order binarization. | |
826 | */ | |
938dd846 | 827 | static int get_cabac_ueg(CABACContext *c, uint8_t * state, int max, int is_signed, int k, int max_index){ |
61ccfcc0 MN |
828 | int i, v; |
829 | int m= 1<<k; | |
115329f1 DB |
830 | |
831 | if(get_cabac(c, state)==0) | |
61ccfcc0 | 832 | return 0; |
115329f1 | 833 | |
61ccfcc0 | 834 | if(0 < max_index) state++; |
115329f1 DB |
835 | |
836 | for(i=1; i<max; i++){ | |
61ccfcc0 MN |
837 | if(get_cabac(c, state)==0){ |
838 | if(is_signed && get_cabac_bypass(c)){ | |
839 | return -i; | |
840 | }else | |
841 | return i; | |
842 | } | |
843 | ||
844 | if(i < max_index) state++; | |
845 | } | |
115329f1 | 846 | |
61ccfcc0 MN |
847 | while(get_cabac_bypass(c)){ |
848 | i+= m; | |
849 | m+= m; | |
850 | } | |
115329f1 | 851 | |
61ccfcc0 MN |
852 | v=0; |
853 | while(m>>=1){ | |
854 | v+= v + get_cabac_bypass(c); | |
855 | } | |
856 | i += v; | |
857 | ||
858 | if(is_signed && get_cabac_bypass(c)){ | |
859 | return -i; | |
860 | }else | |
861 | return i; | |
862 | } |