mjpeg: Detect overreads in mjpeg_decode_scan() and error out.
[libav.git] / libavcodec / x86 / x86inc.asm
CommitLineData
bafad220
LM
1;*****************************************************************************
2;* x86inc.asm
3;*****************************************************************************
2966cc18 4;* Copyright (C) 2005-2008 x264 project
bafad220 5;*
2966cc18
JGG
6;* Authors: Loren Merritt <lorenm@u.washington.edu>
7;* Anton Mitrofanov <BugMaster@narod.ru>
bafad220 8;*
2966cc18
JGG
9;* Permission to use, copy, modify, and/or distribute this software for any
10;* purpose with or without fee is hereby granted, provided that the above
11;* copyright notice and this permission notice appear in all copies.
bafad220 12;*
2966cc18
JGG
13;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
14;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
15;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
16;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
17;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
18;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
19;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
bafad220
LM
20;*****************************************************************************
21
2966cc18
JGG
22; This is a header file for the x264ASM assembly language, which uses
23; NASM/YASM syntax combined with a large number of macros to provide easy
24; abstraction between different calling conventions (x86_32, win64, linux64).
25; It also has various other useful features to simplify writing the kind of
26; DSP functions that are most often used in x264.
27
28; Unlike the rest of x264, this file is available under an ISC license, as it
29; has significant usefulness outside of x264 and we want it to be available
30; to the largest audience possible. Of course, if you modify it for your own
31; purposes to add a new feature, we strongly encourage contributing a patch
32; as this feature might be useful for others as well. Send patches or ideas
33; to x264-devel@videolan.org .
34
35%define program_name ff
36
3f87f39c
JA
; Pick the 64-bit calling convention: WIN64 when the output format is "win32",
; UNIX64 for every other 64-bit target. Neither symbol is defined on x86_32.
; NOTE(review): presumably 64-bit Windows objects are assembled with the
; "win32" format name plus a 64-bit machine switch -- confirm with the build.
%ifdef ARCH_X86_64
    %ifnidn __OUTPUT_FORMAT__,win32
        %define UNIX64
    %else
        %define WIN64
    %endif
%endif
44
2966cc18
JGG
; mangle(x): symbol name as seen by the linker. When PREFIX is defined the
; name gets a leading underscore, otherwise it is used unchanged.
%ifndef PREFIX
    %define mangle(x) x
%else
    %define mangle(x) _ %+ x
%endif
50
bafad220
LM
51; FIXME: All of the 64bit asm functions that take a stride as an argument
52; via register, assume that the high dword of that register is filled with 0.
53; This is true in practice (since we never do any 64bit arithmetic on strides,
54; and x264's strides are all positive), but is not guaranteed by the ABI.
55
56; Name of the .rodata section.
57; Kludge: Something on OS X fails to align .rodata even given an align attribute,
58; so use a different read-only section.
; SECTION_RODATA [alignment]
; Switches to the section used for read-only data; the optional argument is
; the section alignment (default 16). Mach-O targets use .text instead of
; .rodata, and 32-bit Mach-O additionally gets a "fakegot" label.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
69
3f87f39c
JA
; PIC policy:
;   WIN64   -> PIC is always defined.
;   x86_32  -> PIC is always undefined. x86_32 does not require PIC; some
;              distros prefer shared objects to be PIC, but nothing breaks if
;              the code contains a few textrels, so that complexity is skipped.
;   UNIX64  -> whatever the build passed in is kept.
%ifdef WIN64
    %define PIC
%else
    %ifndef ARCH_X86_64
        %undef PIC
    %endif
%endif

; With PIC, plain memory references default to RIP-relative addressing.
%ifdef PIC
    default rel
%endif
81
82; Macros to eliminate most code duplication between x86_32 and x86_64:
83; Currently this works only for leaf functions which load all their arguments
84; into registers at the start, and make no other use of the stack. Luckily that
85; covers most of x264's asm.
86
87; PROLOGUE:
88; %1 = number of arguments. loads them from stack if needed.
3f87f39c
JA
89; %2 = number of registers used. pushes callee-saved regs if needed.
90; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
bafad220
LM
91; %4 = list of names to define to registers
92; PROLOGUE can also be invoked by adding the same options to cglobal
93
94; e.g.
29e4edbb 95; cglobal foo, 2,3,0, dst, src, tmp
3f87f39c 96; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
bafad220
LM
97
; TODO Some functions can use some args directly from the stack. If they are
; the last args then you can simply not declare them, but if they are in the
; middle we need a more flexible macro.
101
102; RET:
103; Pops anything that was pushed by PROLOGUE
104
105; REP_RET:
106; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
107; which are slow when a normal ret follows a branch.
108
3f87f39c
JA
109; registers:
110; rN and rNq are the native-size register holding function argument N
111; rNd, rNw, rNb are dword, word, and byte size
112; rNm is the original location of arg N (a register or on the stack), dword
113; rNmp is native size
114
bafad220
LM
; DECLARE_REG n, qreg, dreg, wreg, breg, home
; Defines the register aliases for function argument number n:
;   rNq / rN   native-size register     rNd  dword view
;   rNw        word view                rNb  byte view
;   rNm        original location of the argument (register or stack slot)
;   rNmp       same location at native size: the native register when "home"
;              is an identifier, otherwise "home" with a qword (x86_64) or
;              dword (x86_32) size override
%macro DECLARE_REG 6
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %define r%1m %6
    %ifid %6                    ; home is a plain identifier, i.e. a register
        %define r%1mp %2
    %elifdef ARCH_X86_64        ; home is a memory operand, 64-bit target
        %define r%1mp qword %6
    %else                       ; home is a memory operand, 32-bit target
        %define r%1mp dword %6
    %endif
    %define r%1 %2
%endmacro
130
; DECLARE_REG_SIZE name16, name8
; Lets a size suffix (q/d/w/b) be appended to a physical register name, in both
; its r-prefixed and e-prefixed spelling. With name16 = ax and name8 = al:
;   raxq, eaxq -> rax      raxd, eaxd -> eax
;   raxw, eaxw -> ax       raxb, eaxb -> al
; On x86_32 the r-prefixed name itself is additionally mapped to the e-prefixed
; register (rax -> eax).
%macro DECLARE_REG_SIZE 2
    ; native size
    %define r%1q r%1
    %define e%1q r%1
    ; dword
    %define r%1d e%1
    %define e%1d e%1
    ; word
    %define r%1w %1
    %define e%1w %1
    ; byte
    %define r%1b %2
    %define e%1b %2
%ifndef ARCH_X86_64
    %define r%1 e%1
%endif
%endmacro
144
; Size-suffixed aliases for the seven legacy general-purpose registers.
DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl
152
3f87f39c
JA
153; t# defines for when per-arch register allocation is more complex than just function arguments
154
; DECLARE_REG_TMP a, b, c, ...
; Binds the temporaries t0, t1, t2, ... to the argument registers r<a>, r<b>,
; r<c>, ... in the order given (e.g. "DECLARE_REG_TMP 3,1" sets t0 to r3 and
; t1 to r1). The binding is made with xdefine, so r<N> is expanded at the time
; of the call.
%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %rotate 1
        %assign %%i %%i+1
    %endrep
%endmacro
163
; DECLARE_REG_TMP_SIZE n, ...
; For every listed temporary index n, defines tNq, tNd, tNw and tNb as tN with
; the matching size suffix pasted on. These are plain defines, so tN is looked
; up each time the sized name is used.
%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro
173
; Provide sized names for temporaries t0 through t9.
DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
3f87f39c 175
bafad220
LM
; gprsize: size in bytes of a general-purpose register / stack slot.
%ifndef ARCH_X86_64
    %define gprsize 4
%else
    %define gprsize 8
%endif
181
; PUSH op
; Emits a push and records it in stack_offset, so that the stack-relative
; argument locations (rNm) stay valid afterwards.
%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro
186
; POP op
; Emits a pop and removes one register slot from stack_offset (the counterpart
; of PUSH).
%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro
191
; SUB dst, src
; Emits a sub. When dst is the stack pointer, src is also added to
; stack_offset, so src has to be an assemble-time constant in that case.
%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro
198
; ADD dst, src
; Emits an add. When dst is the stack pointer, src is also subtracted from
; stack_offset, so src has to be an assemble-time constant in that case.
%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro
205
; movifnidn dst, src
; Emits "mov dst, src" unless both operands expand to the same text, in which
; case no instruction is generated.
%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro
211
; movsxdifnidn dst, src
; Emits "movsxd dst, src" unless both operands expand to the same text, in
; which case no instruction is generated.
%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro
217
; ASSERT expr
; Assemble-time check: raises the preprocessor error "assert failed" when the
; expression evaluates to zero.
%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro
223
; DEFINE_ARGS name0, name1, ...
; Gives the argument registers symbolic names. For the i-th name this defines
; <name>q, <name>d, <name>w, <name>b and <name>m as the current expansion of
; r<i>q, r<i>d, r<i>w, r<i>b and r<i>m. The names are remembered in
; arg_name<i> / n_arg_names so that the next DEFINE_ARGS call can undefine them
; before installing the new set.
%macro DEFINE_ARGS 0-*
    ; Step 1: forget the names installed by the previous call, if any.
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    ; Step 2: install the new names, one argument register per name.
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep

    ; Step 3: remember how many names must be undone next time.
    %assign n_arg_names %%i
%endmacro
251
3f87f39c 252%ifdef WIN64 ; Windows x64 ;=================================================
bafad220
LM
253
; Win64 argument registers. Arguments 0-3 arrive in rcx, rdx, r8, r9; from
; argument 4 on they are read from the stack, starting at
; [rsp + stack_offset + 40] in 8-byte steps. r4-r6 are the registers those
; stack arguments get loaded into.
DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 1, rdx, edx, dx,  dl,  edx
DECLARE_REG 2, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 3, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di,  dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si,  sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
; Arguments 7 and 8 have a stack location only, no register.
%define r7m [rsp + stack_offset + 64]
%define r8m [rsp + stack_offset + 72]
263
; LOAD_IF_USED reg_id, number_of_args   (Win64)
; Loads stack argument reg_id into r<reg_id> when the function takes more than
; reg_id arguments; otherwise emits nothing.
%macro LOAD_IF_USED 2
    %if %2 > %1
        mov r%1, [rsp + stack_offset + 8 + %1*8]
    %endif
%endmacro
269
; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]   (Win64)
; #xmm_regs defaults to 0. At most 7 general-purpose registers are supported,
; and #regs must be at least #args.
%macro PROLOGUE 2-4+ 0
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7

    ; r4 (rdi) and r5 (rsi) are callee-saved on Win64: save both as soon as
    ; more than four registers are used.
    %if regs_used > 4
        push r4
        push r5
        %assign stack_offset stack_offset+16
    %endif

    ; Save the callee-saved xmm registers that the function will use.
    WIN64_SPILL_XMM %3

    ; Fetch the arguments that were passed on the stack.
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1

    DEFINE_ARGS %4
%endmacro
285
; WIN64_SPILL_XMM #xmm_regs
; Records the number of xmm registers in use. When it exceeds 6, reserves
; (n-6)*16+16 bytes of stack (tracked in stack_offset) and stores xmm6 up to
; xmm(n-1) there, highest register first.
; NOTE(review): the +8 in the store address presumably compensates for rsp
; being 8 mod 16 at this point, so the movdqa slots are 16-byte aligned --
; confirm against the callers.
%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        sub rsp, (xmm_regs_used-6)*16+16
        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
        %endrep
    %endif
%endmacro
299
; WIN64_RESTORE_XMM_INTERNAL base
; Reloads the xmm registers stored by WIN64_SPILL_XMM, addressing the save area
; through "base", then adds the size of the save area to "base". Emits nothing
; when at most 6 xmm registers are in use. The assemble-time bookkeeping
; (stack_offset, xmm_regs_used) is left untouched.
%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro
310
532e7697
LM
; WIN64_RESTORE_XMM base
; Reloads the callee-saved xmm registers stored by WIN64_SPILL_XMM (addressed
; through "base"), releases their stack area and resets the bookkeeping, so
; that xmm_regs_used is 0 afterwards.
;
; stack_offset is reduced by exactly the amount WIN64_SPILL_XMM added, i.e.
; (xmm_regs_used-6)*16+16, and only when something was actually spilled
; (xmm_regs_used > 6). The whole amount is parenthesised: without the
; parentheses the trailing +16 would be added back instead of subtracted.
%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %if xmm_regs_used > 6
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro
316
; RET   (Win64)
; Undoes what PROLOGUE did and returns: reloads the spilled xmm registers,
; pops r5 and r4 when more than four registers are in use, then emits ret.
; The assemble-time stack_offset is not modified here.
%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    %if regs_used > 4
        pop r5
        pop r4
    %endif
    ret
%endmacro
325
3f87f39c
JA
; REP_RET   (Win64)
; Same as RET when there is something to restore; otherwise emits the 2-byte
; "rep ret" form.
%macro REP_RET 0
    %if regs_used > 4 || xmm_regs_used > 6
        RET
    %else
        rep ret
    %endif
%endmacro
333
334%elifdef ARCH_X86_64 ; *nix x64 ;=============================================
bafad220
LM
335
; Unix x86_64 argument registers. Arguments 0-5 arrive in rdi, rsi, rdx, rcx,
; r8, r9; argument 6 is read from the stack at [rsp + stack_offset + 8] and is
; loaded into rax.
DECLARE_REG 0, rdi, edi, di,  dil, edi
DECLARE_REG 1, rsi, esi, si,  sil, esi
DECLARE_REG 2, rdx, edx, dx,  dl,  edx
DECLARE_REG 3, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 4, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 5, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 8]
; Arguments 7 and 8 have a stack location only, no register.
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]
345
; LOAD_IF_USED reg_id, number_of_args   (Unix x86_64)
; Loads stack argument reg_id into r<reg_id> when the function takes more than
; reg_id arguments. The address does not include stack_offset; for reg_id 6 it
; is [rsp + 8].
%macro LOAD_IF_USED 2
    %if %2 > %1
        mov r%1, [rsp - 40 + %1*8]
    %endif
%endmacro
351
; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]   (Unix x86_64)
; Nothing is pushed here. Only argument 6 has to be fetched from the stack;
; the xmm register count is accepted but not used.
%macro PROLOGUE 2-4+
    ASSERT %2 >= %1
    ASSERT %2 <= 7
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro
358
; RET   (Unix x86_64): PROLOGUE pushes nothing, so this is a plain ret.
%macro RET 0
    ret
%endmacro
362
; REP_RET   (Unix x86_64): always the 2-byte "rep ret" form, since there is
; never anything to pop.
%macro REP_RET 0
    rep ret
%endmacro
366
367%else ; X86_32 ;==============================================================
368
; x86_32 argument registers. Every argument is read from the stack, starting at
; [esp + stack_offset + 4] in 4-byte steps. esi, edi and ebp get "null" as
; their byte-sized name.
DECLARE_REG 0, eax, eax, ax, al,   [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl,   [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl,   [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl,   [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
; Arguments 7 and 8 have a stack location only, no register.
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]
; Lets arch-independent code refer to the stack pointer as rsp.
%define rsp esp
379
; PUSH_IF_USED reg_id   (x86_32)
; Pushes r<reg_id> and adds 4 to stack_offset when the function uses more than
; reg_id registers; otherwise emits nothing.
%macro PUSH_IF_USED 1
    %if regs_used > %1
        push r%1
        %assign stack_offset stack_offset+4
    %endif
%endmacro
386
; POP_IF_USED reg_id   (x86_32)
; Pops r<reg_id> when the function uses more than reg_id registers.
; stack_offset is deliberately left unchanged here.
%macro POP_IF_USED 1
    %if regs_used > %1
        pop r%1
    %endif
%endmacro
392
; LOAD_IF_USED reg_id, number_of_args   (x86_32)
; Loads stack argument reg_id into r<reg_id> when the function takes more than
; reg_id arguments; otherwise emits nothing.
%macro LOAD_IF_USED 2
    %if %2 > %1
        mov r%1, [esp + stack_offset + 4 + %1*4]
    %endif
%endmacro
398
; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]   (x86_32)
; Saves r3-r6 (ebx, esi, edi, ebp) as far as they are used, then loads every
; declared argument from the stack into its register. The xmm register count is
; accepted but not used.
%macro PROLOGUE 2-4+
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7

    ; Save the registers that RET pops again.
    PUSH_IF_USED 3
    PUSH_IF_USED 4
    PUSH_IF_USED 5
    PUSH_IF_USED 6

    ; Fetch the arguments; stack_offset already accounts for the pushes above.
    LOAD_IF_USED 0, %1
    LOAD_IF_USED 1, %1
    LOAD_IF_USED 2, %1
    LOAD_IF_USED 3, %1
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1

    DEFINE_ARGS %4
%endmacro
416
; RET   (x86_32)
; Pops the registers saved by PROLOGUE, in reverse push order, and returns.
%macro RET 0
    POP_IF_USED 6
    POP_IF_USED 5
    POP_IF_USED 4
    POP_IF_USED 3
    ret
%endmacro
424
; REP_RET   (x86_32)
; With at most three registers in use nothing was pushed, so the 2-byte
; "rep ret" form is emitted; otherwise this is the same as RET.
%macro REP_RET 0
    %if regs_used <= 3
        rep ret
    %else
        RET
    %endif
%endmacro
432
433%endif ;======================================================================
434
532e7697
LM
; Outside Win64 the xmm spill/restore macros expand to nothing, so they can be
; used unconditionally in shared code.
%ifndef WIN64
    %macro WIN64_SPILL_XMM 1
    %endmacro

    %macro WIN64_RESTORE_XMM 1
    %endmacro
%endif
441
bafad220
LM
442
443
444;=============================================================================
445; arch-independent part
446;=============================================================================
447
; Alignment that cglobal applies to every function entry point.
%assign function_align 16
449
450; Symbol prefix for C linkage
; cglobal name [, PROLOGUE arguments...]
; Starts an exported function:
;   - "name" is redefined to its mangled, program_name-prefixed symbol, and
;     name.skip_prologue to the matching local label;
;   - the symbol is made global (as a hidden function when the output format
;     is elf);
;   - the entry point is aligned to function_align and the label is emitted;
;   - the register permutation and stack_offset are reset;
;   - PROLOGUE is invoked with the remaining arguments, if there are any.
%macro cglobal 1-2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    %xdefine %1.skip_prologue %1 %+ .skip_prologue
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro
467
; cextern name
; Declares an external symbol carrying the program_name prefix (mangled for the
; target) and redefines "name" to that symbol.
%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    extern %1
%endmacro
472
473;like cextern, but without the prefix
; cextern_naked name
; Declares an external symbol that is mangled for the target but does not get
; the program_name prefix.
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    extern %1
%endmacro
478
2966cc18
JGG
; const name, data...
; Emits a global, program_name-prefixed (and mangled) label followed by the
; given data definition, and redefines "name" to that symbol.
%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
    %1: %2
%endmacro
484
bafad220
LM
485; This is needed for ELF, otherwise the GNU linker assumes the stack is
486; executable by default.
; ELF only: an empty .note.GNU-stack section marks the object as not needing an
; executable stack.
%ifidn __OUTPUT_FORMAT__,elf
    SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif
490
bafad220
LM
491; merge mmx and sse*
492
; CAT_XDEFINE prefix, suffix, value
; xdefines the identifier formed by pasting prefix and suffix together, after
; both have been expanded as macro arguments.
%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro
496
; CAT_UNDEF prefix, suffix
; Undefines the identifier formed by pasting prefix and suffix together.
%macro CAT_UNDEF 2
    %undef %1%2
%endmacro
500
; INIT_MMX
; Selects the 64-bit MMX register file for the generic names: m0-m7 map to
; mm0-mm7, mmsize is 8, and mova/movu/movh/movnta map to their MMX forms.
; nmm<i> maps each physical register name back to its index. A second pass
; undefines m8-m15 and nmm8-nmm15.
%macro INIT_MMX 0
    %define RESET_MM_PERMUTATION INIT_MMX
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq

    ; m0..m7 -> mm0..mm7, and the reverse lookup nmm0..nmm7 -> 0..7
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nmm, %%i, %%i
        %assign %%i %%i+1
    %endrep

    ; the counter continues at 8 here: drop indices 8..15
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nmm, %%i
        %assign %%i %%i+1
    %endrep
%endmacro
521
; INIT_XMM
; Selects the 128-bit SSE register file for the generic names: m<i> maps to
; xmm<i> for 8 registers (16 on x86_64), mmsize is 16, and
; mova/movu/movh/movnta map to their SSE2 forms. nxmm<i> maps each physical
; register name back to its index.
%macro INIT_XMM 0
    %define RESET_MM_PERMUTATION INIT_XMM
    %define mmsize 16
    %ifdef ARCH_X86_64
        %define num_mmregs 16
    %else
        %define num_mmregs 8
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq

    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
%endmacro
540
; Start out with the MMX register set selected.
INIT_MMX
542
543; I often want to use macros that permute their arguments. e.g. there's no
544; efficient way to implement butterfly or transpose or dct without swapping some
545; arguments.
546;
547; I would like to not have to manually keep track of the permutations:
548; If I insert a permutation in the middle of a function, it should automatically
549; change everything that follows. For more complex macros I may also have multiple
550; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
551;
552; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
553; permutes its arguments. It's equivalent to exchanging the contents of the
554; registers, except that this way you exchange the register names instead, so it
555; doesn't cost any cycles.
556
; PERMUTE dst0, src0, dst1, src1, ...
; Takes a list of pairs and renames all of them at once: the first pass
; snapshots every source mapping into temporaries, the second pass assigns the
; snapshots to the destinations and removes the temporaries again. Because of
; the snapshot, the pairs may overlap.
%macro PERMUTE 2-*
; pass 1: snapshot the source of every pair
%rep %0/2
    %xdefine tmp%2 m%2
    %xdefine ntmp%2 nm%2
    %rotate 2
%endrep
; pass 2: install the snapshots under the destination names
%rep %0/2
    %xdefine m%1 tmp%2
    %xdefine nm%1 ntmp%2
    %undef tmp%2
    %undef ntmp%2
    %rotate 2
%endrep
%endmacro
571
; SWAP a, b [, c, ...]
; Swaps the register names along a single chain: (a,b), then (b,c), and so on.
; This is sometimes more concise than listing pairs. Only names are exchanged,
; no instruction is emitted.
%macro SWAP 2-*
%rep %0-1
%ifdef m%1
    ; Numeric form, "SWAP 0,1": exchange m<a> and m<b>, then refresh the
    ; reverse (register name -> index) lookups.
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; Name form, "SWAP m0,m1": the arguments have already been expanded to
    ; physical register names, so the original numbers are inferred through the
    ; reverse lookup first.
    ; Be careful using this mode in nested macros: in some cases other copies
    ; of m# have already been dereferenced and do not get updated correctly.
    %xdefine %%n1 n %+ %1
    %xdefine %%n2 n %+ %2
    %xdefine tmp m %+ %%n1
    CAT_XDEFINE m, %%n1, m %+ %%n2
    CAT_XDEFINE m, %%n2, tmp
    CAT_XDEFINE n, m %+ %%n1, %%n1
    CAT_XDEFINE n, m %+ %%n2, %%n2
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro
596
2966cc18
JGG
597; If SAVE_MM_PERMUTATION is placed at the end of a function and given the
598; function name, then any later calls to that function will automatically
599; load the permutation, so values can be returned in mmregs.
; SAVE_MM_PERMUTATION name
; Stores the current register permutation as <name>_m0, <name>_m1, ... (one
; entry per register, num_mmregs in total) so it can be reloaded later.
%macro SAVE_MM_PERMUTATION 1
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %1_m, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro
607
; LOAD_MM_PERMUTATION name
; Reinstalls the permutation stored by SAVE_MM_PERMUTATION under "name": every
; m<i> is set to <name>_m<i>, and the reverse (register name -> index) lookup
; is rebuilt to match.
%macro LOAD_MM_PERMUTATION 1
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, %1_m %+ %%i
        CAT_XDEFINE n, m %+ %%i, %%i
        %assign %%i %%i+1
    %endrep
%endmacro
616
; call target
; Wraps the call instruction: when a permutation has been saved for the target
; (<target>_m0 is defined), it is loaded right after the call is emitted, so
; the code that follows uses the callee's register names.
%macro call 1
    call %1
    %ifdef %1_m0
        LOAD_MM_PERMUTATION %1
    %endif
%endmacro
623
2966cc18 624; Substitutions that reduce instruction size but are functionally equivalent
3f87f39c
JA
; add dst, src
; Size optimisation: adding the literal 128 is emitted as "sub dst, -128",
; which is functionally equivalent. Every other operand is assembled as a
; normal add.
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        ; not a numeric literal: nothing to optimise
        add %1, %2
    %endif
%endmacro
636
; sub dst, src
; Size optimisation: subtracting the literal 128 is emitted as "add dst, -128",
; which is functionally equivalent. Every other operand is assembled as a
; normal sub.
%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        ; not a numeric literal: nothing to optimise
        sub %1, %2
    %endif
%endmacro
647%endmacro