sync yasm macros to x264
[libav.git] / libavcodec / x86 / x86inc.asm
CommitLineData
bafad220
LM
1;*****************************************************************************
2;* x86inc.asm
3;*****************************************************************************
4;* Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu>
5;*
6;* This file is part of FFmpeg.
7;*
8;* FFmpeg is free software; you can redistribute it and/or
9;* modify it under the terms of the GNU Lesser General Public
10;* License as published by the Free Software Foundation; either
11;* version 2.1 of the License, or (at your option) any later version.
12;*
13;* FFmpeg is distributed in the hope that it will be useful,
14;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16;* Lesser General Public License for more details.
17;*
18;* You should have received a copy of the GNU Lesser General Public
19;* License along with FFmpeg; if not, write to the Free Software
20;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21;*****************************************************************************
22
3f87f39c
JA
; Pick the 64-bit ABI flavour. On 64-bit builds exactly one of WIN64 / UNIX64
; is defined: WIN64 when the assembler reports the win32 output format,
; UNIX64 for every other format. Neither is defined on 32-bit builds.
%ifdef ARCH_X86_64
    %ifnidn __OUTPUT_FORMAT__,win32
        %define UNIX64
    %else
        %define WIN64
    %endif
%endif
30
bafad220
LM
31; FIXME: All of the 64bit asm functions that take a stride as an argument
32; via register, assume that the high dword of that register is filled with 0.
33; This is true in practice (since we never do any 64bit arithmetic on strides,
34; and x264's strides are all positive), but is not guaranteed by the ABI.
35
36; Name of the .rodata section.
37; Kludge: Something on OS X fails to align .rodata even given an align attribute,
38; so use a different read-only section.
; SECTION_RODATA [align]
; Switch to the section that holds read-only data.
;   %1 = section alignment (optional, defaults to 16)
; Both Mach-O formats place the data in .text instead of .rodata; the 32-bit
; Mach-O branch additionally emits the label fakegot.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
49
3f87f39c
JA
50; PIC support macros.
51; x86_64 can't fit 64bit address literals in most instruction types,
52; so shared objects (under the assumption that they might be anywhere
53; in memory) must use an address mode that does fit.
54; So all accesses to global variables must use this macro, e.g.
bafad220 55; mov eax, [foo GLOBAL]
29e4edbb 56; instead of
bafad220
LM
57; mov eax, [foo]
58;
3f87f39c
JA
59; x86_32 doesn't require PIC.
60; Some distros prefer shared objects to be PIC, but nothing breaks if
61; the code contains a few textrels, so we'll skip that complexity.
62
; PIC is forced on for WIN64 and forced off for any non-x86_64 build; on other
; 64-bit targets it keeps whatever the build system passed in.
%ifndef WIN64
    %ifndef ARCH_X86_64
        %undef PIC
    %endif
%else
    %define PIC
%endif

; GLOBAL is appended inside a memory operand, e.g. [foo GLOBAL]. It expands to
; a rip-relative qualifier under PIC and to nothing otherwise.
%ifndef PIC
    %define GLOBAL
%else
    %define GLOBAL wrt rip
%endif
73
74; Macros to eliminate most code duplication between x86_32 and x86_64:
75; Currently this works only for leaf functions which load all their arguments
76; into registers at the start, and make no other use of the stack. Luckily that
77; covers most of x264's asm.
78
79; PROLOGUE:
80; %1 = number of arguments. loads them from stack if needed.
3f87f39c
JA
81; %2 = number of registers used. pushes callee-saved regs if needed.
82; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
bafad220
LM
83; %4 = list of names to define to registers
84; PROLOGUE can also be invoked by adding the same options to cglobal
85
86; e.g.
29e4edbb 87; cglobal foo, 2,3,0, dst, src, tmp
3f87f39c 88; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
bafad220
LM
89
90; TODO Some functions can use some args directly from the stack. If they're the
91; last args then you can just not declare them, but if they're in the middle
92; we need a more flexible macro.
93
94; RET:
95; Pops anything that was pushed by PROLOGUE
96
97; REP_RET:
98; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
99; which are slow when a normal ret follows a branch.
100
3f87f39c
JA
101; registers:
102; rN and rNq are the native-size register holding function argument N
103; rNd, rNw, rNb are dword, word, and byte size
104; rNm is the original location of arg N (a register or on the stack), dword
105; rNmp is native size
106
bafad220
LM
; DECLARE_REG n, qreg, dreg, wreg, breg, home
; Defines the rN family of names for argument number n:
;   %1 = argument index
;   %2 = native-size register  -> rN and rNq
;   %3 = dword register        -> rNd
;   %4 = word register         -> rNw
;   %5 = byte register         -> rNb
;   %6 = original location of the argument (register or memory operand) -> rNm
; rNmp is the native-size view of that original location: the full register
; when %6 is an identifier, otherwise %6 with an explicit qword/dword size.
%macro DECLARE_REG 6
    %define r%1   %2
    %define r%1q  %2
    %define r%1d  %3
    %define r%1w  %4
    %define r%1b  %5
    %define r%1m  %6
    %ifid %6 ; the argument lives in a register
        %define r%1mp %2
    %else    ; the argument lives in memory
        %ifdef ARCH_X86_64
            %define r%1mp qword %6
        %else
            %define r%1mp dword %6
        %endif
    %endif
%endmacro
122
; DECLARE_REG_SIZE name16, name8
; Gives a named legacy register the same q/d/w/b size suffixes as the rN
; registers, reachable through both its r- and e-prefixed spellings.
;   %1 = 16-bit register name (ax, bx, ...)
;   %2 = 8-bit register name  (al, bl, ...)
; On 32-bit builds the r-prefixed name itself is aliased to the e-register.
%macro DECLARE_REG_SIZE 2
    ; r-prefixed spellings
    %define r%1q r%1
    %define r%1d e%1
    %define r%1w %1
    %define r%1b %2
    ; e-prefixed spellings
    %define e%1q r%1
    %define e%1d e%1
    %define e%1w %1
    %define e%1b %2
%ifndef ARCH_X86_64
    %define r%1 e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl
144
3f87f39c
JA
145; t# defines for when per-arch register allocation is more complex than just function arguments
146
; DECLARE_REG_TMP a, b, c, ...
; Maps the temporaries t0, t1, t2, ... onto r<a>, r<b>, r<c>, ... in the order
; the arguments are given.
%macro DECLARE_REG_TMP 1-*
    %assign %%slot 0
    %rep %0
        CAT_XDEFINE t, %%slot, r%1
        %rotate 1
        %assign %%slot %%slot+1
    %endrep
%endmacro
155
; DECLARE_REG_TMP_SIZE n, ...
; For every listed index n, defines tNq/tNd/tNw/tNb as "tN" pasted with the
; matching size suffix, so they follow whatever tN is mapped to at use time.
%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1b t%1 %+ b
        %define t%1w t%1 %+ w
        %define t%1d t%1 %+ d
        %define t%1q t%1 %+ q
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
167
bafad220
LM
; gprsize: size in bytes of a general-purpose register (one push/pop slot).
%ifndef ARCH_X86_64
    %define gprsize 4
%else
    %define gprsize 8
%endif
173
; PUSH operand
; push that also records the extra gprsize bytes in stack_offset. The operand
; is expanded before the update, so it sees the old stack_offset.
%macro PUSH 1
    push %1
    %assign stack_offset gprsize+stack_offset
%endmacro
178
; POP operand
; pop that also removes gprsize bytes from stack_offset. The operand is
; expanded before the update, so it sees the old stack_offset.
%macro POP 1
    pop %1
    %assign stack_offset (stack_offset)-gprsize
%endmacro
183
; SUB dst, amount
; sub that grows stack_offset by the amount when the destination is rsp.
; Any other destination is a plain sub.
%macro SUB 2
    sub %1, %2
    %ifidn rsp, %1
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro
190
; ADD dst, amount
; add that shrinks stack_offset by the amount when the destination is rsp.
; Any other destination is a plain add.
%macro ADD 2
    add %1, %2
    %ifidn rsp, %1
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro
197
; movifnidn dst, src
; Emits "mov dst, src" unless both operands are textually identical after
; macro expansion, in which case nothing is emitted.
%macro movifnidn 2
    %ifidn %1, %2
        ; same operand: nothing to move
    %else
        mov %1, %2
    %endif
%endmacro
203
; movsxdifnidn dst, src
; Emits "movsxd dst, src" unless both operands are textually identical after
; macro expansion, in which case nothing is emitted.
%macro movsxdifnidn 2
    %ifidn %1, %2
        ; same operand: nothing to extend
    %else
        movsxd %1, %2
    %endif
%endmacro
209
; ASSERT expr
; Assembly-time check: raises a preprocessor error when expr evaluates to 0.
%macro ASSERT 1
    %if 0 == (%1)
        %error assert failed
    %endif
%endmacro
215
; DEFINE_ARGS name0, name1, ...
; Binds the given names to r0, r1, ... in order. For each name it defines
; <name>q/d/w/b/m as the matching rNq/rNd/rNw/rNb/rNm. The names are also
; remembered in arg_name<N>, and their count in n_arg_names, so that the next
; invocation can undefine the previous set first.
%macro DEFINE_ARGS 0-*
    ; Step 1: forget the names installed by the previous invocation, if any.
    %ifdef n_arg_names
        %assign %%n 0
        %rep n_arg_names
            ; the suffixed names must go while arg_name<N> still expands
            CAT_UNDEF arg_name %+ %%n, q
            CAT_UNDEF arg_name %+ %%n, d
            CAT_UNDEF arg_name %+ %%n, w
            CAT_UNDEF arg_name %+ %%n, b
            CAT_UNDEF arg_name %+ %%n, m
            CAT_UNDEF arg_name, %%n
            %assign %%n %%n+1
        %endrep
    %endif

    ; Step 2: install the new names.
    %assign %%n 0
    %rep %0
        %xdefine %1q r %+ %%n %+ q
        %xdefine %1d r %+ %%n %+ d
        %xdefine %1w r %+ %%n %+ w
        %xdefine %1b r %+ %%n %+ b
        %xdefine %1m r %+ %%n %+ m
        CAT_XDEFINE arg_name, %%n, %1
        %rotate 1
        %assign %%n %%n+1
    %endrep
    %assign n_arg_names %%n
%endmacro
243
3f87f39c 244%ifdef WIN64 ; Windows x64 ;=================================================
bafad220
LM
245
; Argument registers for WIN64. Arguments 0-3 arrive in rcx/rdx/r8/r9.
; Arguments 4 and up are found on the stack starting at rsp+40 on entry;
; stack_offset accounts for anything pushed since. r4/r5 use rdi/rsi, which
; PROLOGUE saves when more than four registers are requested.
DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 1, rdx, edx, dx,  dl,  edx
DECLARE_REG 2, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 3, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di,  dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si,  sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
; arguments 7 and 8 have no register of their own, only a stack home
%define r7m [rsp + stack_offset + 64]
%define r8m [rsp + stack_offset + 72]
255
; LOAD_IF_USED reg_id, number_of_args            (WIN64 variant)
; Loads stack argument reg_id into r<reg_id> when the function takes more
; than reg_id arguments; otherwise emits nothing.
%macro LOAD_IF_USED 2
    %if %2 > %1
        mov r%1, [rsp + stack_offset + (%1+1)*8]
    %endif
%endmacro
261
; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]      (WIN64 variant)
;   %1 = number of arguments; 4..6 are loaded from the stack when present
;   %2 = number of GPRs used (at most 7, and at least %1)
;   %3 = number of xmm registers used (at most 16, defaults to 0)
;   %4 = names handed to DEFINE_ARGS
; Saves r4/r5 (rdi/rsi) when more than four GPRs are used, and spills xmm6 and
; above to a freshly reserved stack area when more than six xmm registers are
; used. regs_used / xmm_regs_used are recorded for RET and RESTORE_XMM.
%macro PROLOGUE 2-4+
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    %if %0 <= 2
        %assign xmm_regs_used 0
    %else
        %assign xmm_regs_used %3
    %endif
    ASSERT xmm_regs_used <= 16

    %if regs_used > 4
        ; PUSH keeps stack_offset in step (gprsize bytes per push)
        PUSH r4
        PUSH r5
    %endif

    %if xmm_regs_used > 6
        ; 16 bytes per saved register plus 16 spare; SUB updates stack_offset
        SUB rsp, (xmm_regs_used-6)*16+16
        ; store from the highest register downwards; the +8 is kept exactly as
        ; in the matching loads of RESTORE_XMM_INTERNAL
        %assign %%reg xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%reg %%reg-1
            movdqa [rsp + (%%reg-6)*16+8], xmm %+ %%reg
        %endrep
    %endif

    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro
291
; RESTORE_XMM_INTERNAL base
; Reloads the xmm registers that PROLOGUE spilled (xmm6 and above) from the
; save area addressed through %1, then adds the size of that area to %1.
; Emits nothing when at most six xmm registers are in use. It does not touch
; stack_offset or xmm_regs_used.
%macro RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%reg xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%reg %%reg-1
            movdqa xmm %+ %%reg, [%1 + (%%reg-6)*16+8]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro
302
; RESTORE_XMM base
; Restores the callee-saved xmm registers in the middle of a function and
; releases their save area, so that a later RET does not restore them again.
;   %1 = register addressing the save area (rsp in the usual case)
;
; PROLOGUE grows stack_offset by (xmm_regs_used-6)*16+16 only when more than
; six xmm registers are used, so exactly that amount is taken back here, under
; the same condition. The previous expression lacked parentheses: it
; subtracted only the per-register part and then added 16. It also ran when
; nothing had been allocated.
%macro RESTORE_XMM 1
    RESTORE_XMM_INTERNAL %1
    %if xmm_regs_used > 6
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro
308
; RET                                                        (WIN64 variant)
; Undoes PROLOGUE in reverse order and returns: reloads the spilled xmm
; registers and frees their area, then pops r5/r4 if they were pushed.
%macro RET 0
    RESTORE_XMM_INTERNAL rsp
    %if regs_used > 4
        pop r5
        pop r4
    %endif
    ret
%endmacro
317
3f87f39c
JA
; REP_RET                                                    (WIN64 variant)
; Emits the two-byte "rep ret" when PROLOGUE saved nothing; otherwise there is
; something to restore first, so it expands to a full RET.
%macro REP_RET 0
    %if regs_used <= 4 && xmm_regs_used <= 6
        rep ret
    %else
        RET
    %endif
%endmacro
325
326%elifdef ARCH_X86_64 ; *nix x64 ;=============================================
bafad220
LM
327
; Argument registers for UNIX64. Arguments 0-5 arrive in
; rdi/rsi/rdx/rcx/r8/r9. Argument 6 is read from the stack into rax, and
; arguments 7 and 8 only have a stack home.
DECLARE_REG 0, rdi, edi, di,  dil, edi
DECLARE_REG 1, rsi, esi, si,  sil, esi
DECLARE_REG 2, rdx, edx, dx,  dl,  edx
DECLARE_REG 3, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 4, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 5, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]
337
; LOAD_IF_USED reg_id, number_of_args            (UNIX64 variant)
; Loads stack argument reg_id into r<reg_id> when the function takes more
; than reg_id arguments. The -40 bias makes reg_id 6 resolve to [rsp + 8].
%macro LOAD_IF_USED 2
    %if %2 > %1
        mov r%1, [rsp - 40 + %1*8]
    %endif
%endmacro
343
; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]      (UNIX64 variant)
;   %1 = number of arguments; only argument 6 ever needs loading from the stack
;   %2 = number of GPRs used (at most 7, and at least %1)
;   %3 = number of xmm registers used (accepted but unused in this variant)
;   %4 = names handed to DEFINE_ARGS
; Nothing is pushed in this variant.
%macro PROLOGUE 2-4+
    ASSERT %2 >= %1
    ASSERT %2 <= 7
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro
350
; RET                                                       (UNIX64 variant)
; The UNIX64 PROLOGUE pushes nothing, so this is a bare ret.
%macro RET 0
    ret
%endmacro

; REP_RET                                                   (UNIX64 variant)
; Always the two-byte "rep ret", since there is never anything to pop.
%macro REP_RET 0
    rep ret
%endmacro
358
359%else ; X86_32 ;==============================================================
360
; Argument registers for x86_32. Every argument lives on the stack, 4 bytes
; apart starting at esp+4 on entry, and is loaded into the listed register on
; demand. r4-r6 (esi/edi/ebp) are declared with "null" as their byte name.
DECLARE_REG 0, eax, eax, ax, al,   [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl,   [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl,   [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl,   [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]
; lets arch-independent code keep writing rsp
%define rsp esp
371
; PUSH_IF_USED reg_id                                        (x86_32 only)
; Pushes r<reg_id> and tracks it in stack_offset when the function uses more
; than reg_id registers. PUSH adds gprsize, which is 4 on this target.
%macro PUSH_IF_USED 1
    %if regs_used > %1
        PUSH r%1
    %endif
%endmacro
378
; POP_IF_USED reg_id                                         (x86_32 only)
; Pops r<reg_id> when the function uses more than reg_id registers.
; stack_offset is deliberately left alone, so a function can contain several
; RETs that all start from the same offset.
%macro POP_IF_USED 1
    %if regs_used > %1
        pop r%1
    %endif
%endmacro
384
; LOAD_IF_USED reg_id, number_of_args             (x86_32 variant)
; Loads stack argument reg_id into r<reg_id> when the function takes more
; than reg_id arguments; otherwise emits nothing.
%macro LOAD_IF_USED 2
    %if %2 > %1
        mov r%1, [esp + stack_offset + (%1+1)*4]
    %endif
%endmacro
390
; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]      (x86_32 variant)
;   %1 = number of arguments; all of them are loaded from the stack
;   %2 = number of GPRs used (at most 7, and at least %1)
;   %3 = number of xmm registers used (accepted but unused in this variant)
;   %4 = names handed to DEFINE_ARGS
; Registers r3..r6 (ebx/esi/edi/ebp) are pushed first when in use, so the
; argument loads that follow already see the adjusted stack_offset.
%macro PROLOGUE 2-4+
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7

    ; save the registers that RET will pop
    PUSH_IF_USED 3
    PUSH_IF_USED 4
    PUSH_IF_USED 5
    PUSH_IF_USED 6

    ; fetch the arguments
    LOAD_IF_USED 0, %1
    LOAD_IF_USED 1, %1
    LOAD_IF_USED 2, %1
    LOAD_IF_USED 3, %1
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1

    DEFINE_ARGS %4
%endmacro
408
; RET                                                       (x86_32 variant)
; Pops whichever of r6..r3 PROLOGUE pushed, in reverse push order, and returns.
%macro RET 0
    POP_IF_USED 6
    POP_IF_USED 5
    POP_IF_USED 4
    POP_IF_USED 3
    ret
%endmacro
416
; REP_RET                                                   (x86_32 variant)
; Emits the two-byte "rep ret" when PROLOGUE pushed nothing (at most three
; registers in use); otherwise expands to a full RET.
%macro REP_RET 0
    %if regs_used <= 3
        rep ret
    %else
        RET
    %endif
%endmacro
424
425%endif ;======================================================================
426
427
428
429;=============================================================================
430; arch-independent part
431;=============================================================================
432
; Alignment applied in front of every function declared with cglobal.
%assign function_align 16

; cglobal name [, PROLOGUE arguments...]
; Declares an exported function:
;   - rebinds <name> to ff_<name>, with a leading underscore when PREFIX is
;     defined, and rebinds <name>.skip_prologue to match;
;   - exports the symbol, as a hidden function on ELF;
;   - aligns to function_align and emits the label;
;   - resets the mm register permutation and zeroes stack_offset;
;   - runs PROLOGUE with the remaining arguments when any are given.
%macro cglobal 1-2+
    ; symbol prefix for C linkage
    %xdefine %1 ff_%1
    %ifdef PREFIX
        %xdefine %1 _ %+ %1
    %endif
    %xdefine %1.skip_prologue %1 %+ .skip_prologue

    %ifnidn __OUTPUT_FORMAT__,elf
        global %1
    %else
        global %1:function hidden
    %endif

    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 >= 2
        PROLOGUE %2
    %endif
%endmacro
455
; cextern name
; Imports an external symbol. When PREFIX is defined, <name> is first rebound
; to _<name>, so that both the extern declaration and later uses carry the
; underscore.
%macro cextern 1
    %ifdef PREFIX
        %xdefine %1 _%1
    %endif
    extern %1
%endmacro
462
463; This is needed for ELF, otherwise the GNU linker assumes the stack is
464; executable by default.
; ELF only: emit the .note.GNU-stack section so the linker does not mark the
; stack executable.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; Fixed strides shared by the asm.
; NOTE(review): presumably the encode/decode frame-buffer strides, going by
; the names -- confirm.
%assign FDEC_STRIDE 32
%assign FENC_STRIDE 16
471
472; merge mmx and sse*
473
; CAT_XDEFINE prefix, suffix, value
; %xdefine of the name formed by pasting prefix and suffix. Going through a
; macro call expands both pieces (e.g. a counter) before they are pasted.
%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

; CAT_UNDEF prefix, suffix
; %undef of the name formed by pasting prefix and suffix.
%macro CAT_UNDEF 2
    %undef %1%2
%endmacro
481
; INIT_MMX
; Selects the 64-bit MMX register file for the generic m0..m7 names:
;   mmsize = 8, num_mmregs = 8, mova/movu -> movq, movh -> movd,
;   movnt -> movntq, mN -> mmN, and nmmN -> N (the reverse lookup).
; Any m8..m15 / nmm8..nmm15 left defined are removed. Also makes
; RESET_MM_PERMUTATION re-run this macro.
%macro INIT_MMX 0
    %define RESET_MM_PERMUTATION INIT_MMX
    %define mmsize     8
    %define num_mmregs 8
    %define mova  movq
    %define movu  movq
    %define movh  movd
    %define movnt movntq

    %assign %%reg 0
    %rep 8
        CAT_XDEFINE m, %%reg, mm %+ %%reg
        CAT_XDEFINE nmm, %%reg, %%reg
        %assign %%reg %%reg+1
    %endrep

    ; the counter is deliberately not reset: this loop covers 8..15
    %rep 8
        CAT_UNDEF m, %%reg
        CAT_UNDEF nmm, %%reg
        %assign %%reg %%reg+1
    %endrep
%endmacro
502
; INIT_XMM
; Selects the 128-bit XMM register file for the generic mN names:
;   mmsize = 16, num_mmregs = 16 on x86_64 and 8 otherwise,
;   mova -> movdqa, movu -> movdqu, movh -> movq, movnt -> movntdq,
;   mN -> xmmN, and nxmmN -> N (the reverse lookup).
; Also makes RESET_MM_PERMUTATION re-run this macro.
%macro INIT_XMM 0
    %define RESET_MM_PERMUTATION INIT_XMM
    %define mmsize 16
    %ifdef ARCH_X86_64
        %define num_mmregs 16
    %else
        %define num_mmregs 8
    %endif
    %define mova  movdqa
    %define movu  movdqu
    %define movh  movq
    %define movnt movntdq

    %assign %%reg 0
    %rep num_mmregs
        CAT_XDEFINE m, %%reg, xmm %+ %%reg
        CAT_XDEFINE nxmm, %%reg, %%reg
        %assign %%reg %%reg+1
    %endrep
%endmacro

; MMX is the default register set until a file asks for something else.
INIT_MMX
523
524; I often want to use macros that permute their arguments. e.g. there's no
525; efficient way to implement butterfly or transpose or dct without swapping some
526; arguments.
527;
528; I would like to not have to manually keep track of the permutations:
529; If I insert a permutation in the middle of a function, it should automatically
530; change everything that follows. For more complex macros I may also have multiple
531; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
532;
533; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
534; permutes its arguments. It's equivalent to exchanging the contents of the
535; registers, except that this way you exchange the register names instead, so it
536; doesn't cost any cycles.
537
; PERMUTE a0, b0, a1, b1, ...
; Takes a list of pairs and performs all the assignments m<a> = m<b> (and
; nm<a> = nm<b>) at once. Every source is snapshotted into tmp<b> / ntmp<b>
; before any destination is overwritten. The argument list is rotated by two
; per pair, so after both passes it is back in its original order.
%macro PERMUTE 2-*
; pass 1: snapshot the sources
%rep %0/2
    %xdefine ntmp%2 nm%2
    %xdefine tmp%2  m%2
    %rotate 2
%endrep
; pass 2: assign the destinations and drop the snapshots
%rep %0/2
    %xdefine m%1 tmp%2
    %undef tmp%2
    %xdefine nm%1 ntmp%2
    %undef ntmp%2
    %rotate 2
%endrep
%endmacro
552
; SWAP a, b [, c, ...]
; Swaps a single chain of register names: a<->b, then b<->c, and so on, which
; is sometimes more concise than listing pairs. Only the name bindings change;
; no instruction is emitted. Each step also refreshes the reverse lookup
; (n<register> -> index) for the two names involved.
%macro SWAP 2-*
%rep %0-1
%ifdef m%1
    ; numeric form, "SWAP 0,1"
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%first  n %+ %1
    %xdefine %%second n %+ %2
    %xdefine tmp m %+ %%first
    CAT_XDEFINE m, %%first, m %+ %%second
    CAT_XDEFINE m, %%second, tmp
    CAT_XDEFINE n, m %+ %%first, %%first
    CAT_XDEFINE n, m %+ %%second, %%second
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro
577
; SAVE_MM_PERMUTATION name
; Records the current m0..m<num_mmregs-1> bindings as <name>_m0, <name>_m1,
; ... so that they can be reinstated later under that name.
%macro SAVE_MM_PERMUTATION 1
    %assign %%reg 0
    %rep num_mmregs
        CAT_XDEFINE %1_m, %%reg, m %+ %%reg
        %assign %%reg %%reg+1
    %endrep
%endmacro
585
; LOAD_MM_PERMUTATION name
; Reinstates the bindings recorded under <name>: every mN is set from
; <name>_mN, and the reverse lookup for the register it now names is refreshed.
%macro LOAD_MM_PERMUTATION 1
    %assign %%reg 0
    %rep num_mmregs
        CAT_XDEFINE m, %%reg, %1_m %+ %%reg
        CAT_XDEFINE n, m %+ %%reg, %%reg
        %assign %%reg %%reg+1
    %endrep
%endmacro
594
; call target
; Wraps the call instruction. When a register permutation was saved under the
; target's name (<target>_m0 is defined), that permutation is loaded after the
; call, so the caller's mN names match where the callee left its results.
%macro call 1
    call %1
    %ifdef %1_m0
        LOAD_MM_PERMUTATION %1
    %endif
%endmacro
601
3f87f39c 602;Substitutions that reduce instruction size but are functionally equivalent
3f87f39c
JA
; add dst, src
; Size-reducing substitution: "add dst, 128" is emitted as "sub dst, -128",
; because -128 fits a sign-extended 8-bit immediate and +128 does not. Every
; other operand, numeric or not, is emitted as a plain add.
%macro add 2
    %assign %%is_128 0
    %ifnum %2
        ; only compare once the operand is known to be a number
        %if %2==128
            %assign %%is_128 1
        %endif
    %endif
    %if %%is_128
        sub %1, -128
    %else
        add %1, %2
    %endif
%endmacro
614
; sub dst, src
; Size-reducing substitution: "sub dst, 128" is emitted as "add dst, -128",
; because -128 fits a sign-extended 8-bit immediate and +128 does not. Every
; other operand, numeric or not, is emitted as a plain sub.
%macro sub 2
    %assign %%is_128 0
    %ifnum %2
        ; only compare once the operand is known to be a number
        %if %2==128
            %assign %%is_128 1
        %endif
    %endif
    %if %%is_128
        add %1, -128
    %else
        sub %1, %2
    %endif
%endmacro