x86inc: Clear __SECT__
libavutil/x86/x86inc.asm
bafad220 1;*****************************************************************************
2f7f2e4b 2;* x86inc.asm: x264asm abstraction layer
bafad220 3;*****************************************************************************
71155665 4;* Copyright (C) 2005-2013 x264 project
bafad220 5;*
2966cc18
JGG
6;* Authors: Loren Merritt <lorenm@u.washington.edu>
7;* Anton Mitrofanov <BugMaster@narod.ru>
79793f83 8;* Fiona Glaser <fiona@x264.com>
ad7d7d4f 9;* Henrik Gramner <henrik@gramner.com>
bafad220 10;*
2966cc18
JGG
11;* Permission to use, copy, modify, and/or distribute this software for any
12;* purpose with or without fee is hereby granted, provided that the above
13;* copyright notice and this permission notice appear in all copies.
bafad220 14;*
2966cc18
JGG
15;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
16;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
17;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
18;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
21;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
bafad220
LM
22;*****************************************************************************
23
2966cc18
JGG
24; This is a header file for the x264ASM assembly language, which uses
25; NASM/YASM syntax combined with a large number of macros to provide easy
26; abstraction between different calling conventions (x86_32, win64, linux64).
27; It also has various other useful features to simplify writing the kind of
28; DSP functions that are most often used in x264.
29
30; Unlike the rest of x264, this file is available under an ISC license, as it
31; has significant usefulness outside of x264 and we want it to be available
32; to the largest audience possible. Of course, if you modify it for your own
33; purposes to add a new feature, we strongly encourage contributing a patch
34; as this feature might be useful for others as well. Send patches or ideas
35; to x264-devel@videolan.org .
36
ef5d41a5
DB
37%ifndef private_prefix
38 %define private_prefix x264
012f73e2 39%endif
2966cc18 40
d633d12b
DB
41%ifndef public_prefix
42 %define public_prefix private_prefix
43%endif
44
3b15a6d7 45%define WIN64 0
96c9cc10 46%define UNIX64 0
3b15a6d7 47%if ARCH_X86_64
3f87f39c 48 %ifidn __OUTPUT_FORMAT__,win32
3b15a6d7 49 %define WIN64 1
166f3993
HY
50 %elifidn __OUTPUT_FORMAT__,win64
51 %define WIN64 1
47f9d7ce
DB
52 %elifidn __OUTPUT_FORMAT__,x64
53 %define WIN64 1
3f87f39c 54 %else
3b15a6d7 55 %define UNIX64 1
3f87f39c
JA
56 %endif
57%endif
58
2966cc18
JGG
59%ifdef PREFIX
60 %define mangle(x) _ %+ x
61%else
62 %define mangle(x) x
63%endif
64
ad7d7d4f
HG
65; aout does not support align=
66; NOTE: This section is out of sync with x264, in order to
67; keep supporting OS/2.
3f87f39c 68%macro SECTION_RODATA 0-1 16
ad7d7d4f 69 %ifidn __OUTPUT_FORMAT__,aout
d69f9a42 70 section .text
bafad220 71 %else
3f87f39c 72 SECTION .rodata align=%1
bafad220
LM
73 %endif
74%endmacro
75
d69f9a42
DY
76%macro SECTION_TEXT 0-1 16
77 %ifidn __OUTPUT_FORMAT__,aout
78 SECTION .text
79 %else
80 SECTION .text align=%1
81 %endif
82%endmacro
83
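; Usage sketch (hypothetical constant table): the alignment argument is
; optional and defaults to 16, e.g.
;   SECTION_RODATA 32
;   pw_1: times 16 dw 1          ; hypothetical 32-byte constant of dw 1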
3b15a6d7 84%if WIN64
3f87f39c 85 %define PIC
412b248e 86%elif ARCH_X86_64 == 0
2966cc18
JGG
87; x86_32 doesn't require PIC.
88; Some distros prefer shared objects to be PIC, but nothing breaks if
89; the code contains a few textrels, so we'll skip that complexity.
3f87f39c
JA
90 %undef PIC
91%endif
92%ifdef PIC
2966cc18 93 default rel
bafad220
LM
94%endif
95
180d43bc
MR
96%macro CPUNOP 1
97 %if HAVE_CPUNOP
98 CPU %1
99 %endif
100%endmacro
101
bafad220
LM
102; Macros to eliminate most code duplication between x86_32 and x86_64:
103; Currently this works only for leaf functions which load all their arguments
104; into registers at the start, and make no other use of the stack. Luckily that
105; covers most of x264's asm.
106
107; PROLOGUE:
108; %1 = number of arguments. loads them from stack if needed.
3f87f39c
JA
109; %2 = number of registers used. pushes callee-saved regs if needed.
110; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
6f40e9f0
RB
111; %4 = (optional) stack size to be allocated. If the stack is not properly
112; aligned (x86-32 ICC 10.x, MSVC or YMM), it will be manually aligned
113; (to 16 or 32 bytes), and an extra register will be allocated to hold the
114; original stack pointer (so as not to invalidate r0m etc.). To prevent the
115; use of an extra register as stack pointer, request a negative stack size.
116; %4+/%5+ = list of names to define to registers
bafad220
LM
117; PROLOGUE can also be invoked by adding the same options to cglobal
118
119; e.g.
29e4edbb 120; cglobal foo, 2,3,0, dst, src, tmp
3f87f39c 121; declares a function (foo), taking two args (dst and src) and one additional register (tmp)
bafad220
LM
122
123; TODO Some functions can use some args directly from the stack. If they're the
124; last args then you can just not declare them, but if they're in the middle
125; we need a more flexible macro.
126
127; RET:
2f7f2e4b 128; Pops anything that was pushed by PROLOGUE, and returns.
bafad220
LM
129
130; REP_RET:
25cb0c1a 131; Use this instead of RET if it's a branch target.
bafad220 132
3f87f39c
JA
133; registers:
134; rN and rNq are the native-size register holding function argument N
135; rNd, rNw, rNb are dword, word, and byte size
96c9cc10 136; rNh is the high 8 bits of the word size
3f87f39c
JA
137; rNm is the original location of arg N (a register or on the stack), dword
138; rNmp is native size
139
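; Usage sketch tying the above together (hypothetical leaf function): with
; "cglobal foo, 2,3,0, dst, src, tmp", dstq/srcq/tmpq name the native-size
; registers holding r0/r1/r2 and dstd/srcd/tmpd their dword parts, on every
; calling convention:
;   cglobal foo, 2,3,0, dst, src, tmp
;       mov   tmpd, [srcq]       ; hypothetical: copy one dword from src to dst
;       mov   [dstq], tmpd
;       RET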
96c9cc10 140%macro DECLARE_REG 2-3
bafad220 141 %define r%1q %2
96c9cc10
RB
142 %define r%1d %2d
143 %define r%1w %2w
144 %define r%1b %2b
145 %define r%1h %2h
7a1944b9 146 %define %2q %2
96c9cc10
RB
147 %if %0 == 2
148 %define r%1m %2d
3f87f39c 149 %define r%1mp %2
3b15a6d7 150 %elif ARCH_X86_64 ; memory
6f40e9f0 151 %define r%1m [rstk + stack_offset + %3]
0995ad8d 152 %define r%1mp qword r %+ %1 %+ m
3f87f39c 153 %else
6f40e9f0 154 %define r%1m [rstk + stack_offset + %3]
0995ad8d 155 %define r%1mp dword r %+ %1 %+ m
3f87f39c 156 %endif
bafad220
LM
157 %define r%1 %2
158%endmacro
159
96c9cc10 160%macro DECLARE_REG_SIZE 3
bafad220
LM
161 %define r%1q r%1
162 %define e%1q r%1
163 %define r%1d e%1
164 %define e%1d e%1
165 %define r%1w %1
166 %define e%1w %1
96c9cc10
RB
167 %define r%1h %3
168 %define e%1h %3
bafad220
LM
169 %define r%1b %2
170 %define e%1b %2
3b15a6d7 171%if ARCH_X86_64 == 0
bafad220
LM
172 %define r%1 e%1
173%endif
174%endmacro
175
96c9cc10
RB
176DECLARE_REG_SIZE ax, al, ah
177DECLARE_REG_SIZE bx, bl, bh
178DECLARE_REG_SIZE cx, cl, ch
179DECLARE_REG_SIZE dx, dl, dh
180DECLARE_REG_SIZE si, sil, null
181DECLARE_REG_SIZE di, dil, null
182DECLARE_REG_SIZE bp, bpl, null
bafad220 183
3f87f39c
JA
184; t# defines for when per-arch register allocation is more complex than just function arguments
185
186%macro DECLARE_REG_TMP 1-*
187 %assign %%i 0
188 %rep %0
189 CAT_XDEFINE t, %%i, r%1
190 %assign %%i %%i+1
191 %rotate 1
192 %endrep
193%endmacro
194
195%macro DECLARE_REG_TMP_SIZE 0-*
196 %rep %0
197 %define t%1q t%1 %+ q
198 %define t%1d t%1 %+ d
199 %define t%1w t%1 %+ w
96c9cc10 200 %define t%1h t%1 %+ h
3f87f39c
JA
201 %define t%1b t%1 %+ b
202 %rotate 1
203 %endrep
204%endmacro
205
729f90e2 206DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
3f87f39c 207
3b15a6d7 208%if ARCH_X86_64
bafad220
LM
209 %define gprsize 8
210%else
211 %define gprsize 4
212%endif
213
214%macro PUSH 1
215 push %1
6f40e9f0
RB
216 %ifidn rstk, rsp
217 %assign stack_offset stack_offset+gprsize
218 %endif
bafad220
LM
219%endmacro
220
221%macro POP 1
222 pop %1
6f40e9f0
RB
223 %ifidn rstk, rsp
224 %assign stack_offset stack_offset-gprsize
225 %endif
bafad220
LM
226%endmacro
227
729f90e2
HG
228%macro PUSH_IF_USED 1-*
229 %rep %0
230 %if %1 < regs_used
231 PUSH r%1
232 %endif
233 %rotate 1
234 %endrep
235%endmacro
236
237%macro POP_IF_USED 1-*
238 %rep %0
239 %if %1 < regs_used
240 pop r%1
241 %endif
242 %rotate 1
243 %endrep
244%endmacro
245
246%macro LOAD_IF_USED 1-*
247 %rep %0
248 %if %1 < num_args
249 mov r%1, r %+ %1 %+ mp
250 %endif
251 %rotate 1
252 %endrep
253%endmacro
254
bafad220
LM
255%macro SUB 2
256 sub %1, %2
6f40e9f0 257 %ifidn %1, rstk
bafad220
LM
258 %assign stack_offset stack_offset+(%2)
259 %endif
260%endmacro
261
262%macro ADD 2
263 add %1, %2
6f40e9f0 264 %ifidn %1, rstk
bafad220
LM
265 %assign stack_offset stack_offset-(%2)
266 %endif
267%endmacro
268
269%macro movifnidn 2
270 %ifnidn %1, %2
271 mov %1, %2
272 %endif
273%endmacro
274
275%macro movsxdifnidn 2
276 %ifnidn %1, %2
277 movsxd %1, %2
278 %endif
279%endmacro
280
281%macro ASSERT 1
282 %if (%1) == 0
283 %error assert failed
284 %endif
285%endmacro
286
287%macro DEFINE_ARGS 0-*
288 %ifdef n_arg_names
289 %assign %%i 0
290 %rep n_arg_names
291 CAT_UNDEF arg_name %+ %%i, q
292 CAT_UNDEF arg_name %+ %%i, d
293 CAT_UNDEF arg_name %+ %%i, w
96c9cc10 294 CAT_UNDEF arg_name %+ %%i, h
bafad220 295 CAT_UNDEF arg_name %+ %%i, b
2f77923d 296 CAT_UNDEF arg_name %+ %%i, m
98b9da2a 297 CAT_UNDEF arg_name %+ %%i, mp
bafad220
LM
298 CAT_UNDEF arg_name, %%i
299 %assign %%i %%i+1
300 %endrep
301 %endif
302
0f53d0cf
LM
303 %xdefine %%stack_offset stack_offset
304 %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
bafad220
LM
305 %assign %%i 0
306 %rep %0
307 %xdefine %1q r %+ %%i %+ q
308 %xdefine %1d r %+ %%i %+ d
309 %xdefine %1w r %+ %%i %+ w
96c9cc10 310 %xdefine %1h r %+ %%i %+ h
bafad220 311 %xdefine %1b r %+ %%i %+ b
2f77923d 312 %xdefine %1m r %+ %%i %+ m
98b9da2a 313 %xdefine %1mp r %+ %%i %+ mp
bafad220
LM
314 CAT_XDEFINE arg_name, %%i, %1
315 %assign %%i %%i+1
316 %rotate 1
317 %endrep
0f53d0cf
LM
318 %xdefine stack_offset %%stack_offset
319 %assign n_arg_names %0
bafad220
LM
320%endmacro
321
6f40e9f0
RB
322%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
323 %ifnum %1
324 %if %1 != 0
325 %assign %%stack_alignment ((mmsize + 15) & ~15)
326 %assign stack_size %1
327 %if stack_size < 0
328 %assign stack_size -stack_size
329 %endif
bbe4a6db
HG
330 %assign stack_size_padded stack_size
331 %if WIN64
332 %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space
333 %if mmsize != 8
334 %assign xmm_regs_used %2
335 %if xmm_regs_used > 8
336 %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16
337 %endif
338 %endif
a34d9ad9 339 %endif
6f40e9f0 340 %if mmsize <= 16 && HAVE_ALIGNED_STACK
bbe4a6db 341 %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
6f40e9f0
RB
342 SUB rsp, stack_size_padded
343 %else
a34d9ad9
RB
344 %assign %%reg_num (regs_used - 1)
345 %xdefine rstk r %+ %%reg_num
6f40e9f0
RB
346 ; align stack, and save original stack location directly above
347 ; it, i.e. in [rsp+stack_size_padded], so we can restore the
348 ; stack in a single instruction (i.e. mov rsp, rstk or mov
349 ; rsp, [rsp+stack_size_padded])
350 mov rstk, rsp
6f40e9f0
RB
351 %if %1 < 0 ; need to store rsp on stack
352 sub rsp, gprsize+stack_size_padded
353 and rsp, ~(%%stack_alignment-1)
354 %xdefine rstkm [rsp+stack_size_padded]
355 mov rstkm, rstk
356 %else ; can keep rsp in rstk during whole function
357 sub rsp, stack_size_padded
358 and rsp, ~(%%stack_alignment-1)
359 %xdefine rstkm rstk
360 %endif
361 %endif
bbe4a6db 362 WIN64_PUSH_XMM
6f40e9f0
RB
363 %endif
364 %endif
365%endmacro
366
367%macro SETUP_STACK_POINTER 1
368 %ifnum %1
369 %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32)
370 %if %1 > 0
371 %assign regs_used (regs_used + 1)
372 %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
373 %warning "Stack pointer will overwrite register argument"
374 %endif
375 %endif
376 %endif
377%endmacro
378
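; Usage sketch for stack allocation (hypothetical function): a positive %4
; reserves that many bytes of (re)aligned scratch space, released again by
; RET; it should only be addressed relative to rsp:
;   cglobal bar, 1,2,1, 32, dst, tmp
;       mova  [rsp], m0          ; rsp points at 32 bytes of aligned scratch
;       mova  m0, [rsp]
;       RET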
379%macro DEFINE_ARGS_INTERNAL 3+
380 %ifnum %2
381 DEFINE_ARGS %3
382 %elif %1 == 4
383 DEFINE_ARGS %2
384 %elif %1 > 4
385 DEFINE_ARGS %2, %3
386 %endif
387%endmacro
388
3b15a6d7 389%if WIN64 ; Windows x64 ;=================================================
bafad220 390
96c9cc10
RB
391DECLARE_REG 0, rcx
392DECLARE_REG 1, rdx
393DECLARE_REG 2, R8
394DECLARE_REG 3, R9
395DECLARE_REG 4, R10, 40
396DECLARE_REG 5, R11, 48
397DECLARE_REG 6, rax, 56
398DECLARE_REG 7, rdi, 64
399DECLARE_REG 8, rsi, 72
400DECLARE_REG 9, rbx, 80
401DECLARE_REG 10, rbp, 88
402DECLARE_REG 11, R12, 96
403DECLARE_REG 12, R13, 104
404DECLARE_REG 13, R14, 112
405DECLARE_REG 14, R15, 120
3f87f39c 406
6f40e9f0 407%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
729f90e2 408 %assign num_args %1
3f87f39c 409 %assign regs_used %2
729f90e2 410 ASSERT regs_used >= num_args
a34d9ad9 411 SETUP_STACK_POINTER %4
729f90e2
HG
412 ASSERT regs_used <= 15
413 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
6f40e9f0
RB
414 ALLOC_STACK %4, %3
415 %if mmsize != 8 && stack_size == 0
9cf73853
HG
416 WIN64_SPILL_XMM %3
417 %endif
729f90e2 418 LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
6f40e9f0
RB
419 DEFINE_ARGS_INTERNAL %0, %4, %5
420%endmacro
421
422%macro WIN64_PUSH_XMM 0
bbe4a6db
HG
423 ; Use the shadow space to store XMM6 and XMM7; the rest needs stack space allocated.
424 %if xmm_regs_used > 6
425 movaps [rstk + stack_offset + 8], xmm6
426 %endif
427 %if xmm_regs_used > 7
428 movaps [rstk + stack_offset + 24], xmm7
429 %endif
430 %if xmm_regs_used > 8
431 %assign %%i 8
432 %rep xmm_regs_used-8
433 movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
434 %assign %%i %%i+1
435 %endrep
436 %endif
532e7697
LM
437%endmacro
438
439%macro WIN64_SPILL_XMM 1
440 %assign xmm_regs_used %1
441 ASSERT xmm_regs_used <= 16
bbe4a6db
HG
442 %if xmm_regs_used > 8
443 %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32
444 SUB rsp, stack_size_padded
3f87f39c 445 %endif
bbe4a6db 446 WIN64_PUSH_XMM
3f87f39c
JA
447%endmacro
448
532e7697 449%macro WIN64_RESTORE_XMM_INTERNAL 1
bbe4a6db
HG
450 %assign %%pad_size 0
451 %if xmm_regs_used > 8
3f87f39c 452 %assign %%i xmm_regs_used
bbe4a6db 453 %rep xmm_regs_used-8
3f87f39c 454 %assign %%i %%i-1
bbe4a6db 455 movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
3f87f39c 456 %endrep
6f40e9f0
RB
457 %endif
458 %if stack_size_padded > 0
459 %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
460 mov rsp, rstkm
461 %else
462 add %1, stack_size_padded
bbe4a6db 463 %assign %%pad_size stack_size_padded
6f40e9f0 464 %endif
3f87f39c 465 %endif
bbe4a6db
HG
466 %if xmm_regs_used > 7
467 movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
468 %endif
469 %if xmm_regs_used > 6
470 movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
471 %endif
3f87f39c
JA
472%endmacro
473
532e7697
LM
474%macro WIN64_RESTORE_XMM 1
475 WIN64_RESTORE_XMM_INTERNAL %1
6f40e9f0 476 %assign stack_offset (stack_offset-stack_size_padded)
3f87f39c
JA
477 %assign xmm_regs_used 0
478%endmacro
479
6f40e9f0 480%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
96c9cc10 481
3f87f39c 482%macro RET 0
532e7697 483 WIN64_RESTORE_XMM_INTERNAL rsp
729f90e2 484 POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
30b45d9c
RB
485%if mmsize == 32
486 vzeroupper
487%endif
25cb0c1a 488 AUTO_REP_RET
bafad220
LM
489%endmacro
490
3b15a6d7 491%elif ARCH_X86_64 ; *nix x64 ;=============================================
bafad220 492
96c9cc10
RB
493DECLARE_REG 0, rdi
494DECLARE_REG 1, rsi
495DECLARE_REG 2, rdx
496DECLARE_REG 3, rcx
497DECLARE_REG 4, R8
498DECLARE_REG 5, R9
499DECLARE_REG 6, rax, 8
500DECLARE_REG 7, R10, 16
501DECLARE_REG 8, R11, 24
502DECLARE_REG 9, rbx, 32
503DECLARE_REG 10, rbp, 40
504DECLARE_REG 11, R12, 48
505DECLARE_REG 12, R13, 56
506DECLARE_REG 13, R14, 64
507DECLARE_REG 14, R15, 72
bafad220 508
6f40e9f0 509%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
729f90e2
HG
510 %assign num_args %1
511 %assign regs_used %2
512 ASSERT regs_used >= num_args
a34d9ad9 513 SETUP_STACK_POINTER %4
729f90e2
HG
514 ASSERT regs_used <= 15
515 PUSH_IF_USED 9, 10, 11, 12, 13, 14
6f40e9f0 516 ALLOC_STACK %4
729f90e2 517 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
6f40e9f0 518 DEFINE_ARGS_INTERNAL %0, %4, %5
bafad220
LM
519%endmacro
520
6f40e9f0 521%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
96c9cc10 522
bafad220 523%macro RET 0
6f40e9f0
RB
524%if stack_size_padded > 0
525%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
526 mov rsp, rstkm
527%else
528 add rsp, stack_size_padded
529%endif
530%endif
729f90e2 531 POP_IF_USED 14, 13, 12, 11, 10, 9
30b45d9c
RB
532%if mmsize == 32
533 vzeroupper
534%endif
25cb0c1a 535 AUTO_REP_RET
bafad220
LM
536%endmacro
537
bafad220
LM
538%else ; X86_32 ;==============================================================
539
96c9cc10
RB
540DECLARE_REG 0, eax, 4
541DECLARE_REG 1, ecx, 8
542DECLARE_REG 2, edx, 12
543DECLARE_REG 3, ebx, 16
544DECLARE_REG 4, esi, 20
545DECLARE_REG 5, edi, 24
546DECLARE_REG 6, ebp, 28
bafad220
LM
547%define rsp esp
548
729f90e2
HG
549%macro DECLARE_ARG 1-*
550 %rep %0
6f40e9f0 551 %define r%1m [rstk + stack_offset + 4*%1 + 4]
729f90e2
HG
552 %define r%1mp dword r%1m
553 %rotate 1
554 %endrep
bafad220
LM
555%endmacro
556
729f90e2 557DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
bafad220 558
6f40e9f0 559%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
729f90e2 560 %assign num_args %1
bafad220 561 %assign regs_used %2
a34d9ad9
RB
562 ASSERT regs_used >= num_args
563 %if num_args > 7
564 %assign num_args 7
565 %endif
729f90e2
HG
566 %if regs_used > 7
567 %assign regs_used 7
568 %endif
6f40e9f0
RB
569 SETUP_STACK_POINTER %4
570 ASSERT regs_used <= 7
729f90e2 571 PUSH_IF_USED 3, 4, 5, 6
6f40e9f0 572 ALLOC_STACK %4
729f90e2 573 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
6f40e9f0 574 DEFINE_ARGS_INTERNAL %0, %4, %5
bafad220
LM
575%endmacro
576
6f40e9f0 577%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
96c9cc10 578
bafad220 579%macro RET 0
6f40e9f0
RB
580%if stack_size_padded > 0
581%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
582 mov rsp, rstkm
583%else
584 add rsp, stack_size_padded
585%endif
586%endif
729f90e2 587 POP_IF_USED 6, 5, 4, 3
30b45d9c
RB
588%if mmsize == 32
589 vzeroupper
590%endif
25cb0c1a 591 AUTO_REP_RET
bafad220
LM
592%endmacro
593
bafad220
LM
594%endif ;======================================================================
595
3b15a6d7 596%if WIN64 == 0
532e7697
LM
597%macro WIN64_SPILL_XMM 1
598%endmacro
599%macro WIN64_RESTORE_XMM 1
600%endmacro
6f40e9f0
RB
601%macro WIN64_PUSH_XMM 0
602%endmacro
532e7697
LM
603%endif
604
25cb0c1a
LM
605; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
606; a branch or a branch target. So switch to a 2-byte form of ret in that case.
607; We can automatically detect "follows a branch", but not a branch target.
608; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
96c9cc10
RB
609%macro REP_RET 0
610 %if has_epilogue
611 RET
612 %else
613 rep ret
614 %endif
615%endmacro
616
25cb0c1a
LM
617%define last_branch_adr $$
618%macro AUTO_REP_RET 0
619 %ifndef cpuflags
620 times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr.
621 %elif notcpuflag(ssse3)
622 times ((last_branch_adr-$)>>31)+1 rep
623 %endif
624 ret
625%endmacro
626
627%macro BRANCH_INSTR 0-*
628 %rep %0
629 %macro %1 1-2 %1
630 %2 %1
631 %%branch_instr:
632 %xdefine last_branch_adr %%branch_instr
633 %endmacro
634 %rotate 1
635 %endrep
636%endmacro
637
638BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
639
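; Usage sketch (hypothetical code): a ret that is itself a branch target
; cannot be detected by AUTO_REP_RET, so REP_RET is used there explicitly:
;       test  lend, lend
;       jz    .skip
;       mova  [dstq], m0
;   .skip:
;       REP_RET                  ; branch target: keep the 2-byte form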
96c9cc10
RB
640%macro TAIL_CALL 2 ; callee, is_nonadjacent
641 %if has_epilogue
642 call %1
643 RET
644 %elif %2
645 jmp %1
646 %endif
647%endmacro
648
bafad220
LM
649;=============================================================================
650; arch-independent part
651;=============================================================================
652
653%assign function_align 16
654
2f7f2e4b
LM
655; Begin a function.
656; Applies any symbol mangling needed for C linkage, and sets up a define such that
657; subsequent uses of the function name automatically refer to the mangled version.
658; Appends cpuflags to the function name if cpuflags has been specified.
d633d12b
DB
659; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
660; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
a34d9ad9 661%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
d633d12b
DB
662 cglobal_internal 1, %1 %+ SUFFIX, %2
663%endmacro
664%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
665 cglobal_internal 0, %1 %+ SUFFIX, %2
666%endmacro
667%macro cglobal_internal 2-3+
668 %if %1
669 %xdefine %%FUNCTION_PREFIX private_prefix
670 %xdefine %%VISIBILITY hidden
671 %else
672 %xdefine %%FUNCTION_PREFIX public_prefix
673 %xdefine %%VISIBILITY
674 %endif
675 %ifndef cglobaled_%2
676 %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
677 %xdefine %2.skip_prologue %2 %+ .skip_prologue
678 CAT_XDEFINE cglobaled_, %2, 1
2f7f2e4b 679 %endif
d633d12b 680 %xdefine current_function %2
bafad220 681 %ifidn __OUTPUT_FORMAT__,elf
d633d12b 682 global %2:function %%VISIBILITY
bafad220 683 %else
d633d12b 684 global %2
bafad220
LM
685 %endif
686 align function_align
d633d12b 687 %2:
bbe4a6db
HG
688 RESET_MM_PERMUTATION ; needed for x86-64, also makes disassembly somewhat nicer
689 %xdefine rstk rsp ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
690 %assign stack_offset 0 ; stack pointer offset relative to the return address
691 %assign stack_size 0 ; amount of stack space that can be freely used inside a function
692 %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
693 %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
d633d12b
DB
694 %ifnidn %3, ""
695 PROLOGUE %3
bafad220
LM
696 %endif
697%endmacro
698
699%macro cextern 1
ef5d41a5 700 %xdefine %1 mangle(private_prefix %+ _ %+ %1)
2f7f2e4b 701 CAT_XDEFINE cglobaled_, %1, 1
2966cc18
JGG
702 extern %1
703%endmacro
704
2f7f2e4b 705; like cextern, but without the prefix
2966cc18
JGG
706%macro cextern_naked 1
707 %xdefine %1 mangle(%1)
2f7f2e4b 708 CAT_XDEFINE cglobaled_, %1, 1
3f87f39c 709 extern %1
bafad220
LM
710%endmacro
711
71155665 712%macro const 1-2+
ef5d41a5 713 %xdefine %1 mangle(private_prefix %+ _ %+ %1)
ad76e6e7
HG
714 %ifidn __OUTPUT_FORMAT__,elf
715 global %1:data hidden
716 %else
717 global %1
718 %endif
2966cc18
JGG
719 %1: %2
720%endmacro
721
bafad220
LM
722; This is needed for ELF; otherwise the GNU linker assumes the stack is
723; executable by default.
724%ifidn __OUTPUT_FORMAT__,elf
dd4d709b 725[section .note.GNU-stack noalloc noexec nowrite progbits]
bafad220
LM
726%endif
727
dd4d709b
TG
728; Overrides the default .text section.
729; Silences warnings when defining structures.
730%define __SECT__
731
2f7f2e4b
LM
732; cpuflags
733
734%assign cpuflags_mmx (1<<0)
735%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
736%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
ca844b7b 737%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
2f7f2e4b
LM
738%assign cpuflags_sse (1<<4) | cpuflags_mmx2
739%assign cpuflags_sse2 (1<<5) | cpuflags_sse
740%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
741%assign cpuflags_sse3 (1<<7) | cpuflags_sse2
742%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3
743%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3
744%assign cpuflags_sse42 (1<<10)| cpuflags_sse4
745%assign cpuflags_avx (1<<11)| cpuflags_sse42
746%assign cpuflags_xop (1<<12)| cpuflags_avx
747%assign cpuflags_fma4 (1<<13)| cpuflags_avx
96c9cc10
RB
748%assign cpuflags_avx2 (1<<14)| cpuflags_avx
749%assign cpuflags_fma3 (1<<15)| cpuflags_avx
2f7f2e4b
LM
750
751%assign cpuflags_cache32 (1<<16)
752%assign cpuflags_cache64 (1<<17)
753%assign cpuflags_slowctz (1<<18)
754%assign cpuflags_lzcnt (1<<19)
3e2fa991
HG
755%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant
756%assign cpuflags_atom (1<<21)
757%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt
758%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
2f7f2e4b
LM
759
760%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
761%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
762
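; Usage sketch (hypothetical snippet): cpuflag()/notcpuflag() allow
; compile-time branching inside a function body, e.g. emulating pabsw on
; targets without SSSE3:
;   %if cpuflag(ssse3)
;       pabsw  m0, m0
;   %else
;       pxor   m1, m1
;       psubw  m1, m0
;       pmaxsw m0, m1            ; |x| = max(x, -x)
;   %endif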
f629705b 763; Takes an arbitrary number of cpuflags from the above list.
2f7f2e4b
LM
764; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
765; You shouldn't need to invoke this macro directly; it's a subroutine for INIT_MMX & co.
f629705b
HG
766%macro INIT_CPUFLAGS 0-*
767 %xdefine SUFFIX
768 %undef cpuname
769 %assign cpuflags 0
770
2f7f2e4b 771 %if %0 >= 1
f629705b
HG
772 %rep %0
773 %ifdef cpuname
774 %xdefine cpuname cpuname %+ _%1
775 %else
776 %xdefine cpuname %1
777 %endif
778 %assign cpuflags cpuflags | cpuflags_%1
779 %rotate 1
780 %endrep
2f7f2e4b 781 %xdefine SUFFIX _ %+ cpuname
f629705b 782
2f7f2e4b
LM
783 %if cpuflag(avx)
784 %assign avx_enabled 1
785 %endif
c108ba01 786 %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
f2bd8a07
JR
787 %define mova movaps
788 %define movu movups
789 %define movnta movntps
790 %endif
2f7f2e4b
LM
791 %if cpuflag(aligned)
792 %define movu mova
f629705b 793 %elif cpuflag(sse3) && notcpuflag(ssse3)
2f7f2e4b
LM
794 %define movu lddqu
795 %endif
f629705b
HG
796 %endif
797
798 %if cpuflag(sse2)
799 CPUNOP amdnop
2f7f2e4b 800 %else
f629705b 801 CPUNOP basicnop
2f7f2e4b
LM
802 %endif
803%endmacro
804
3fb78e99 805; Merge mmx and sse*
176a0fca
HG
806; m# is a simd register of the currently selected size
807; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
808; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
3fb78e99 809; (All 3 remain in sync through SWAP.)
bafad220
LM
810
811%macro CAT_XDEFINE 3
812 %xdefine %1%2 %3
813%endmacro
814
815%macro CAT_UNDEF 2
816 %undef %1%2
817%endmacro
818
2f7f2e4b 819%macro INIT_MMX 0-1+
33cbfa6f 820 %assign avx_enabled 0
2f7f2e4b 821 %define RESET_MM_PERMUTATION INIT_MMX %1
bafad220
LM
822 %define mmsize 8
823 %define num_mmregs 8
824 %define mova movq
825 %define movu movq
826 %define movh movd
532e7697 827 %define movnta movntq
bafad220
LM
828 %assign %%i 0
829 %rep 8
830 CAT_XDEFINE m, %%i, mm %+ %%i
ec217218 831 CAT_XDEFINE nnmm, %%i, %%i
bafad220
LM
832 %assign %%i %%i+1
833 %endrep
834 %rep 8
835 CAT_UNDEF m, %%i
ec217218 836 CAT_UNDEF nnmm, %%i
bafad220
LM
837 %assign %%i %%i+1
838 %endrep
2f7f2e4b 839 INIT_CPUFLAGS %1
bafad220
LM
840%endmacro
841
2f7f2e4b 842%macro INIT_XMM 0-1+
33cbfa6f 843 %assign avx_enabled 0
2f7f2e4b 844 %define RESET_MM_PERMUTATION INIT_XMM %1
bafad220
LM
845 %define mmsize 16
846 %define num_mmregs 8
3b15a6d7 847 %if ARCH_X86_64
bafad220
LM
848 %define num_mmregs 16
849 %endif
850 %define mova movdqa
851 %define movu movdqu
852 %define movh movq
532e7697 853 %define movnta movntdq
bafad220
LM
854 %assign %%i 0
855 %rep num_mmregs
856 CAT_XDEFINE m, %%i, xmm %+ %%i
ec217218 857 CAT_XDEFINE nnxmm, %%i, %%i
bafad220
LM
858 %assign %%i %%i+1
859 %endrep
2f7f2e4b 860 INIT_CPUFLAGS %1
bafad220
LM
861%endmacro
862
2f7f2e4b 863%macro INIT_YMM 0-1+
33cbfa6f 864 %assign avx_enabled 1
2f7f2e4b 865 %define RESET_MM_PERMUTATION INIT_YMM %1
33cbfa6f
VS
866 %define mmsize 32
867 %define num_mmregs 8
3b15a6d7 868 %if ARCH_X86_64
33cbfa6f
VS
869 %define num_mmregs 16
870 %endif
c108ba01
HG
871 %define mova movdqa
872 %define movu movdqu
2f7f2e4b 873 %undef movh
c108ba01 874 %define movnta movntdq
33cbfa6f
VS
875 %assign %%i 0
876 %rep num_mmregs
877 CAT_XDEFINE m, %%i, ymm %+ %%i
878 CAT_XDEFINE nnymm, %%i, %%i
879 %assign %%i %%i+1
880 %endrep
2f7f2e4b 881 INIT_CPUFLAGS %1
33cbfa6f
VS
882%endmacro
883
2f7f2e4b 884INIT_XMM
bafad220 885
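; Usage sketch (hypothetical function): INIT_* passes its arguments to
; INIT_CPUFLAGS, so the same body can be assembled per instruction set with
; automatically suffixed and mangled names:
;   INIT_XMM sse2
;   cglobal sum_u8, 2,2,2, buf, len    ; becomes e.g. x264_sum_u8_sse2
;   INIT_XMM avx
;   cglobal sum_u8, 2,2,2, buf, len    ; becomes e.g. x264_sum_u8_avx
; (the exact prefix depends on private_prefix and PREFIX).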
3fb78e99
LM
886%macro DECLARE_MMCAST 1
887 %define mmmm%1 mm%1
888 %define mmxmm%1 mm%1
889 %define mmymm%1 mm%1
890 %define xmmmm%1 mm%1
891 %define xmmxmm%1 xmm%1
892 %define xmmymm%1 xmm%1
893 %define ymmmm%1 mm%1
176a0fca 894 %define ymmxmm%1 xmm%1
3fb78e99
LM
895 %define ymmymm%1 ymm%1
896 %define xm%1 xmm %+ m%1
897 %define ym%1 ymm %+ m%1
898%endmacro
899
900%assign i 0
901%rep 16
902 DECLARE_MMCAST i
903%assign i i+1
904%endrep
905
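; Usage sketch (hypothetical snippet, assumes INIT_YMM avx2): m0 names ymm0,
; xm0 its xmm half, and both track any later SWAP of register 0:
;   vextracti128 xm1, m0, 1
;   paddw        xm0, xm1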
bafad220
LM
906; I often want to use macros that permute their arguments. e.g. there's no
907; efficient way to implement butterfly or transpose or dct without swapping some
908; arguments.
909;
910; I would like to not have to manually keep track of the permutations:
911; If I insert a permutation in the middle of a function, it should automatically
912; change everything that follows. For more complex macros I may also have multiple
913; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
914;
915; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
916; permutes its arguments. It's equivalent to exchanging the contents of the
917; registers, except that this way you exchange the register names instead, so it
918; doesn't cost any cycles.
919
920%macro PERMUTE 2-* ; takes a list of pairs to swap
921%rep %0/2
49ebe3f9 922 %xdefine %%tmp%2 m%2
bafad220
LM
923 %rotate 2
924%endrep
925%rep %0/2
49ebe3f9 926 %xdefine m%1 %%tmp%2
ec217218 927 CAT_XDEFINE nn, m%1, %1
bafad220
LM
928 %rotate 2
929%endrep
930%endmacro
931
49ebe3f9
LM
932%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
933%ifnum %1 ; SWAP 0, 1, ...
934 SWAP_INTERNAL_NUM %1, %2
935%else ; SWAP m0, m1, ...
936 SWAP_INTERNAL_NAME %1, %2
bafad220 937%endif
49ebe3f9
LM
938%endmacro
939
940%macro SWAP_INTERNAL_NUM 2-*
941 %rep %0-1
942 %xdefine %%tmp m%1
943 %xdefine m%1 m%2
944 %xdefine m%2 %%tmp
ec217218
LM
945 CAT_XDEFINE nn, m%1, %1
946 CAT_XDEFINE nn, m%2, %2
bafad220 947 %rotate 1
49ebe3f9
LM
948 %endrep
949%endmacro
950
951%macro SWAP_INTERNAL_NAME 2-*
ec217218 952 %xdefine %%args nn %+ %1
49ebe3f9 953 %rep %0-1
ec217218 954 %xdefine %%args %%args, nn %+ %2
49ebe3f9
LM
955 %rotate 1
956 %endrep
957 SWAP_INTERNAL_NUM %%args
bafad220
LM
958%endmacro
959
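; Usage sketch (hypothetical snippet): SWAP only renames registers, so it
; costs no instructions; the numeric and the named form resolve to the same
; renaming:
;   mova  m0, [srcq]
;   mova  m1, [srcq+16]
;   SWAP  0, 1               ; m0 now refers to xmm1 and m1 to xmm0 (no moves)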
2f7f2e4b
LM
960; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
961; calls to that function will automatically load the permutation, so values can
962; be returned in mmregs.
963%macro SAVE_MM_PERMUTATION 0-1
964 %if %0
965 %xdefine %%f %1_m
966 %else
967 %xdefine %%f current_function %+ _m
968 %endif
bafad220
LM
969 %assign %%i 0
970 %rep num_mmregs
2f7f2e4b 971 CAT_XDEFINE %%f, %%i, m %+ %%i
bafad220
LM
972 %assign %%i %%i+1
973 %endrep
974%endmacro
975
2966cc18 976%macro LOAD_MM_PERMUTATION 1 ; name to load from
2f7f2e4b
LM
977 %ifdef %1_m0
978 %assign %%i 0
979 %rep num_mmregs
980 CAT_XDEFINE m, %%i, %1_m %+ %%i
ec217218 981 CAT_XDEFINE nn, m %+ %%i, %%i
2f7f2e4b
LM
982 %assign %%i %%i+1
983 %endrep
984 %endif
bafad220
LM
985%endmacro
986
2f7f2e4b 987; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
bafad220 988%macro call 1
edd82267 989 call_internal %1 %+ SUFFIX, %1
2f7f2e4b
LM
990%endmacro
991%macro call_internal 2
edd82267
MR
992 %xdefine %%i %2
993 %ifndef cglobaled_%2
994 %ifdef cglobaled_%1
995 %xdefine %%i %1
2f7f2e4b 996 %endif
bafad220 997 %endif
2f7f2e4b
LM
998 call %%i
999 LOAD_MM_PERMUTATION %%i
bafad220
LM
1000%endmacro
1001
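; Usage sketch (hypothetical helper): a callee ending in SAVE_MM_PERMUTATION
; can leave results in renamed mmregs; the call macro above reloads that
; permutation at every later call site:
;   cglobal transpose_helper, 0,0
;       SWAP 0, 1
;       SAVE_MM_PERMUTATION
;       RET
; so after "call transpose_helper" the caller's m0/m1 match the callee's layout.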
2966cc18 1002; Substitutions that reduce instruction size but are functionally equivalent
3f87f39c
JA
1003%macro add 2
1004 %ifnum %2
1005 %if %2==128
1006 sub %1, -128
1007 %else
1008 add %1, %2
1009 %endif
1010 %else
1011 add %1, %2
1012 %endif
1013%endmacro
1014
1015%macro sub 2
1016 %ifnum %2
1017 %if %2==128
1018 add %1, -128
1019 %else
1020 sub %1, %2
1021 %endif
1022 %else
1023 sub %1, %2
1024 %endif
1025%endmacro
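; Rationale sketch: an imm8 is sign-extended, so +128 does not fit in one
; byte while -128 does; e.g. with a hypothetical 32-bit register operand
;   add ecx, 128     ; needs a 32-bit immediate (6 bytes)
;   sub ecx, -128    ; same result with an 8-bit immediate (3 bytes)
; which is the substitution the add/sub overrides above perform automatically.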
33cbfa6f
VS
1026
1027;=============================================================================
1028; AVX abstraction layer
1029;=============================================================================
1030
1031%assign i 0
1032%rep 16
1033 %if i < 8
1034 CAT_XDEFINE sizeofmm, i, 8
1035 %endif
1036 CAT_XDEFINE sizeofxmm, i, 16
1037 CAT_XDEFINE sizeofymm, i, 32
1038%assign i i+1
1039%endrep
1040%undef i
1041
96c9cc10
RB
1042%macro CHECK_AVX_INSTR_EMU 3-*
1043 %xdefine %%opcode %1
1044 %xdefine %%dst %2
1045 %rep %0-2
1046 %ifidn %%dst, %3
1047 %error non-avx emulation of ``%%opcode'' is not supported
1048 %endif
1049 %rotate 1
1050 %endrep
1051%endmacro
1052
33cbfa6f
VS
1053;%1 == instruction
1054;%2 == 1 if float, 0 if int
c108ba01
HG
1055;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
1056;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
33cbfa6f 1057;%5+: operands
c108ba01
HG
1058%macro RUN_AVX_INSTR 5-8+
1059 %ifnum sizeof%6
b7d0d10a 1060 %assign __sizeofreg sizeof%6
c108ba01 1061 %elifnum sizeof%5
b7d0d10a 1062 %assign __sizeofreg sizeof%5
2f7f2e4b 1063 %else
b7d0d10a 1064 %assign __sizeofreg mmsize
2f7f2e4b 1065 %endif
b7d0d10a
LM
1066 %assign __emulate_avx 0
1067 %if avx_enabled && __sizeofreg >= 16
1068 %xdefine __instr v%1
33cbfa6f 1069 %else
b7d0d10a 1070 %xdefine __instr %1
c108ba01 1071 %if %0 >= 7+%3
b7d0d10a 1072 %assign __emulate_avx 1
33cbfa6f 1073 %endif
c108ba01 1074 %endif
33cbfa6f 1075
b7d0d10a
LM
1076 %if __emulate_avx
1077 %xdefine __src1 %6
1078 %xdefine __src2 %7
c108ba01
HG
1079 %ifnidn %5, %6
1080 %if %0 >= 8
1081 CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8
1082 %else
1083 CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
1084 %endif
1085 %if %4 && %3 == 0
1086 %ifnid %7
1087 ; 3-operand AVX instructions with a memory arg can only have it in src2,
1088 ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
1089 ; So, if the instruction is commutative with a memory arg, swap them.
b7d0d10a
LM
1090 %xdefine __src1 %7
1091 %xdefine __src2 %6
33cbfa6f 1092 %endif
c108ba01 1093 %endif
b7d0d10a
LM
1094 %if __sizeofreg == 8
1095 MOVQ %5, __src1
c108ba01 1096 %elif %2
b7d0d10a 1097 MOVAPS %5, __src1
33cbfa6f 1098 %else
b7d0d10a 1099 MOVDQA %5, __src1
33cbfa6f 1100 %endif
33cbfa6f 1101 %endif
c108ba01 1102 %if %0 >= 8
b7d0d10a 1103 %1 %5, __src2, %8
c108ba01 1104 %else
b7d0d10a 1105 %1 %5, __src2
2f7f2e4b 1106 %endif
c108ba01 1107 %elif %0 >= 8
b7d0d10a 1108 __instr %5, %6, %7, %8
c108ba01 1109 %elif %0 == 7
b7d0d10a 1110 __instr %5, %6, %7
c108ba01 1111 %elif %0 == 6
b7d0d10a 1112 __instr %5, %6
2f7f2e4b 1113 %else
b7d0d10a 1114 __instr %5
2f7f2e4b
LM
1115 %endif
1116%endmacro
1117
33cbfa6f
VS
1118;%1 == instruction
1119;%2 == 1 if float, 0 if int
c108ba01
HG
1120;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
1121;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
1122%macro AVX_INSTR 1-4 0, 1, 0
1123 %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4
1124 %ifidn %2, fnord
1125 RUN_AVX_INSTR %6, %7, %8, %9, %1
1126 %elifidn %3, fnord
1127 RUN_AVX_INSTR %6, %7, %8, %9, %1, %2
33cbfa6f 1128 %elifidn %4, fnord
c108ba01 1129 RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3
33cbfa6f 1130 %elifidn %5, fnord
c108ba01 1131 RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4
33cbfa6f 1132 %else
c108ba01 1133 RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5
33cbfa6f
VS
1134 %endif
1135 %endmacro
1136%endmacro
1137
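; Usage sketch (hypothetical snippet): once an instruction is wrapped below,
; a 3-operand form can be written unconditionally; under AVX it maps to the
; VEX encoding, otherwise it is emulated with a register copy:
;   paddw m0, m1, m2
; assembles as "vpaddw xmm0, xmm1, xmm2" in an AVX build and as
;   movdqa xmm0, xmm1
;   paddw  xmm0, xmm2
; in an SSE2 build (the copy is skipped when dst and src1 are already the same).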
c108ba01
HG
1138; Instructions with both VEX and non-VEX encodings
1139; Non-destructive instructions are written without parameters
2f7f2e4b
LM
1140AVX_INSTR addpd, 1, 0, 1
1141AVX_INSTR addps, 1, 0, 1
1142AVX_INSTR addsd, 1, 0, 1
1143AVX_INSTR addss, 1, 0, 1
1144AVX_INSTR addsubpd, 1, 0, 0
1145AVX_INSTR addsubps, 1, 0, 0
c108ba01
HG
1146AVX_INSTR aesdec, 0, 0, 0
1147AVX_INSTR aesdeclast, 0, 0, 0
1148AVX_INSTR aesenc, 0, 0, 0
1149AVX_INSTR aesenclast, 0, 0, 0
1150AVX_INSTR aesimc
1151AVX_INSTR aeskeygenassist
2f7f2e4b
LM
1152AVX_INSTR andnpd, 1, 0, 0
1153AVX_INSTR andnps, 1, 0, 0
c108ba01
HG
1154AVX_INSTR andpd, 1, 0, 1
1155AVX_INSTR andps, 1, 0, 1
2f7f2e4b
LM
1156AVX_INSTR blendpd, 1, 0, 0
1157AVX_INSTR blendps, 1, 0, 0
1158AVX_INSTR blendvpd, 1, 0, 0
1159AVX_INSTR blendvps, 1, 0, 0
2e81acc6
CG
1160AVX_INSTR cmppd, 1, 1, 0
1161AVX_INSTR cmpps, 1, 1, 0
1162AVX_INSTR cmpsd, 1, 1, 0
1163AVX_INSTR cmpss, 1, 1, 0
c108ba01
HG
1164AVX_INSTR comisd
1165AVX_INSTR comiss
1166AVX_INSTR cvtdq2pd
1167AVX_INSTR cvtdq2ps
1168AVX_INSTR cvtpd2dq
1169AVX_INSTR cvtpd2ps
1170AVX_INSTR cvtps2dq
1171AVX_INSTR cvtps2pd
1172AVX_INSTR cvtsd2si
1173AVX_INSTR cvtsd2ss
1174AVX_INSTR cvtsi2sd
1175AVX_INSTR cvtsi2ss
1176AVX_INSTR cvtss2sd
1177AVX_INSTR cvtss2si
1178AVX_INSTR cvttpd2dq
1179AVX_INSTR cvttps2dq
1180AVX_INSTR cvttsd2si
1181AVX_INSTR cvttss2si
2f7f2e4b
LM
1182AVX_INSTR divpd, 1, 0, 0
1183AVX_INSTR divps, 1, 0, 0
1184AVX_INSTR divsd, 1, 0, 0
1185AVX_INSTR divss, 1, 0, 0
1186AVX_INSTR dppd, 1, 1, 0
1187AVX_INSTR dpps, 1, 1, 0
c108ba01 1188AVX_INSTR extractps
2f7f2e4b
LM
1189AVX_INSTR haddpd, 1, 0, 0
1190AVX_INSTR haddps, 1, 0, 0
1191AVX_INSTR hsubpd, 1, 0, 0
1192AVX_INSTR hsubps, 1, 0, 0
c108ba01
HG
1193AVX_INSTR insertps, 1, 1, 0
1194AVX_INSTR lddqu
1195AVX_INSTR ldmxcsr
1196AVX_INSTR maskmovdqu
2f7f2e4b
LM
1197AVX_INSTR maxpd, 1, 0, 1
1198AVX_INSTR maxps, 1, 0, 1
1199AVX_INSTR maxsd, 1, 0, 1
1200AVX_INSTR maxss, 1, 0, 1
1201AVX_INSTR minpd, 1, 0, 1
1202AVX_INSTR minps, 1, 0, 1
1203AVX_INSTR minsd, 1, 0, 1
1204AVX_INSTR minss, 1, 0, 1
c108ba01
HG
1205AVX_INSTR movapd
1206AVX_INSTR movaps
1207AVX_INSTR movd
1208AVX_INSTR movddup
1209AVX_INSTR movdqa
1210AVX_INSTR movdqu
39df0c43 1211AVX_INSTR movhlps, 1, 0, 0
c108ba01
HG
1212AVX_INSTR movhpd, 1, 0, 0
1213AVX_INSTR movhps, 1, 0, 0
39df0c43 1214AVX_INSTR movlhps, 1, 0, 0
c108ba01
HG
1215AVX_INSTR movlpd, 1, 0, 0
1216AVX_INSTR movlps, 1, 0, 0
1217AVX_INSTR movmskpd
1218AVX_INSTR movmskps
1219AVX_INSTR movntdq
1220AVX_INSTR movntdqa
1221AVX_INSTR movntpd
1222AVX_INSTR movntps
1223AVX_INSTR movq
2f7f2e4b 1224AVX_INSTR movsd, 1, 0, 0
c108ba01
HG
1225AVX_INSTR movshdup
1226AVX_INSTR movsldup
2f7f2e4b 1227AVX_INSTR movss, 1, 0, 0
c108ba01
HG
1228AVX_INSTR movupd
1229AVX_INSTR movups
2f7f2e4b
LM
1230AVX_INSTR mpsadbw, 0, 1, 0
1231AVX_INSTR mulpd, 1, 0, 1
1232AVX_INSTR mulps, 1, 0, 1
1233AVX_INSTR mulsd, 1, 0, 1
1234AVX_INSTR mulss, 1, 0, 1
1235AVX_INSTR orpd, 1, 0, 1
1236AVX_INSTR orps, 1, 0, 1
c108ba01
HG
1237AVX_INSTR pabsb
1238AVX_INSTR pabsd
1239AVX_INSTR pabsw
2f7f2e4b
LM
1240AVX_INSTR packsswb, 0, 0, 0
1241AVX_INSTR packssdw, 0, 0, 0
1242AVX_INSTR packuswb, 0, 0, 0
1243AVX_INSTR packusdw, 0, 0, 0
1244AVX_INSTR paddb, 0, 0, 1
1245AVX_INSTR paddw, 0, 0, 1
1246AVX_INSTR paddd, 0, 0, 1
1247AVX_INSTR paddq, 0, 0, 1
1248AVX_INSTR paddsb, 0, 0, 1
1249AVX_INSTR paddsw, 0, 0, 1
1250AVX_INSTR paddusb, 0, 0, 1
1251AVX_INSTR paddusw, 0, 0, 1
1252AVX_INSTR palignr, 0, 1, 0
1253AVX_INSTR pand, 0, 0, 1
1254AVX_INSTR pandn, 0, 0, 0
1255AVX_INSTR pavgb, 0, 0, 1
1256AVX_INSTR pavgw, 0, 0, 1
1257AVX_INSTR pblendvb, 0, 0, 0
1258AVX_INSTR pblendw, 0, 1, 0
c108ba01
HG
1259AVX_INSTR pclmulqdq, 0, 1, 0
1260AVX_INSTR pcmpestri
1261AVX_INSTR pcmpestrm
1262AVX_INSTR pcmpistri
1263AVX_INSTR pcmpistrm
2f7f2e4b
LM
1264AVX_INSTR pcmpeqb, 0, 0, 1
1265AVX_INSTR pcmpeqw, 0, 0, 1
1266AVX_INSTR pcmpeqd, 0, 0, 1
1267AVX_INSTR pcmpeqq, 0, 0, 1
1268AVX_INSTR pcmpgtb, 0, 0, 0
1269AVX_INSTR pcmpgtw, 0, 0, 0
1270AVX_INSTR pcmpgtd, 0, 0, 0
1271AVX_INSTR pcmpgtq, 0, 0, 0
c108ba01
HG
1272AVX_INSTR pextrb
1273AVX_INSTR pextrd
1274AVX_INSTR pextrq
1275AVX_INSTR pextrw
2f7f2e4b
LM
1276AVX_INSTR phaddw, 0, 0, 0
1277AVX_INSTR phaddd, 0, 0, 0
1278AVX_INSTR phaddsw, 0, 0, 0
c108ba01 1279AVX_INSTR phminposuw
2f7f2e4b
LM
1280AVX_INSTR phsubw, 0, 0, 0
1281AVX_INSTR phsubd, 0, 0, 0
1282AVX_INSTR phsubsw, 0, 0, 0
c108ba01
HG
1283AVX_INSTR pinsrb, 0, 1, 0
1284AVX_INSTR pinsrd, 0, 1, 0
1285AVX_INSTR pinsrq, 0, 1, 0
1286AVX_INSTR pinsrw, 0, 1, 0
2f7f2e4b
LM
1287AVX_INSTR pmaddwd, 0, 0, 1
1288AVX_INSTR pmaddubsw, 0, 0, 0
1289AVX_INSTR pmaxsb, 0, 0, 1
1290AVX_INSTR pmaxsw, 0, 0, 1
1291AVX_INSTR pmaxsd, 0, 0, 1
1292AVX_INSTR pmaxub, 0, 0, 1
1293AVX_INSTR pmaxuw, 0, 0, 1
1294AVX_INSTR pmaxud, 0, 0, 1
1295AVX_INSTR pminsb, 0, 0, 1
1296AVX_INSTR pminsw, 0, 0, 1
1297AVX_INSTR pminsd, 0, 0, 1
1298AVX_INSTR pminub, 0, 0, 1
1299AVX_INSTR pminuw, 0, 0, 1
1300AVX_INSTR pminud, 0, 0, 1
c108ba01
HG
1301AVX_INSTR pmovmskb
1302AVX_INSTR pmovsxbw
1303AVX_INSTR pmovsxbd
1304AVX_INSTR pmovsxbq
1305AVX_INSTR pmovsxwd
1306AVX_INSTR pmovsxwq
1307AVX_INSTR pmovsxdq
1308AVX_INSTR pmovzxbw
1309AVX_INSTR pmovzxbd
1310AVX_INSTR pmovzxbq
1311AVX_INSTR pmovzxwd
1312AVX_INSTR pmovzxwq
1313AVX_INSTR pmovzxdq
1314AVX_INSTR pmuldq, 0, 0, 1
2f7f2e4b 1315AVX_INSTR pmulhrsw, 0, 0, 1
c108ba01 1316AVX_INSTR pmulhuw, 0, 0, 1
2f7f2e4b
LM
1317AVX_INSTR pmulhw, 0, 0, 1
1318AVX_INSTR pmullw, 0, 0, 1
1319AVX_INSTR pmulld, 0, 0, 1
1320AVX_INSTR pmuludq, 0, 0, 1
2f7f2e4b
LM
1321AVX_INSTR por, 0, 0, 1
1322AVX_INSTR psadbw, 0, 0, 1
1323AVX_INSTR pshufb, 0, 0, 0
c108ba01
HG
1324AVX_INSTR pshufd
1325AVX_INSTR pshufhw
1326AVX_INSTR pshuflw
2f7f2e4b
LM
1327AVX_INSTR psignb, 0, 0, 0
1328AVX_INSTR psignw, 0, 0, 0
1329AVX_INSTR psignd, 0, 0, 0
1330AVX_INSTR psllw, 0, 0, 0
1331AVX_INSTR pslld, 0, 0, 0
1332AVX_INSTR psllq, 0, 0, 0
1333AVX_INSTR pslldq, 0, 0, 0
1334AVX_INSTR psraw, 0, 0, 0
1335AVX_INSTR psrad, 0, 0, 0
1336AVX_INSTR psrlw, 0, 0, 0
1337AVX_INSTR psrld, 0, 0, 0
1338AVX_INSTR psrlq, 0, 0, 0
1339AVX_INSTR psrldq, 0, 0, 0
1340AVX_INSTR psubb, 0, 0, 0
1341AVX_INSTR psubw, 0, 0, 0
1342AVX_INSTR psubd, 0, 0, 0
1343AVX_INSTR psubq, 0, 0, 0
1344AVX_INSTR psubsb, 0, 0, 0
1345AVX_INSTR psubsw, 0, 0, 0
1346AVX_INSTR psubusb, 0, 0, 0
1347AVX_INSTR psubusw, 0, 0, 0
c108ba01 1348AVX_INSTR ptest
2f7f2e4b
LM
1349AVX_INSTR punpckhbw, 0, 0, 0
1350AVX_INSTR punpckhwd, 0, 0, 0
1351AVX_INSTR punpckhdq, 0, 0, 0
1352AVX_INSTR punpckhqdq, 0, 0, 0
1353AVX_INSTR punpcklbw, 0, 0, 0
1354AVX_INSTR punpcklwd, 0, 0, 0
1355AVX_INSTR punpckldq, 0, 0, 0
1356AVX_INSTR punpcklqdq, 0, 0, 0
1357AVX_INSTR pxor, 0, 0, 1
c108ba01
HG
1358AVX_INSTR rcpps, 1, 0, 0
1359AVX_INSTR rcpss, 1, 0, 0
1360AVX_INSTR roundpd
1361AVX_INSTR roundps
1362AVX_INSTR roundsd
1363AVX_INSTR roundss
1364AVX_INSTR rsqrtps, 1, 0, 0
1365AVX_INSTR rsqrtss, 1, 0, 0
1366AVX_INSTR shufpd, 1, 1, 0
6b6ee582 1367AVX_INSTR shufps, 1, 1, 0
c108ba01
HG
1368AVX_INSTR sqrtpd, 1, 0, 0
1369AVX_INSTR sqrtps, 1, 0, 0
1370AVX_INSTR sqrtsd, 1, 0, 0
1371AVX_INSTR sqrtss, 1, 0, 0
1372AVX_INSTR stmxcsr
2f7f2e4b
LM
1373AVX_INSTR subpd, 1, 0, 0
1374AVX_INSTR subps, 1, 0, 0
1375AVX_INSTR subsd, 1, 0, 0
1376AVX_INSTR subss, 1, 0, 0
c108ba01
HG
1377AVX_INSTR ucomisd
1378AVX_INSTR ucomiss
2f7f2e4b
LM
1379AVX_INSTR unpckhpd, 1, 0, 0
1380AVX_INSTR unpckhps, 1, 0, 0
1381AVX_INSTR unpcklpd, 1, 0, 0
1382AVX_INSTR unpcklps, 1, 0, 0
1383AVX_INSTR xorpd, 1, 0, 1
1384AVX_INSTR xorps, 1, 0, 1
33cbfa6f
VS
1385
1386; 3DNow instructions, for sharing code between AVX, SSE and 3DNow!
2f7f2e4b
LM
1387AVX_INSTR pfadd, 1, 0, 1
1388AVX_INSTR pfsub, 1, 0, 0
1389AVX_INSTR pfmul, 1, 0, 1
1390
1391; base-4 constants for shuffles
1392%assign i 0
1393%rep 256
1394 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
1395 %if j < 10
1396 CAT_XDEFINE q000, j, i
1397 %elif j < 100
1398 CAT_XDEFINE q00, j, i
1399 %elif j < 1000
1400 CAT_XDEFINE q0, j, i
1401 %else
1402 CAT_XDEFINE q, j, i
1403 %endif
1404%assign i i+1
1405%endrep
1406%undef i
1407%undef j
1408
1409%macro FMA_INSTR 3
20689570
DB
1410 %macro %1 4-7 %1, %2, %3
1411 %if cpuflag(xop)
1412 v%5 %1, %2, %3, %4
2f7f2e4b 1413 %else
20689570
DB
1414 %6 %1, %2, %3
1415 %7 %1, %4
2f7f2e4b
LM
1416 %endif
1417 %endmacro
1418%endmacro
1419
1420FMA_INSTR pmacsdd, pmulld, paddd
1421FMA_INSTR pmacsww, pmullw, paddw
1422FMA_INSTR pmadcswd, pmaddwd, paddd
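; Usage sketch (hypothetical snippet): the multiply-accumulate wrappers above
; can be used unconditionally, e.g.
;   pmacsdd m0, m1, m2, m3     ; m0 = (m1 * m2) + m3 on low dwords
; assembles as a single vpmacsdd when cpuflag(xop) and otherwise expands to
;   pmulld m0, m1, m2
;   paddd  m0, m3
; (note the non-XOP fallback still needs SSE4.1 for pmulld).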
96c9cc10
RB
1423
1424; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
1425; This lets us use tzcnt without bumping the yasm version requirement yet.
1426%define tzcnt rep bsf
c6908d6b
JGG
1427
1428; convert FMA4 to FMA3 if possible
1429%macro FMA4_INSTR 4
1430 %macro %1 4-8 %1, %2, %3, %4
1431 %if cpuflag(fma4)
1432 v%5 %1, %2, %3, %4
1433 %elifidn %1, %2
1434 v%6 %1, %4, %3 ; %1 = %1 * %3 + %4
1435 %elifidn %1, %3
1436 v%7 %1, %2, %4 ; %1 = %2 * %1 + %4
1437 %elifidn %1, %4
1438 v%8 %1, %2, %3 ; %1 = %2 * %3 + %1
1439 %else
1440 %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
1441 %endif
1442 %endmacro
1443%endmacro
1444
1445FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
1446FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
1447FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
1448FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss
1449
1450FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
1451FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
1452FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
1453FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps
1454
1455FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
1456FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
1457FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
1458FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss
1459
1460FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
1461FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
1462FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
1463FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss
1464
1465FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
1466FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
1467FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
1468FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss
a3fabc6c
JGG
1469
1470; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug
1471%if ARCH_X86_64 == 0
1472%macro vpbroadcastq 2
1473%if sizeof%1 == 16
1474 movddup %1, %2
1475%else
1476 vbroadcastsd %1, %2
1477%endif
1478%endmacro
1479%endif