x86inc: Support arbitrary stack alignments
[libav.git] / libavutil/x86/x86inc.asm
;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Fiona Glaser <fiona@x264.com>
;*          Henrik Gramner <henrik@gramner.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org.
%ifndef private_prefix
    %define private_prefix x264
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif

%if HAVE_ALIGNED_STACK
    %define STACK_ALIGNMENT 16
%endif
%ifndef STACK_ALIGNMENT
    %if ARCH_X86_64
        %define STACK_ALIGNMENT 16
    %else
        %define STACK_ALIGNMENT 4
    %endif
%endif

%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; aout does not support align=
; NOTE: This section is out of sync with x264, in order to
; keep supporting OS/2.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

%macro CPUNOP 1
    %if HAVE_CPUNOP
        CPU %1
    %endif
%endmacro

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. The stack will be aligned before
;      allocating the specified stack size. If the required stack alignment is
;      larger than the known stack alignment the stack will be manually aligned
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,7,0x40, dst, src, tmp
; declares a function (foo) that automatically loads two arguments (dst and
; src) into registers, uses one additional register (tmp) plus 7 vector
; registers (m0-m6) and allocates 0x40 bytes of stack space.

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
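; Example (an illustrative sketch, not part of this file; "copy16", "dst" and
; "src" are invented names, assuming an SSE2 build):
;     INIT_XMM sse2
;     cglobal copy16, 2,2,1, dst, src
;         mova m0, [srcq]
;         mova [dstq], m0
;         RET
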
%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1  %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
%if ARCH_X86_64 == 0
    %define r%1 e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

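; Example (illustrative; the register choices are hypothetical): pick
; different scratch registers per arch while keeping the code body identical:
;     %if ARCH_X86_64
;         DECLARE_REG_TMP 4, 5
;     %else
;         DECLARE_REG_TMP 1, 2
;     %endif
;     ; t0/t1 (and t0q, t0d, t0w, t0h, t0b) now name the chosen registers
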
%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

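; Example (illustrative; the names are arbitrary): rename the argument
; registers partway through a function:
;     DEFINE_ARGS dst, src, len
;     ; dstq/srcq/lenq now alias r0/r1/r2, with d/w/h/b/m/mp variants
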
%define required_stack_alignment ((mmsize + 15) & ~15)

%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
    %ifnum %1
        %if %1 != 0
            %assign %%pad 0
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %if WIN64
                %assign %%pad %%pad + 32 ; shadow space
                %if mmsize != 8
                    %assign xmm_regs_used %2
                    %if xmm_regs_used > 8
                        %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
                    %endif
                %endif
            %endif
            %if required_stack_alignment <= STACK_ALIGNMENT
                ; maintain the current stack alignment
                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                %if %1 < 0 ; need to store rsp on stack
                    %xdefine rstkm [rsp + stack_size + %%pad]
                    %assign %%pad %%pad + gprsize
                %else ; can keep rsp in rstk during whole function
                    %xdefine rstkm rstk
                %endif
                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
                mov rstk, rsp
                and rsp, ~(required_stack_alignment-1)
                sub rsp, stack_size_padded
                movifnidn rstkm, rstk
            %endif
            WIN64_PUSH_XMM
        %endif
    %endif
%endmacro

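; Example (an illustrative sketch; "blur" and its arguments are invented):
; stack space is normally requested via cglobal rather than by invoking
; ALLOC_STACK directly. A negative size avoids dedicating a register to the
; saved stack pointer when manual alignment is needed:
;     cglobal blur, 2,3,8,-64, dst, src, tmp
;         ; [rsp] through [rsp+63] is aligned scratch space; RET restores rsp
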
%macro SETUP_STACK_POINTER 1
    %ifnum %1
        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
            %if %1 > 0
                %assign regs_used (regs_used + 1)
            %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
                %warning "Stack pointer will overwrite register argument"
            %endif
        %endif
    %endif
%endmacro

%macro DEFINE_ARGS_INTERNAL 3+
    %ifnum %2
        DEFINE_ARGS %3
    %elif %1 == 4
        DEFINE_ARGS %2
    %elif %1 > 4
        DEFINE_ARGS %2, %3
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%macro WIN64_PUSH_XMM 0
    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
    %if xmm_regs_used > 6
        movaps [rstk + stack_offset +  8], xmm6
    %endif
    %if xmm_regs_used > 7
        movaps [rstk + stack_offset + 24], xmm7
    %endif
    %if xmm_regs_used > 8
        %assign %%i 8
        %rep xmm_regs_used-8
            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 8
        ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
        %assign %%pad (xmm_regs_used-8)*16 + 32
        %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
        SUB rsp, stack_size_padded
    %endif
    WIN64_PUSH_XMM
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %assign %%pad_size 0
    %if xmm_regs_used > 8
        %assign %%i xmm_regs_used
        %rep xmm_regs_used-8
            %assign %%i %%i-1
            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
        %endrep
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
            mov rsp, rstkm
        %else
            add %1, stack_size_padded
            %assign %%pad_size stack_size_padded
        %endif
    %endif
    %if xmm_regs_used > 7
        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
    %endif
    %if xmm_regs_used > 6
        movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %assign stack_offset (stack_offset-stack_size_padded)
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
%if mmsize == 32
    vzeroupper
%endif
    AUTO_REP_RET
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0

%macro RET 0
%if stack_size_padded > 0
%if required_stack_alignment > STACK_ALIGNMENT
    mov rsp, rstkm
%else
    add rsp, stack_size_padded
%endif
%endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
%if mmsize == 32
    vzeroupper
%endif
    AUTO_REP_RET
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0

%macro RET 0
%if stack_size_padded > 0
%if required_stack_alignment > STACK_ALIGNMENT
    mov rsp, rstkm
%else
    add rsp, stack_size_padded
%endif
%endif
    POP_IF_USED 6, 5, 4, 3
%if mmsize == 32
    vzeroupper
%endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%macro WIN64_PUSH_XMM 0
%endmacro
%endif

; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
    %if has_epilogue
        RET
    %else
        rep ret
    %endif
%endmacro

%define last_branch_adr $$
%macro AUTO_REP_RET 0
    %ifndef cpuflags
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr.
    %elif notcpuflag(ssse3)
        times ((last_branch_adr-$)>>31)+1 rep
    %endif
    ret
%endmacro

%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %%branch_instr:
            %xdefine last_branch_adr %%branch_instr
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

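; Example (an illustrative sketch; "count_down" is an invented name): a return
; that is itself a branch target, which AUTO_REP_RET cannot detect, so REP_RET
; is used explicitly:
;     cglobal count_down, 2,2
;         test r1, r1
;         jz .end    ; jumps straight to the return
;     .loop:
;         dec r1
;         jnz .loop
;     .end:
;         REP_RET
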
%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
%endmacro

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
    %if %1
        %xdefine %%FUNCTION_PREFIX private_prefix
        %xdefine %%VISIBILITY hidden
    %else
        %xdefine %%FUNCTION_PREFIX public_prefix
        %xdefine %%VISIBILITY
    %endif
    %ifndef cglobaled_%2
        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %ifidn __OUTPUT_FORMAT__,elf
        global %2:function %%VISIBILITY
    %else
        global %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0      ; stack pointer offset relative to the return address
    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 1-2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:data hidden
    %else
        global %1
    %endif
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
[section .note.GNU-stack noalloc noexec nowrite progbits]
%endif

; Overrides the default .text section.
; Silences warnings when defining structures.
%define __SECT__

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx
%assign cpuflags_avx2     (1<<14)| cpuflags_avx
%assign cpuflags_fma3     (1<<15)| cpuflags_avx

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_aligned  (1<<20) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<21)
%assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
%assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1

%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))

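; Example (an illustrative sketch; ABS_M0 is an invented macro): one body
; assembled once per target, branching on cpuflags at assembly time:
;     %macro ABS_M0 0
;         %if cpuflag(ssse3)
;             pabsw  m0, m0
;         %else
;             pxor   m1, m1
;             psubw  m1, m0
;             pmaxsw m0, m1
;         %endif
;     %endmacro
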
; Takes an arbitrary number of cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-*
    %xdefine SUFFIX
    %undef cpuname
    %assign cpuflags 0

    %if %0 >= 1
        %rep %0
            %ifdef cpuname
                %xdefine cpuname cpuname %+ _%1
            %else
                %xdefine cpuname %1
            %endif
            %assign cpuflags cpuflags | cpuflags_%1
            %rotate 1
        %endrep
        %xdefine SUFFIX _ %+ cpuname

        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elif cpuflag(sse3) && notcpuflag(ssse3)
            %define movu lddqu
        %endif
    %endif

    %if cpuflag(sse2)
        CPUNOP amdnop
    %else
        CPUNOP basicnop
    %endif
%endmacro

; Merge mmx and sse*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; (All 3 remain in sync through SWAP.)

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nnmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nnmm, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nnxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nnymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM

%macro DECLARE_MMCAST 1
    %define  mmmm%1   mm%1
    %define  mmxmm%1  mm%1
    %define  mmymm%1  mm%1
    %define xmmmm%1   mm%1
    %define xmmxmm%1 xmm%1
    %define xmmymm%1 xmm%1
    %define ymmmm%1   mm%1
    %define ymmxmm%1 xmm%1
    %define ymmymm%1 ymm%1
    %define xm%1 xmm %+ m%1
    %define ym%1 ymm %+ m%1
%endmacro

%assign i 0
%rep 16
    DECLARE_MMCAST i
%assign i i+1
%endrep

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine %%tmp%2 m%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 %%tmp%2
    CAT_XDEFINE nn, m%1, %1
    %rotate 2
%endrep
%endmacro

%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
%ifnum %1 ; SWAP 0, 1, ...
    SWAP_INTERNAL_NUM %1, %2
%else ; SWAP m0, m1, ...
    SWAP_INTERNAL_NAME %1, %2
%endif
%endmacro

%macro SWAP_INTERNAL_NUM 2-*
    %rep %0-1
        %xdefine %%tmp m%1
        %xdefine m%1 m%2
        %xdefine m%2 %%tmp
        CAT_XDEFINE nn, m%1, %1
        CAT_XDEFINE nn, m%2, %2
        %rotate 1
    %endrep
%endmacro

%macro SWAP_INTERNAL_NAME 2-*
    %xdefine %%args nn %+ %1
    %rep %0-1
        %xdefine %%args %%args, nn %+ %2
        %rotate 1
    %endrep
    SWAP_INTERNAL_NUM %%args
%endmacro

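; Example (illustrative): exchange the names of m0 and m2 so that later code
; finds the results where it expects them, at zero runtime cost:
;     SWAP 0, 2    ; registers are renamed, no instructions are emitted
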
; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE nn, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1 %+ SUFFIX, %1
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

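; Example (an illustrative sketch; "transpose4x4" is an invented name):
;     cglobal transpose4x4
;         ...
;         SWAP 1, 2
;         SAVE_MM_PERMUTATION
;         ret
;     ; a later "call transpose4x4" reloads the saved permutation, so the
;     ; caller's m1/m2 refer to the values the callee left there
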
; Substitutions that reduce instruction size but are functionally equivalent
; (a sign-extended 8-bit immediate can encode -128 but not +128, so adding 128
; is one byte smaller when written as subtracting -128, and vice versa)
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
%assign i i+1
%endrep
%undef i

%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
;%5+: operands
%macro RUN_AVX_INSTR 5-8+
    %ifnum sizeof%6
        %assign __sizeofreg sizeof%6
    %elifnum sizeof%5
        %assign __sizeofreg sizeof%5
    %else
        %assign __sizeofreg mmsize
    %endif
    %assign __emulate_avx 0
    %if avx_enabled && __sizeofreg >= 16
        %xdefine __instr v%1
    %else
        %xdefine __instr %1
        %if %0 >= 7+%3
            %assign __emulate_avx 1
        %endif
    %endif

    %if __emulate_avx
        %xdefine __src1 %6
        %xdefine __src2 %7
        %ifnidn %5, %6
            %if %0 >= 8
                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8
            %else
                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
            %endif
            %if %4 && %3 == 0
                %ifnid %7
                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
                    ; So, if the instruction is commutative with a memory arg, swap them.
                    %xdefine __src1 %7
                    %xdefine __src2 %6
                %endif
            %endif
            %if __sizeofreg == 8
                MOVQ %5, __src1
            %elif %2
                MOVAPS %5, __src1
            %else
                MOVDQA %5, __src1
            %endif
        %endif
        %if %0 >= 8
            %1 %5, __src2, %8
        %else
            %1 %5, __src2
        %endif
    %elif %0 >= 8
        __instr %5, %6, %7, %8
    %elif %0 == 7
        __instr %5, %6, %7
    %elif %0 == 6
        __instr %5, %6
    %else
        __instr %5
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 1-4 0, 1, 0
    %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %2, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %1
        %elifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro

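; Example (illustrative): with the wrappers declared below, three-operand
; syntax assembles everywhere; under INIT_XMM avx it emits a single vaddps,
; while under INIT_XMM sse it becomes a movaps plus a two-operand addps:
;     addps m0, m1, m2
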
; Instructions with both VEX and non-VEX encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 1
AVX_INSTR addss, 1, 0, 1
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR aesdec, 0, 0, 0
AVX_INSTR aesdeclast, 0, 0, 0
AVX_INSTR aesenc, 0, 0, 0
AVX_INSTR aesenclast, 0, 0, 0
AVX_INSTR aesimc
AVX_INSTR aeskeygenassist
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR blendpd, 1, 0, 0
AVX_INSTR blendps, 1, 0, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 1, 0
AVX_INSTR cmpps, 1, 1, 0
AVX_INSTR cmpsd, 1, 1, 0
AVX_INSTR cmpss, 1, 1, 0
AVX_INSTR comisd
AVX_INSTR comiss
AVX_INSTR cvtdq2pd
AVX_INSTR cvtdq2ps
AVX_INSTR cvtpd2dq
AVX_INSTR cvtpd2ps
AVX_INSTR cvtps2dq
AVX_INSTR cvtps2pd
AVX_INSTR cvtsd2si
AVX_INSTR cvtsd2ss
AVX_INSTR cvtsi2sd
AVX_INSTR cvtsi2ss
AVX_INSTR cvtss2sd
AVX_INSTR cvtss2si
AVX_INSTR cvttpd2dq
AVX_INSTR cvttps2dq
AVX_INSTR cvttsd2si
AVX_INSTR cvttss2si
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR extractps
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR insertps, 1, 1, 0
AVX_INSTR lddqu
AVX_INSTR ldmxcsr
AVX_INSTR maskmovdqu
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 1
AVX_INSTR maxss, 1, 0, 1
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 1
AVX_INSTR minss, 1, 0, 1
AVX_INSTR movapd
AVX_INSTR movaps
AVX_INSTR movd
AVX_INSTR movddup
AVX_INSTR movdqa
AVX_INSTR movdqu
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movhpd, 1, 0, 0
AVX_INSTR movhps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movlpd, 1, 0, 0
AVX_INSTR movlps, 1, 0, 0
AVX_INSTR movmskpd
AVX_INSTR movmskps
AVX_INSTR movntdq
AVX_INSTR movntdqa
AVX_INSTR movntpd
AVX_INSTR movntps
AVX_INSTR movq
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movshdup
AVX_INSTR movsldup
AVX_INSTR movss, 1, 0, 0
AVX_INSTR movupd
AVX_INSTR movups
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR pabsb
AVX_INSTR pabsd
AVX_INSTR pabsw
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pclmulqdq, 0, 1, 0
AVX_INSTR pcmpestri
AVX_INSTR pcmpestrm
AVX_INSTR pcmpistri
AVX_INSTR pcmpistrm
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR pextrb
AVX_INSTR pextrd
AVX_INSTR pextrq
AVX_INSTR pextrw
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phminposuw
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pinsrb, 0, 1, 0
AVX_INSTR pinsrd, 0, 1, 0
AVX_INSTR pinsrq, 0, 1, 0
AVX_INSTR pinsrw, 0, 1, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmovmskb
AVX_INSTR pmovsxbw
AVX_INSTR pmovsxbd
AVX_INSTR pmovsxbq
AVX_INSTR pmovsxwd
AVX_INSTR pmovsxwq
AVX_INSTR pmovsxdq
AVX_INSTR pmovzxbw
AVX_INSTR pmovzxbd
AVX_INSTR pmovzxbq
AVX_INSTR pmovzxwd
AVX_INSTR pmovzxwq
AVX_INSTR pmovzxdq
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR pshufd
AVX_INSTR pshufhw
AVX_INSTR pshuflw
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR ptest
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR rcpps, 1, 0, 0
AVX_INSTR rcpss, 1, 0, 0
AVX_INSTR roundpd
AVX_INSTR roundps
AVX_INSTR roundsd
AVX_INSTR roundss
AVX_INSTR rsqrtps, 1, 0, 0
AVX_INSTR rsqrtss, 1, 0, 0
AVX_INSTR shufpd, 1, 1, 0
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR sqrtpd, 1, 0, 0
AVX_INSTR sqrtps, 1, 0, 0
AVX_INSTR sqrtsd, 1, 0, 0
AVX_INSTR sqrtss, 1, 0, 0
AVX_INSTR stmxcsr
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR ucomisd
AVX_INSTR ucomiss
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
%assign i i+1
%endrep
%undef i
%undef j

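; Example (illustrative): a q-constant lists, from the highest destination
; element to the lowest, which source element each one selects, so q0123
; reverses dword order:
;     pshufd m0, m1, q0123    ; equivalent to the raw immediate 0x1b
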
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %elifnidn %1, %4
            %6 %1, %2, %3
            %7 %1, %4
        %else
            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA_INSTR  pmacsww,  pmullw, paddw
FMA_INSTR  pmacsdd,  pmulld, paddd ; sse4 emulation
FMA_INSTR pmacsdql,  pmuldq, paddq ; sse4 emulation
FMA_INSTR pmadcswd, pmaddwd, paddd

; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf

; convert FMA4 to FMA3 if possible
%macro FMA4_INSTR 4
    %macro %1 4-8 %1, %2, %3, %4
        %if cpuflag(fma4)
            v%5 %1, %2, %3, %4
        %elifidn %1, %2
            v%6 %1, %4, %3 ; %1 = %1 * %3 + %4
        %elifidn %1, %3
            v%7 %1, %2, %4 ; %1 = %2 * %1 + %4
        %elifidn %1, %4
            v%8 %1, %2, %3 ; %1 = %2 * %3 + %1
        %else
            %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss

FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps

FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss

FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss

FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss

; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug
%if ARCH_X86_64 == 0
%macro vpbroadcastq 2
%if sizeof%1 == 16
    movddup %1, %2
%else
    vbroadcastsd %1, %2
%endif
%endmacro
%endif