x86inc: Free up variable name "n" in global namespace
[libav.git] / libavutil / x86 / x86inc.asm
;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2013 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Fiona Glaser <fiona@x264.com>
;*          Henrik Gramner <henrik@gramner.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .

%ifndef private_prefix
    %define private_prefix x264
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif

%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

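; For instance, with PREFIX defined (as it is on targets whose C symbols are
; underscore-prefixed, e.g. win32), mangle(foo) expands to _foo; elsewhere it
; expands to plain foo.
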
; aout does not support align=
; NOTE: This section is out of sync with x264, in order to
; keep supporting OS/2.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

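; A minimal usage sketch (the constant name is hypothetical): request a
; 32-byte-aligned read-only section for a table used by 256-bit code:
;   SECTION_RODATA 32
;   pw_1: times 16 dw 1
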
%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

%macro CPUNOP 1
    %if HAVE_CPUNOP
        CPU %1
    %endif
%endmacro

; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
CPUNOP amdnop

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x,
;      MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes),
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

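; Putting the above together, a hypothetical leaf function might look like:
;   cglobal double_u32, 2,3,0, dst, src, tmp
;       mov   tmpd, [srcq]   ; named regs: tmpd == r2d, srcq == r1
;       add   tmpd, tmpd
;       mov   [dstq], tmpd
;       RET
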
%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

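; Example (hypothetical): give stable names to scratch registers whose mapping
; differs between argument layouts:
;   DECLARE_REG_TMP 0, 2   ; t0 = r0, t1 = r2
;   mov t0d, t1d           ; sized aliases (t0d, t0w, ...) come from DECLARE_REG_TMP_SIZE
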
%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

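; Example (hypothetical): once the original argument names are dead, rebind
; the registers to more meaningful names mid-function:
;   DEFINE_ARGS dst, stride, height   ; dstq == r0, strideq == r1, heightd == r2d
;   sub heightd, 1
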
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
    %ifnum %1
        %if %1 != 0
            %assign %%stack_alignment ((mmsize + 15) & ~15)
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %assign stack_size_padded stack_size
            %if WIN64
                %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space
                %if mmsize != 8
                    %assign xmm_regs_used %2
                    %if xmm_regs_used > 8
                        %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16
                    %endif
                %endif
            %endif
            %if mmsize <= 16 && HAVE_ALIGNED_STACK
                %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                mov rstk, rsp
                %if %1 < 0 ; need to store rsp on stack
                    sub rsp, gprsize+stack_size_padded
                    and rsp, ~(%%stack_alignment-1)
                    %xdefine rstkm [rsp+stack_size_padded]
                    mov rstkm, rstk
                %else ; can keep rsp in rstk during whole function
                    sub rsp, stack_size_padded
                    and rsp, ~(%%stack_alignment-1)
                    %xdefine rstkm rstk
                %endif
            %endif
            WIN64_PUSH_XMM
        %endif
    %endif
%endmacro

%macro SETUP_STACK_POINTER 1
    %ifnum %1
        %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32)
            %if %1 > 0
                %assign regs_used (regs_used + 1)
            %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
                %warning "Stack pointer will overwrite register argument"
            %endif
        %endif
    %endif
%endmacro

%macro DEFINE_ARGS_INTERNAL 3+
    %ifnum %2
        DEFINE_ARGS %3
    %elif %1 == 4
        DEFINE_ARGS %2
    %elif %1 > 4
        DEFINE_ARGS %2, %3
    %endif
%endmacro

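; Example (hypothetical): ask PROLOGUE for 64 bytes of scratch space via the
; 4th cglobal parameter; rsp is aligned to 16 or 32 bytes when required:
;   cglobal filter_rows, 3,5,8, 64, dst, src, stride
;       mova [rsp], m0   ; up to stack_size bytes usable at [rsp]
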
%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%macro WIN64_PUSH_XMM 0
    ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
    %if xmm_regs_used > 6
        movaps [rstk + stack_offset +  8], xmm6
    %endif
    %if xmm_regs_used > 7
        movaps [rstk + stack_offset + 24], xmm7
    %endif
    %if xmm_regs_used > 8
        %assign %%i 8
        %rep xmm_regs_used-8
            movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 8
        %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32
        SUB rsp, stack_size_padded
    %endif
    WIN64_PUSH_XMM
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %assign %%pad_size 0
    %if xmm_regs_used > 8
        %assign %%i xmm_regs_used
        %rep xmm_regs_used-8
            %assign %%i %%i-1
            movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
        %endrep
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
            mov rsp, rstkm
        %else
            add %1, stack_size_padded
            %assign %%pad_size stack_size_padded
        %endif
    %endif
    %if xmm_regs_used > 7
        movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
    %endif
    %if xmm_regs_used > 6
        movaps xmm6, [%1 + stack_offset - %%pad_size +  8]
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %assign stack_offset (stack_offset-stack_size_padded)
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if mmsize == 32
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

3b15a6d7 494%elif ARCH_X86_64 ; *nix x64 ;=============================================
bafad220 495
96c9cc10
RB
496DECLARE_REG 0, rdi
497DECLARE_REG 1, rsi
498DECLARE_REG 2, rdx
499DECLARE_REG 3, rcx
500DECLARE_REG 4, R8
501DECLARE_REG 5, R9
502DECLARE_REG 6, rax, 8
503DECLARE_REG 7, R10, 16
504DECLARE_REG 8, R11, 24
505DECLARE_REG 9, rbx, 32
506DECLARE_REG 10, rbp, 40
507DECLARE_REG 11, R12, 48
508DECLARE_REG 12, R13, 56
509DECLARE_REG 13, R14, 64
510DECLARE_REG 14, R15, 72
bafad220 511
6f40e9f0 512%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
729f90e2
HG
513 %assign num_args %1
514 %assign regs_used %2
515 ASSERT regs_used >= num_args
a34d9ad9 516 SETUP_STACK_POINTER %4
729f90e2
HG
517 ASSERT regs_used <= 15
518 PUSH_IF_USED 9, 10, 11, 12, 13, 14
6f40e9f0 519 ALLOC_STACK %4
729f90e2 520 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
6f40e9f0 521 DEFINE_ARGS_INTERNAL %0, %4, %5
bafad220
LM
522%endmacro
523
6f40e9f0 524%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
96c9cc10 525
bafad220 526%macro RET 0
6f40e9f0
RB
527%if stack_size_padded > 0
528%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
529 mov rsp, rstkm
530%else
531 add rsp, stack_size_padded
532%endif
533%endif
729f90e2 534 POP_IF_USED 14, 13, 12, 11, 10, 9
30b45d9c
RB
535%if mmsize == 32
536 vzeroupper
537%endif
25cb0c1a 538 AUTO_REP_RET
bafad220
LM
539%endmacro
540
%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0

%macro RET 0
    %if stack_size_padded > 0
        %if mmsize == 32 || HAVE_ALIGNED_STACK == 0
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 6, 5, 4, 3
    %if mmsize == 32
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

3b15a6d7 599%if WIN64 == 0
532e7697
LM
600%macro WIN64_SPILL_XMM 1
601%endmacro
602%macro WIN64_RESTORE_XMM 1
603%endmacro
6f40e9f0
RB
604%macro WIN64_PUSH_XMM 0
605%endmacro
532e7697
LM
606%endif
607
25cb0c1a
LM
608; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
609; a branch or a branch target. So switch to a 2-byte form of ret in that case.
610; We can automatically detect "follows a branch", but not a branch target.
611; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
96c9cc10
RB
612%macro REP_RET 0
613 %if has_epilogue
614 RET
615 %else
616 rep ret
617 %endif
618%endmacro
619
25cb0c1a
LM
620%define last_branch_adr $$
621%macro AUTO_REP_RET 0
622 %ifndef cpuflags
623 times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr.
624 %elif notcpuflag(ssse3)
625 times ((last_branch_adr-$)>>31)+1 rep
626 %endif
627 ret
628%endmacro
629
630%macro BRANCH_INSTR 0-*
631 %rep %0
632 %macro %1 1-2 %1
633 %2 %1
634 %%branch_instr:
635 %xdefine last_branch_adr %%branch_instr
636 %endmacro
637 %rotate 1
638 %endrep
639%endmacro
640
641BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp
642
96c9cc10
RB
643%macro TAIL_CALL 2 ; callee, is_nonadjacent
644 %if has_epilogue
645 call %1
646 RET
647 %elif %2
648 jmp %1
649 %endif
650%endmacro
651
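; Example (hypothetical): a function whose ret is a branch target should end
; in REP_RET; AUTO_REP_RET handles the "immediately follows a branch" case:
;   .loop:
;       sub r2d, 1
;       jg .loop
;       REP_RET
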
;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
    %if %1
        %xdefine %%FUNCTION_PREFIX private_prefix
        %xdefine %%VISIBILITY hidden
    %else
        %xdefine %%FUNCTION_PREFIX public_prefix
        %xdefine %%VISIBILITY
    %endif
    %ifndef cglobaled_%2
        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %ifidn __OUTPUT_FORMAT__,elf
        global %2:function %%VISIBILITY
    %else
        global %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
    %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
    %assign stack_offset 0      ; stack pointer offset relative to the return address
    %assign stack_size 0        ; amount of stack space that can be freely used inside a function
    %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
    %assign xmm_regs_used 0     ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 1-2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:data hidden
    %else
        global %1
    %endif
    %1: %2
%endmacro

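; Example (hypothetical table): export data as <private_prefix>_deadzone so C
; code can reference it:
;   const deadzone, dw 21, 11, 21, 11
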
; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx
%assign cpuflags_avx2     (1<<14)| cpuflags_avx
%assign cpuflags_fma3     (1<<15)| cpuflags_avx

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_aligned  (1<<20) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<21)
%assign cpuflags_bmi1     (1<<22)|cpuflags_lzcnt
%assign cpuflags_bmi2     (1<<23)|cpuflags_bmi1

%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))

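; Example: cpuflag() enables per-cpu paths inside a shared body:
;   %if cpuflag(ssse3)
;       pshufb m0, m1
;   %else
;       psrlw  m0, 8   ; hypothetical sse2 fallback
;   %endif
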
; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-2
    CPUNOP amdnop
    %if %0 >= 1
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, sse3
            %define movu lddqu
        %endif
        %if notcpuflag(sse2)
            CPUNOP basicnop
        %endif
    %else
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %endif
%endmacro

; Merge mmx and sse*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; (All 3 remain in sync through SWAP.)

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nnmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nnmm, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nnxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %undef movh
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nnymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM

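; Example (hypothetical): build two variants of the same function; the SUFFIX
; set by INIT_* makes cglobal emit foo_sse2 and foo_avx respectively:
;   INIT_XMM sse2
;   cglobal foo, 2,2,1, dst, src
;       mova m0, [srcq]
;       mova [dstq], m0
;       RET
;   INIT_XMM avx
;   cglobal foo, 2,2,1, dst, src   ; same body, assembles with VEX encoding
;       mova m0, [srcq]
;       mova [dstq], m0
;       RET
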
%macro DECLARE_MMCAST 1
    %define  mmmm%1   mm%1
    %define  mmxmm%1  mm%1
    %define  mmymm%1  mm%1
    %define xmmmm%1   mm%1
    %define xmmxmm%1 xmm%1
    %define xmmymm%1 xmm%1
    %define ymmmm%1   mm%1
    %define ymmxmm%1 xmm%1
    %define ymmymm%1 ymm%1
    %define xm%1 xmm %+ m%1
    %define ym%1 ymm %+ m%1
%endmacro

%assign i 0
%rep 16
    DECLARE_MMCAST i
%assign i i+1
%endrep

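; Example (hypothetical): with INIT_YMM active, xm0 names the xmm view of m0,
; letting narrow loads feed full-width math:
;   movq  xm0, [srcq]   ; 64-bit load into the low part of the ymm register
;   addps m0, m1        ; 256-bit operation on the same register
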
; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
    %rep %0/2
        %xdefine %%tmp%2 m%2
        %rotate 2
    %endrep
    %rep %0/2
        %xdefine m%1 %%tmp%2
        CAT_XDEFINE nn, m%1, %1
        %rotate 2
    %endrep
%endmacro

%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
    %ifnum %1 ; SWAP 0, 1, ...
        SWAP_INTERNAL_NUM %1, %2
    %else ; SWAP m0, m1, ...
        SWAP_INTERNAL_NAME %1, %2
    %endif
%endmacro

%macro SWAP_INTERNAL_NUM 2-*
    %rep %0-1
        %xdefine %%tmp m%1
        %xdefine m%1 m%2
        %xdefine m%2 %%tmp
        CAT_XDEFINE nn, m%1, %1
        CAT_XDEFINE nn, m%2, %2
        %rotate 1
    %endrep
%endmacro

%macro SWAP_INTERNAL_NAME 2-*
    %xdefine %%args nn %+ %1
    %rep %0-1
        %xdefine %%args %%args, nn %+ %2
        %rotate 1
    %endrep
    SWAP_INTERNAL_NUM %%args
%endmacro

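; Example (hypothetical): keep "the result is in m0" true without a register
; copy by swapping names instead of contents:
;   paddw m1, m0
;   SWAP  0, 1   ; m0 now refers to what was m1; costs zero instructions
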
; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE nn, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1 %+ SUFFIX, %1
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

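; Why the special case: +128 does not fit in a signed 8-bit immediate, but
; -128 does, so "sub reg, -128" encodes smaller than "add reg, 128".
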
;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
%assign i i+1
%endrep
%undef i

%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
;%5+: operands
%macro RUN_AVX_INSTR 5-8+
    %ifnum sizeof%6
        %assign __sizeofreg sizeof%6
    %elifnum sizeof%5
        %assign __sizeofreg sizeof%5
    %else
        %assign __sizeofreg mmsize
    %endif
    %assign __emulate_avx 0
    %if avx_enabled && __sizeofreg >= 16
        %xdefine __instr v%1
    %else
        %xdefine __instr %1
        %if %0 >= 7+%3
            %assign __emulate_avx 1
        %endif
    %endif

    %if __emulate_avx
        %xdefine __src1 %6
        %xdefine __src2 %7
        %ifnidn %5, %6
            %if %0 >= 8
                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8
            %else
                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
            %endif
            %if %4 && %3 == 0
                %ifnid %7
                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
                    ; So, if the instruction is commutative with a memory arg, swap them.
                    %xdefine __src1 %7
                    %xdefine __src2 %6
                %endif
            %endif
            %if __sizeofreg == 8
                MOVQ %5, __src1
            %elif %2
                MOVAPS %5, __src1
            %else
                MOVDQA %5, __src1
            %endif
        %endif
        %if %0 >= 8
            %1 %5, __src2, %8
        %else
            %1 %5, __src2
        %endif
    %elif %0 >= 8
        __instr %5, %6, %7, %8
    %elif %0 == 7
        __instr %5, %6, %7
    %elif %0 == 6
        __instr %5, %6
    %else
        __instr %5
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 1-4 0, 1, 0
    %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %2, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %1
        %elifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro

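; Example (hypothetical registers): after the definitions below, a 3-operand
; form works everywhere:
;   paddw m0, m1, m2
; assembles to vpaddw on AVX targets and is emulated on pre-AVX targets as
;   movdqa m0, m1
;   paddw  m0, m2
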
; Instructions with both VEX and non-VEX encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 1
AVX_INSTR addss, 1, 0, 1
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR aesdec, 0, 0, 0
AVX_INSTR aesdeclast, 0, 0, 0
AVX_INSTR aesenc, 0, 0, 0
AVX_INSTR aesenclast, 0, 0, 0
AVX_INSTR aesimc
AVX_INSTR aeskeygenassist
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR blendpd, 1, 0, 0
AVX_INSTR blendps, 1, 0, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 1, 0
AVX_INSTR cmpps, 1, 1, 0
AVX_INSTR cmpsd, 1, 1, 0
AVX_INSTR cmpss, 1, 1, 0
AVX_INSTR comisd
AVX_INSTR comiss
AVX_INSTR cvtdq2pd
AVX_INSTR cvtdq2ps
AVX_INSTR cvtpd2dq
AVX_INSTR cvtpd2ps
AVX_INSTR cvtps2dq
AVX_INSTR cvtps2pd
AVX_INSTR cvtsd2si
AVX_INSTR cvtsd2ss
AVX_INSTR cvtsi2sd
AVX_INSTR cvtsi2ss
AVX_INSTR cvtss2sd
AVX_INSTR cvtss2si
AVX_INSTR cvttpd2dq
AVX_INSTR cvttps2dq
AVX_INSTR cvttsd2si
AVX_INSTR cvttss2si
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR extractps
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR insertps, 1, 1, 0
AVX_INSTR lddqu
AVX_INSTR ldmxcsr
AVX_INSTR maskmovdqu
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 1
AVX_INSTR maxss, 1, 0, 1
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 1
AVX_INSTR minss, 1, 0, 1
AVX_INSTR movapd
AVX_INSTR movaps
AVX_INSTR movd
AVX_INSTR movddup
AVX_INSTR movdqa
AVX_INSTR movdqu
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movhpd, 1, 0, 0
AVX_INSTR movhps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movlpd, 1, 0, 0
AVX_INSTR movlps, 1, 0, 0
AVX_INSTR movmskpd
AVX_INSTR movmskps
AVX_INSTR movntdq
AVX_INSTR movntdqa
AVX_INSTR movntpd
AVX_INSTR movntps
AVX_INSTR movq
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movshdup
AVX_INSTR movsldup
AVX_INSTR movss, 1, 0, 0
AVX_INSTR movupd
AVX_INSTR movups
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR pabsb
AVX_INSTR pabsd
AVX_INSTR pabsw
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pclmulqdq, 0, 1, 0
AVX_INSTR pcmpestri
AVX_INSTR pcmpestrm
AVX_INSTR pcmpistri
AVX_INSTR pcmpistrm
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR pextrb
AVX_INSTR pextrd
AVX_INSTR pextrq
AVX_INSTR pextrw
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phminposuw
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pinsrb, 0, 1, 0
AVX_INSTR pinsrd, 0, 1, 0
AVX_INSTR pinsrq, 0, 1, 0
AVX_INSTR pinsrw, 0, 1, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmovmskb
AVX_INSTR pmovsxbw
AVX_INSTR pmovsxbd
AVX_INSTR pmovsxbq
AVX_INSTR pmovsxwd
AVX_INSTR pmovsxwq
AVX_INSTR pmovsxdq
AVX_INSTR pmovzxbw
AVX_INSTR pmovzxbd
AVX_INSTR pmovzxbq
AVX_INSTR pmovzxwd
AVX_INSTR pmovzxwq
AVX_INSTR pmovzxdq
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR pshufd
AVX_INSTR pshufhw
AVX_INSTR pshuflw
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR ptest
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR rcpps, 1, 0, 0
AVX_INSTR rcpss, 1, 0, 0
AVX_INSTR roundpd
AVX_INSTR roundps
AVX_INSTR roundsd
AVX_INSTR roundss
AVX_INSTR rsqrtps, 1, 0, 0
AVX_INSTR rsqrtss, 1, 0, 0
AVX_INSTR shufpd, 1, 1, 0
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR sqrtpd, 1, 0, 0
AVX_INSTR sqrtps, 1, 0, 0
AVX_INSTR sqrtsd, 1, 0, 0
AVX_INSTR sqrtss, 1, 0, 0
AVX_INSTR stmxcsr
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR ucomisd
AVX_INSTR ucomiss
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
%assign i i+1
%endrep
%undef i
%undef j

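; Example: q1032 expands to the immediate for element order (1,0,3,2), so
; "pshufd m0, m1, q1032" swaps the high and low 64 bits of each 128-bit lane.
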
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %else
            %6 %1, %2, %3
            %7 %1, %4
        %endif
    %endmacro
%endmacro

FMA_INSTR  pmacsdd,  pmulld, paddd
FMA_INSTR  pmacsww,  pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd

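; Example (hypothetical registers): "pmacsdd m0, m1, m2, m3" emits a single
; vpmacsdd on XOP cpus, and "pmulld m0, m1, m2" + "paddd m0, m3" elsewhere.
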
; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf

; convert FMA4 to FMA3 if possible
%macro FMA4_INSTR 4
    %macro %1 4-8 %1, %2, %3, %4
        %if cpuflag(fma4)
            v%5 %1, %2, %3, %4
        %elifidn %1, %2
            v%6 %1, %4, %3 ; %1 = %1 * %3 + %4
        %elifidn %1, %3
            v%7 %1, %2, %4 ; %1 = %2 * %1 + %4
        %elifidn %1, %4
            v%8 %1, %2, %3 ; %1 = %2 * %3 + %1
        %else
            %error fma3 emulation of ``%5 %1, %2, %3, %4'' is not supported
        %endif
    %endmacro
%endmacro

FMA4_INSTR fmaddpd, fmadd132pd, fmadd213pd, fmadd231pd
FMA4_INSTR fmaddps, fmadd132ps, fmadd213ps, fmadd231ps
FMA4_INSTR fmaddsd, fmadd132sd, fmadd213sd, fmadd231sd
FMA4_INSTR fmaddss, fmadd132ss, fmadd213ss, fmadd231ss

FMA4_INSTR fmaddsubpd, fmaddsub132pd, fmaddsub213pd, fmaddsub231pd
FMA4_INSTR fmaddsubps, fmaddsub132ps, fmaddsub213ps, fmaddsub231ps
FMA4_INSTR fmsubaddpd, fmsubadd132pd, fmsubadd213pd, fmsubadd231pd
FMA4_INSTR fmsubaddps, fmsubadd132ps, fmsubadd213ps, fmsubadd231ps

FMA4_INSTR fmsubpd, fmsub132pd, fmsub213pd, fmsub231pd
FMA4_INSTR fmsubps, fmsub132ps, fmsub213ps, fmsub231ps
FMA4_INSTR fmsubsd, fmsub132sd, fmsub213sd, fmsub231sd
FMA4_INSTR fmsubss, fmsub132ss, fmsub213ss, fmsub231ss

FMA4_INSTR fnmaddpd, fnmadd132pd, fnmadd213pd, fnmadd231pd
FMA4_INSTR fnmaddps, fnmadd132ps, fnmadd213ps, fnmadd231ps
FMA4_INSTR fnmaddsd, fnmadd132sd, fnmadd213sd, fnmadd231sd
FMA4_INSTR fnmaddss, fnmadd132ss, fnmadd213ss, fnmadd231ss

FMA4_INSTR fnmsubpd, fnmsub132pd, fnmsub213pd, fnmsub231pd
FMA4_INSTR fnmsubps, fnmsub132ps, fnmsub213ps, fnmsub231ps
FMA4_INSTR fnmsubsd, fnmsub132sd, fnmsub213sd, fnmsub231sd
FMA4_INSTR fnmsubss, fnmsub132ss, fnmsub213ss, fnmsub231ss

; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug
%if ARCH_X86_64 == 0
%macro vpbroadcastq 2
    %if sizeof%1 == 16
        movddup %1, %2
    %else
        vbroadcastsd %1, %2
    %endif
%endmacro
%endif