x86inc: create xm# and ym#, analogous to m#
libavutil/x86/x86inc.asm
;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Henrik Gramner <hengar-6@student.ltu.se>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org.

%ifndef private_prefix
    %define private_prefix x264
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif

%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

; aout does not support align=
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

%macro CPUNOP 1
    %if HAVE_CPUNOP
        CPU %1
    %endif
%endmacro

; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
CPUNOP amdnop

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x,
;      MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes),
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

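; Example (illustrative): after "cglobal foo, 2,3", r0 is the register
; holding argument 0 (rdi on linux64, rcx on win64, eax on x86_32), r0d is
; its dword part, and r0m is the original location of argument 0 (a
; register or a stack slot, depending on the calling convention).
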
%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

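; Example (illustrative): "DECLARE_REG_TMP 4,1" makes t0 an alias for r4 and
; t1 an alias for r1, so a shared implementation can be written in terms of
; t# while each arch or code path picks its own underlying registers.
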
%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

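; Example (illustrative): "DEFINE_ARGS dst, src, len" makes dstq/dstd/dstm
; etc. aliases for r0q/r0d/r0m, srcq an alias for r1q, and so on, letting
; the body say "mov dstd, srcd" instead of juggling numbered registers.
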
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
    %ifnum %1
        %if %1 != 0
            %assign %%stack_alignment ((mmsize + 15) & ~15)
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %if mmsize != 8
                %assign xmm_regs_used %2
            %endif
            %if mmsize <= 16 && HAVE_ALIGNED_STACK
                %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
                %if xmm_regs_used > 6
                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
                %endif
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                mov rstk, rsp
                %assign stack_size_padded stack_size
                %if xmm_regs_used > 6
                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
                    %if mmsize == 32 && xmm_regs_used & 1
                        ; re-align to 32 bytes
                        %assign stack_size_padded (stack_size_padded + 16)
                    %endif
                %endif
                %if %1 < 0 ; need to store rsp on stack
                    sub rsp, gprsize+stack_size_padded
                    and rsp, ~(%%stack_alignment-1)
                    %xdefine rstkm [rsp+stack_size_padded]
                    mov rstkm, rstk
                %else ; can keep rsp in rstk during whole function
                    sub rsp, stack_size_padded
                    and rsp, ~(%%stack_alignment-1)
                    %xdefine rstkm rstk
                %endif
            %endif
            %if xmm_regs_used > 6
                WIN64_PUSH_XMM
            %endif
        %endif
    %endif
%endmacro

%macro SETUP_STACK_POINTER 1
    %ifnum %1
        %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32)
            %if %1 > 0
                %assign regs_used (regs_used + 1)
            %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
                %warning "Stack pointer will overwrite register argument"
            %endif
        %endif
    %endif
%endmacro

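; Example (illustrative): "cglobal foo, 2,3,8, -16, dst, src" requests 16
; bytes of manually-aligned stack space; the negative size asks PROLOGUE not
; to tie up an extra register holding the original stack pointer, saving it
; on the stack instead.
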
%macro DEFINE_ARGS_INTERNAL 3+
    %ifnum %2
        DEFINE_ARGS %3
    %elif %1 == 4
        DEFINE_ARGS %2
    %elif %1 > 4
        DEFINE_ARGS %2, %3
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%macro WIN64_PUSH_XMM 0
    %assign %%i xmm_regs_used
    %rep (xmm_regs_used-6)
        %assign %%i %%i-1
        movaps [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
    %endrep
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        SUB rsp, (xmm_regs_used-6)*16+16
        WIN64_PUSH_XMM
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movaps xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
        %endrep
        %if stack_size_padded == 0
            add %1, (xmm_regs_used-6)*16+16
        %endif
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
            mov rsp, rstkm
        %else
            add %1, stack_size_padded
        %endif
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %assign stack_offset (stack_offset-stack_size_padded)
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if mmsize == 32
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0

%macro RET 0
    %if stack_size_padded > 0
        %if mmsize == 32 || HAVE_ALIGNED_STACK == 0
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if mmsize == 32
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0

%macro RET 0
    %if stack_size_padded > 0
        %if mmsize == 32 || HAVE_ALIGNED_STACK == 0
            mov rsp, rstkm
        %else
            add rsp, stack_size_padded
        %endif
    %endif
    POP_IF_USED 6, 5, 4, 3
    %if mmsize == 32
        vzeroupper
    %endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%macro WIN64_PUSH_XMM 0
%endmacro
%endif

; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
    %if has_epilogue
        RET
    %else
        rep ret
    %endif
%endmacro

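; Example (illustrative): use REP_RET when the return is itself a branch
; target, which AUTO_REP_RET cannot detect:
;     test r2d, r2d
;     jz .end
;     ...
; .end:
;     REP_RET
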
%define last_branch_adr $$
%macro AUTO_REP_RET 0
    %ifndef cpuflags
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ != last_branch_adr.
    %elif notcpuflag(ssse3)
        times ((last_branch_adr-$)>>31)+1 rep
    %endif
    ret
%endmacro

%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %%branch_instr:
            %xdefine last_branch_adr %%branch_instr
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
%endmacro

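; Example (illustrative): "TAIL_CALL some_helper, 1" (hypothetical callee)
; becomes a bare "jmp some_helper" when this function needs no epilogue,
; and "call some_helper" followed by RET when it does.
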
;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
    %if %1
        %xdefine %%FUNCTION_PREFIX private_prefix
        %xdefine %%VISIBILITY hidden
    %else
        %xdefine %%FUNCTION_PREFIX public_prefix
        %xdefine %%VISIBILITY
    %endif
    %ifndef cglobaled_%2
        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %ifidn __OUTPUT_FORMAT__,elf
        global %2:function %%VISIBILITY
    %else
        global %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %xdefine rstk rsp
    %assign stack_offset 0
    %assign stack_size 0
    %assign stack_size_padded 0
    %assign xmm_regs_used 0
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro
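
; Example (illustrative): after "INIT_XMM sse2", "cglobal foo, 2,2" emits a
; global symbol x264_foo_sse2 (with the default private_prefix), and a later
; "call foo" in this file resolves to the suffixed, mangled name
; automatically.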

%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:data hidden
    %else
        global %1
    %endif
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx
%assign cpuflags_avx2     (1<<14)| cpuflags_avx
%assign cpuflags_fma3     (1<<15)| cpuflags_avx

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<22)
%assign cpuflags_bmi1     (1<<23)
%assign cpuflags_bmi2     (1<<24)|cpuflags_bmi1
%assign cpuflags_tbm      (1<<25)|cpuflags_bmi1

%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))

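; Example (illustrative): the flags are cumulative, so after "INIT_XMM avx"
; a guard like "%if cpuflag(sse2)" is also true; one test covers every
; instruction set implied by the selected one.
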
; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly; it's a subroutine for INIT_MMX & co.
%macro INIT_CPUFLAGS 0-2
    CPUNOP amdnop
    %if %0 >= 1
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if mmsize == 16 && notcpuflag(sse2)
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, sse3
            %define movu lddqu
        %endif
        %if notcpuflag(sse2)
            CPUNOP basicnop
        %endif
    %else
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %endif
%endmacro

; Merge mmx and sse*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register (if the selected size is xmm or ymm), or mm register (if mmx)
; ym# is the corresponding ymm register (if the selected size is xmm or ymm), or mm register (if mmx)
; (All 3 remain in sync through SWAP.)

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nmm, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %undef movh
    %define movnta vmovntps
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM

%macro DECLARE_MMCAST 1
    %define  mmmm%1   mm%1
    %define  mmxmm%1  mm%1
    %define  mmymm%1  mm%1
    %define xmmmm%1   mm%1
    %define xmmxmm%1 xmm%1
    %define xmmymm%1 xmm%1
    %define ymmmm%1   mm%1
    %define ymmxmm%1 ymm%1
    %define ymmymm%1 ymm%1
    %define xm%1 xmm %+ m%1
    %define ym%1 ymm %+ m%1
%endmacro

%assign i 0
%rep 16
    DECLARE_MMCAST i
    %assign i i+1
%endrep

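; Example (illustrative): after "INIT_YMM avx2", m0 is ymm0 while xm0 is
; xmm0, so a function can do full-width math on m# and then use xm# for a
; 128-bit operation on the low half without renaming registers by hand.
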
; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine %%tmp%2 m%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 %%tmp%2
    CAT_XDEFINE n, m%1, %1
    %rotate 2
%endrep
%endmacro

%macro SWAP 2+ ; swaps a single chain (sometimes more concise than pairs)
%ifnum %1 ; SWAP 0, 1, ...
    SWAP_INTERNAL_NUM %1, %2
%else ; SWAP m0, m1, ...
    SWAP_INTERNAL_NAME %1, %2
%endif
%endmacro

%macro SWAP_INTERNAL_NUM 2-*
    %rep %0-1
        %xdefine %%tmp m%1
        %xdefine m%1 m%2
        %xdefine m%2 %%tmp
        CAT_XDEFINE n, m%1, %1
        CAT_XDEFINE n, m%2, %2
        %rotate 1
    %endrep
%endmacro

%macro SWAP_INTERNAL_NAME 2-*
    %xdefine %%args n %+ %1
    %rep %0-1
        %xdefine %%args %%args, n %+ %2
        %rotate 1
    %endrep
    SWAP_INTERNAL_NUM %%args
%endmacro

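; Example (illustrative): "SWAP 0, 1" exchanges which physical registers the
; names m0 and m1 refer to; subsequent code that writes "m0" assembles to
; the register previously called m1, at zero runtime cost.
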
; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE n, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1 %+ SUFFIX, %1
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro
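
; (The 128 case is worthwhile because -128 fits in a sign-extended imm8
; while +128 needs a full imm32, so the swapped form encodes smaller.)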

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    %assign i i+1
%endrep
%undef i

%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %ifid %6
        %define %%sizeofreg sizeof%6
    %elifid %5
        %define %%sizeofreg sizeof%5
    %else
        %define %%sizeofreg mmsize
    %endif
    %if %%sizeofreg==32
        %if %4>=3
            v%1 %5, %6, %7
        %else
            v%1 %5, %6
        %endif
    %else
        %if %%sizeofreg==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if avx_enabled && %%sizeofreg==16
                    v%1 %5, %6, %7
                %else
                    CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %4>=3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
%macro RUN_AVX_INSTR1 8
    %assign %%swap 0
    %if avx_enabled
        %ifnid %6
            %assign %%swap 1
        %endif
    %elifnidn %5, %6
        %ifnid %7
            %assign %%swap 1
        %endif
    %endif
    %if %%swap && %3 == 0 && %8 == 1
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
    %else
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 4
    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
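
; Example (illustrative): with "AVX_INSTR addps, 1, 0, 1" in effect and
; INIT_XMM, "addps m0, m1, m2" assembles to "vaddps xmm0, xmm1, xmm2" when
; AVX is enabled, and to "movaps xmm0, xmm1" + "addps xmm0, xmm2" otherwise.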
1129
2f7f2e4b
LM
1130AVX_INSTR addpd, 1, 0, 1
1131AVX_INSTR addps, 1, 0, 1
1132AVX_INSTR addsd, 1, 0, 1
1133AVX_INSTR addss, 1, 0, 1
1134AVX_INSTR addsubpd, 1, 0, 0
1135AVX_INSTR addsubps, 1, 0, 0
1136AVX_INSTR andpd, 1, 0, 1
1137AVX_INSTR andps, 1, 0, 1
1138AVX_INSTR andnpd, 1, 0, 0
1139AVX_INSTR andnps, 1, 0, 0
1140AVX_INSTR blendpd, 1, 0, 0
1141AVX_INSTR blendps, 1, 0, 0
1142AVX_INSTR blendvpd, 1, 0, 0
1143AVX_INSTR blendvps, 1, 0, 0
2e81acc6
CG
1144AVX_INSTR cmppd, 1, 1, 0
1145AVX_INSTR cmpps, 1, 1, 0
1146AVX_INSTR cmpsd, 1, 1, 0
1147AVX_INSTR cmpss, 1, 1, 0
705f3d47 1148AVX_INSTR cvtdq2ps, 1, 0, 0
b30a3633 1149AVX_INSTR cvtpd2dq, 1, 0, 0
705f3d47 1150AVX_INSTR cvtps2dq, 1, 0, 0
2f7f2e4b
LM
1151AVX_INSTR divpd, 1, 0, 0
1152AVX_INSTR divps, 1, 0, 0
1153AVX_INSTR divsd, 1, 0, 0
1154AVX_INSTR divss, 1, 0, 0
1155AVX_INSTR dppd, 1, 1, 0
1156AVX_INSTR dpps, 1, 1, 0
1157AVX_INSTR haddpd, 1, 0, 0
1158AVX_INSTR haddps, 1, 0, 0
1159AVX_INSTR hsubpd, 1, 0, 0
1160AVX_INSTR hsubps, 1, 0, 0
1161AVX_INSTR maxpd, 1, 0, 1
1162AVX_INSTR maxps, 1, 0, 1
1163AVX_INSTR maxsd, 1, 0, 1
1164AVX_INSTR maxss, 1, 0, 1
1165AVX_INSTR minpd, 1, 0, 1
1166AVX_INSTR minps, 1, 0, 1
1167AVX_INSTR minsd, 1, 0, 1
1168AVX_INSTR minss, 1, 0, 1
39df0c43
VS
1169AVX_INSTR movhlps, 1, 0, 0
1170AVX_INSTR movlhps, 1, 0, 0
2f7f2e4b
LM
1171AVX_INSTR movsd, 1, 0, 0
1172AVX_INSTR movss, 1, 0, 0
1173AVX_INSTR mpsadbw, 0, 1, 0
1174AVX_INSTR mulpd, 1, 0, 1
1175AVX_INSTR mulps, 1, 0, 1
1176AVX_INSTR mulsd, 1, 0, 1
1177AVX_INSTR mulss, 1, 0, 1
1178AVX_INSTR orpd, 1, 0, 1
1179AVX_INSTR orps, 1, 0, 1
96c9cc10
RB
1180AVX_INSTR pabsb, 0, 0, 0
1181AVX_INSTR pabsw, 0, 0, 0
1182AVX_INSTR pabsd, 0, 0, 0
2f7f2e4b
LM
1183AVX_INSTR packsswb, 0, 0, 0
1184AVX_INSTR packssdw, 0, 0, 0
1185AVX_INSTR packuswb, 0, 0, 0
1186AVX_INSTR packusdw, 0, 0, 0
1187AVX_INSTR paddb, 0, 0, 1
1188AVX_INSTR paddw, 0, 0, 1
1189AVX_INSTR paddd, 0, 0, 1
1190AVX_INSTR paddq, 0, 0, 1
1191AVX_INSTR paddsb, 0, 0, 1
1192AVX_INSTR paddsw, 0, 0, 1
1193AVX_INSTR paddusb, 0, 0, 1
1194AVX_INSTR paddusw, 0, 0, 1
1195AVX_INSTR palignr, 0, 1, 0
1196AVX_INSTR pand, 0, 0, 1
1197AVX_INSTR pandn, 0, 0, 0
1198AVX_INSTR pavgb, 0, 0, 1
1199AVX_INSTR pavgw, 0, 0, 1
1200AVX_INSTR pblendvb, 0, 0, 0
1201AVX_INSTR pblendw, 0, 1, 0
1202AVX_INSTR pcmpestri, 0, 0, 0
1203AVX_INSTR pcmpestrm, 0, 0, 0
1204AVX_INSTR pcmpistri, 0, 0, 0
1205AVX_INSTR pcmpistrm, 0, 0, 0
1206AVX_INSTR pcmpeqb, 0, 0, 1
1207AVX_INSTR pcmpeqw, 0, 0, 1
1208AVX_INSTR pcmpeqd, 0, 0, 1
1209AVX_INSTR pcmpeqq, 0, 0, 1
1210AVX_INSTR pcmpgtb, 0, 0, 0
1211AVX_INSTR pcmpgtw, 0, 0, 0
1212AVX_INSTR pcmpgtd, 0, 0, 0
1213AVX_INSTR pcmpgtq, 0, 0, 0
1214AVX_INSTR phaddw, 0, 0, 0
1215AVX_INSTR phaddd, 0, 0, 0
1216AVX_INSTR phaddsw, 0, 0, 0
1217AVX_INSTR phsubw, 0, 0, 0
1218AVX_INSTR phsubd, 0, 0, 0
1219AVX_INSTR phsubsw, 0, 0, 0
1220AVX_INSTR pmaddwd, 0, 0, 1
1221AVX_INSTR pmaddubsw, 0, 0, 0
1222AVX_INSTR pmaxsb, 0, 0, 1
1223AVX_INSTR pmaxsw, 0, 0, 1
1224AVX_INSTR pmaxsd, 0, 0, 1
1225AVX_INSTR pmaxub, 0, 0, 1
1226AVX_INSTR pmaxuw, 0, 0, 1
1227AVX_INSTR pmaxud, 0, 0, 1
1228AVX_INSTR pminsb, 0, 0, 1
1229AVX_INSTR pminsw, 0, 0, 1
1230AVX_INSTR pminsd, 0, 0, 1
1231AVX_INSTR pminub, 0, 0, 1
1232AVX_INSTR pminuw, 0, 0, 1
1233AVX_INSTR pminud, 0, 0, 1
96c9cc10 1234AVX_INSTR pmovmskb, 0, 0, 0
2f7f2e4b
LM
1235AVX_INSTR pmulhuw, 0, 0, 1
1236AVX_INSTR pmulhrsw, 0, 0, 1
1237AVX_INSTR pmulhw, 0, 0, 1
1238AVX_INSTR pmullw, 0, 0, 1
1239AVX_INSTR pmulld, 0, 0, 1
1240AVX_INSTR pmuludq, 0, 0, 1
1241AVX_INSTR pmuldq, 0, 0, 1
1242AVX_INSTR por, 0, 0, 1
1243AVX_INSTR psadbw, 0, 0, 1
1244AVX_INSTR pshufb, 0, 0, 0
96c9cc10
RB
1245AVX_INSTR pshufd, 0, 1, 0
1246AVX_INSTR pshufhw, 0, 1, 0
1247AVX_INSTR pshuflw, 0, 1, 0
2f7f2e4b
LM
1248AVX_INSTR psignb, 0, 0, 0
1249AVX_INSTR psignw, 0, 0, 0
1250AVX_INSTR psignd, 0, 0, 0
1251AVX_INSTR psllw, 0, 0, 0
1252AVX_INSTR pslld, 0, 0, 0
1253AVX_INSTR psllq, 0, 0, 0
1254AVX_INSTR pslldq, 0, 0, 0
1255AVX_INSTR psraw, 0, 0, 0
1256AVX_INSTR psrad, 0, 0, 0
1257AVX_INSTR psrlw, 0, 0, 0
1258AVX_INSTR psrld, 0, 0, 0
1259AVX_INSTR psrlq, 0, 0, 0
1260AVX_INSTR psrldq, 0, 0, 0
1261AVX_INSTR psubb, 0, 0, 0
1262AVX_INSTR psubw, 0, 0, 0
1263AVX_INSTR psubd, 0, 0, 0
1264AVX_INSTR psubq, 0, 0, 0
1265AVX_INSTR psubsb, 0, 0, 0
1266AVX_INSTR psubsw, 0, 0, 0
1267AVX_INSTR psubusb, 0, 0, 0
1268AVX_INSTR psubusw, 0, 0, 0
96c9cc10 1269AVX_INSTR ptest, 0, 0, 0
2f7f2e4b
LM
1270AVX_INSTR punpckhbw, 0, 0, 0
1271AVX_INSTR punpckhwd, 0, 0, 0
1272AVX_INSTR punpckhdq, 0, 0, 0
1273AVX_INSTR punpckhqdq, 0, 0, 0
1274AVX_INSTR punpcklbw, 0, 0, 0
1275AVX_INSTR punpcklwd, 0, 0, 0
1276AVX_INSTR punpckldq, 0, 0, 0
1277AVX_INSTR punpcklqdq, 0, 0, 0
1278AVX_INSTR pxor, 0, 0, 1
6b6ee582 1279AVX_INSTR shufps, 1, 1, 0
2f7f2e4b
LM
1280AVX_INSTR subpd, 1, 0, 0
1281AVX_INSTR subps, 1, 0, 0
1282AVX_INSTR subsd, 1, 0, 0
1283AVX_INSTR subss, 1, 0, 0
1284AVX_INSTR unpckhpd, 1, 0, 0
1285AVX_INSTR unpckhps, 1, 0, 0
1286AVX_INSTR unpcklpd, 1, 0, 0
1287AVX_INSTR unpcklps, 1, 0, 0
1288AVX_INSTR xorpd, 1, 0, 1
1289AVX_INSTR xorps, 1, 0, 1
33cbfa6f
VS
1290
1291; 3DNow instructions, for sharing code between AVX, SSE and 3DN
2f7f2e4b
LM
1292AVX_INSTR pfadd, 1, 0, 1
1293AVX_INSTR pfsub, 1, 0, 0
1294AVX_INSTR pfmul, 1, 0, 1
1295
; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j

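; Example (illustrative): q3120 expands to the immediate whose base-4 digits
; are 3,1,2,0, i.e. (3<<6)|(1<<4)|(2<<2)|0 = 0xD8, so "pshufd m0, m1, q3120"
; documents the lane order directly in the operand.
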
%macro FMA_INSTR 3
    %macro %1 5-8 %1, %2, %3
        %if cpuflag(xop) || cpuflag(fma4)
            v%6 %1, %2, %3, %4
        %else
            %ifidn %1, %4
                %7 %5, %2, %3
                %8 %1, %4, %5
            %else
                %7 %1, %2, %3
                %8 %1, %4
            %endif
        %endif
    %endmacro
%endmacro

FMA_INSTR  fmaddps,   mulps, addps
FMA_INSTR  pmacsdd,  pmulld, paddd
FMA_INSTR  pmacsww,  pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
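
; Example (illustrative): "fmaddps m0, m1, m2, m3, m4" computes
; m0 = m1*m2 + m3 as a single vfmaddps on XOP/FMA4 cpus; otherwise it is
; emulated with mulps and addps, using m4 as scratch when m0 aliases m3.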

; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf