;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Henrik Gramner <hengar-6@student.ltu.se>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .

%ifndef private_prefix
    %define private_prefix x264
%endif

%ifndef public_prefix
    %define public_prefix private_prefix
%endif

%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
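
; Usage sketch (added illustration, not from the original file; "pw_1" is a
; hypothetical constant):
;     SECTION_RODATA 16
;     pw_1: times 8 dw 1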

; aout does not support align=
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

%macro CPUNOP 1
    %if HAVE_CPUNOP
        CPU %1
    %endif
%endmacro

; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
CPUNOP amdnop

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x,
;      MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes),
;      and an extra register will be allocated to hold the original stack
;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
;      register as stack pointer, request a negative stack size.
; %4+/%5+ = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Use this instead of RET if it's a branch target.
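
; A fuller sketch (added illustration; "copy16" and its body are hypothetical,
; not part of this file):
;     INIT_XMM sse2
;     cglobal copy16, 2,2,1, dst, src
;         mova m0, [srcq]
;         mova [dstq], m0
;         RET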

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
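
; e.g. (added illustration) for argument 0:
;     mov   r0d, 42    ; dword view of argument 0's register
;     movzx r1d, r0w   ; word view of the same register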

%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [rstk + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
%if ARCH_X86_64 == 0
    %define r%1 e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
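
; e.g. (added illustration): "DECLARE_REG_TMP 2,0" makes t0 an alias for r2
; and t1 an alias for r0, so one temporary-register map can cover calling
; conventions that pass arguments in different registers.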

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset+gprsize
    %endif
%endmacro

%macro POP 1
    pop %1
    %ifidn rstk, rsp
        %assign stack_offset stack_offset-gprsize
    %endif
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rstk
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro
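
; e.g. (added illustration): after "DEFINE_ARGS dst, src, len", the names
; dstq/srcq/lenq (and dstd, dstw, ...) alias r0/r1/r2 and their sized forms.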

%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
    %ifnum %1
        %if %1 != 0
            %assign %%stack_alignment ((mmsize + 15) & ~15)
            %assign stack_size %1
            %if stack_size < 0
                %assign stack_size -stack_size
            %endif
            %if mmsize != 8
                %assign xmm_regs_used %2
            %endif
            %if mmsize <= 16 && HAVE_ALIGNED_STACK
                %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
                %if xmm_regs_used > 6
                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
                %endif
                SUB rsp, stack_size_padded
            %else
                %assign %%reg_num (regs_used - 1)
                %xdefine rstk r %+ %%reg_num
                ; align stack, and save original stack location directly above
                ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                ; stack in a single instruction (i.e. mov rsp, rstk or mov
                ; rsp, [rsp+stack_size_padded])
                mov rstk, rsp
                %assign stack_size_padded stack_size
                %if xmm_regs_used > 6
                    %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
                    %if mmsize == 32 && xmm_regs_used & 1
                        ; re-align to 32 bytes
                        %assign stack_size_padded (stack_size_padded + 16)
                    %endif
                %endif
                %if %1 < 0 ; need to store rsp on stack
                    sub rsp, gprsize+stack_size_padded
                    and rsp, ~(%%stack_alignment-1)
                    %xdefine rstkm [rsp+stack_size_padded]
                    mov rstkm, rstk
                %else ; can keep rsp in rstk during whole function
                    sub rsp, stack_size_padded
                    and rsp, ~(%%stack_alignment-1)
                    %xdefine rstkm rstk
                %endif
            %endif
            %if xmm_regs_used > 6
                WIN64_PUSH_XMM
            %endif
        %endif
    %endif
%endmacro

%macro SETUP_STACK_POINTER 1
    %ifnum %1
        %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32)
            %if %1 > 0
                %assign regs_used (regs_used + 1)
            %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
                %warning "Stack pointer will overwrite register argument"
            %endif
        %endif
    %endif
%endmacro
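
; e.g. (added illustration): "cglobal foo, 2,3,8, 64, dst, src, tmp" would
; additionally reserve 64 bytes of suitably aligned stack space.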

%macro DEFINE_ARGS_INTERNAL 3+
    %ifnum %2
        DEFINE_ARGS %3
    %elif %1 == 4
        DEFINE_ARGS %2
    %elif %1 > 4
        DEFINE_ARGS %2, %3
    %endif
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120

%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4, %3
    %if mmsize != 8 && stack_size == 0
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%macro WIN64_PUSH_XMM 0
    %assign %%i xmm_regs_used
    %rep (xmm_regs_used-6)
        %assign %%i %%i-1
        movaps [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
    %endrep
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        SUB rsp, (xmm_regs_used-6)*16+16
        WIN64_PUSH_XMM
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movaps xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
        %endrep
        %if stack_size_padded == 0
            add %1, (xmm_regs_used-6)*16+16
        %endif
    %endif
    %if stack_size_padded > 0
        %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
            mov rsp, rstkm
        %else
            add %1, stack_size_padded
        %endif
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %assign stack_offset (stack_offset-stack_size_padded)
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
%if mmsize == 32
    vzeroupper
%endif
    AUTO_REP_RET
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    ALLOC_STACK %4
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0

%macro RET 0
%if stack_size_padded > 0
%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
    mov rsp, rstkm
%else
    add rsp, stack_size_padded
%endif
%endif
    POP_IF_USED 14, 13, 12, 11, 10, 9
%if mmsize == 32
    vzeroupper
%endif
    AUTO_REP_RET
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [rstk + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    SETUP_STACK_POINTER %4
    ASSERT regs_used <= 7
    PUSH_IF_USED 3, 4, 5, 6
    ALLOC_STACK %4
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro

%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0

%macro RET 0
%if stack_size_padded > 0
%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
    mov rsp, rstkm
%else
    add rsp, stack_size_padded
%endif
%endif
    POP_IF_USED 6, 5, 4, 3
%if mmsize == 32
    vzeroupper
%endif
    AUTO_REP_RET
%endmacro

%endif ;======================================================================

%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%macro WIN64_PUSH_XMM 0
%endmacro
%endif

; On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
; a branch or a branch target. So switch to a 2-byte form of ret in that case.
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
    %if has_epilogue
        RET
    %else
        rep ret
    %endif
%endmacro

%define last_branch_adr $$
%macro AUTO_REP_RET 0
    %ifndef cpuflags
        times ((last_branch_adr-$)>>31)+1 rep ; times 1 iff $ == last_branch_adr, i.e. iff the ret directly follows a branch
    %elif notcpuflag(ssse3)
        times ((last_branch_adr-$)>>31)+1 rep
    %endif
    ret
%endmacro

%macro BRANCH_INSTR 0-*
    %rep %0
        %macro %1 1-2 %1
            %2 %1
            %%branch_instr:
            %xdefine last_branch_adr %%branch_instr
        %endmacro
        %rotate 1
    %endrep
%endmacro

BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, jna, jnae, jb, jbe, jnb, jnbe, jc, jnc, js, jns, jo, jno, jp, jnp

%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
%endmacro

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
; The "" empty default parameter is a workaround for nasm, which fails if SUFFIX
; is empty and we call cglobal_internal with just %1 %+ SUFFIX (without %2).
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 1, %1 %+ SUFFIX, %2
%endmacro
%macro cvisible 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal 0, %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 2-3+
    %if %1
        %xdefine %%FUNCTION_PREFIX private_prefix
        %xdefine %%VISIBILITY hidden
    %else
        %xdefine %%FUNCTION_PREFIX public_prefix
        %xdefine %%VISIBILITY
    %endif
    %ifndef cglobaled_%2
        %xdefine %2 mangle(%%FUNCTION_PREFIX %+ _ %+ %2)
        %xdefine %2.skip_prologue %2 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %2, 1
    %endif
    %xdefine current_function %2
    %ifidn __OUTPUT_FORMAT__,elf
        global %2:function %%VISIBILITY
    %else
        global %2
    %endif
    align function_align
    %2:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %xdefine rstk rsp
    %assign stack_offset 0
    %assign stack_size 0
    %assign stack_size_padded 0
    %assign xmm_regs_used 0
    %ifnidn %3, ""
        PROLOGUE %3
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(private_prefix %+ _ %+ %1)
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:data hidden
    %else
        global %1
    %endif
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx
%assign cpuflags_avx2     (1<<14)| cpuflags_avx
%assign cpuflags_fma3     (1<<15)| cpuflags_avx

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<22)
%assign cpuflags_bmi1     (1<<23)
%assign cpuflags_bmi2     (1<<24)|cpuflags_bmi1
%assign cpuflags_tbm      (1<<25)|cpuflags_bmi1

%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
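
; e.g. (added illustration) inside a function built for a given cpu:
;     %if cpuflag(ssse3)
;         pshufb m0, m1
;     %endif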

; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-2
    CPUNOP amdnop
    %if %0 >= 1
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if mmsize == 16 && notcpuflag(sse2)
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, sse3
            %define movu lddqu
        %endif
        %if notcpuflag(sse2)
            CPUNOP basicnop
        %endif
    %else
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %endif
%endmacro

; merge mmx and sse*

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nmm, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %undef movh
    %define movnta vmovntps
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM
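
; e.g. (added illustration): "INIT_XMM sse2" makes m0..m7 (m0..m15 on x86_64)
; alias the xmm registers, points mova/movu at movdqa/movdqu, and appends
; _sse2 to the names of functions subsequently declared with cglobal.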

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine tmp%2 m%2
    %xdefine ntmp%2 nm%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 tmp%2
    %xdefine nm%1 ntmp%2
    %undef tmp%2
    %undef ntmp%2
    %rotate 2
%endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
%ifdef m%1
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%n1 n %+ %1
    %xdefine %%n2 n %+ %2
    %xdefine tmp m %+ %%n1
    CAT_XDEFINE m, %%n1, m %+ %%n2
    CAT_XDEFINE m, %%n2, tmp
    CAT_XDEFINE n, m %+ %%n1, %%n1
    CAT_XDEFINE n, m %+ %%n2, %%n2
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro
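
; e.g. (added illustration): after "SWAP 0, 1", the name m0 refers to the
; register previously called m1 and vice versa; no instructions are emitted.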

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE n, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1 %+ SUFFIX, %1
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro
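
; Rationale (added note): +128 does not fit in a signed 8-bit immediate but
; -128 does, so the swapped form uses an imm8 instead of an imm32 encoding.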

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
%assign i i+1
%endrep
%undef i

%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %ifid %6
        %define %%sizeofreg sizeof%6
    %elifid %5
        %define %%sizeofreg sizeof%5
    %else
        %define %%sizeofreg mmsize
    %endif
    %if %%sizeofreg==32
        %if %4>=3
            v%1 %5, %6, %7
        %else
            v%1 %5, %6
        %endif
    %else
        %if %%sizeofreg==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if avx_enabled && %%sizeofreg==16
                    v%1 %5, %6, %7
                %else
                    CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %4>=3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
%macro RUN_AVX_INSTR1 8
    %assign %%swap 0
    %if avx_enabled
        %ifnid %6
            %assign %%swap 1
        %endif
    %elifnidn %5, %6
        %ifnid %7
            %assign %%swap 1
        %endif
    %endif
    %if %%swap && %3 == 0 && %8 == 1
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
    %else
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 4
    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
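
; e.g. (added illustration): once paddw has been wrapped by the list below,
; "paddw m0, m1, m2" emits vpaddw under AVX, and "movdqa m0, m1" followed by
; "paddw m0, m2" when the 3-operand form has to be emulated on SSE2.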

AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 1
AVX_INSTR addss, 1, 0, 1
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR blendpd, 1, 0, 0
AVX_INSTR blendps, 1, 0, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 1, 0
AVX_INSTR cmpps, 1, 1, 0
AVX_INSTR cmpsd, 1, 1, 0
AVX_INSTR cmpss, 1, 1, 0
AVX_INSTR cvtdq2ps, 1, 0, 0
AVX_INSTR cvtpd2dq, 1, 0, 0
AVX_INSTR cvtps2dq, 1, 0, 0
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 1
AVX_INSTR maxss, 1, 0, 1
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 1
AVX_INSTR minss, 1, 0, 1
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movss, 1, 0, 0
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR pabsb, 0, 0, 0
AVX_INSTR pabsw, 0, 0, 0
AVX_INSTR pabsd, 0, 0, 0
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pcmpestri, 0, 0, 0
AVX_INSTR pcmpestrm, 0, 0, 0
AVX_INSTR pcmpistri, 0, 0, 0
AVX_INSTR pcmpistrm, 0, 0, 0
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmovmskb, 0, 0, 0
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR pshufd, 0, 1, 0
AVX_INSTR pshufhw, 0, 1, 0
AVX_INSTR pshuflw, 0, 1, 0
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR ptest, 0, 0, 0
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
%assign i i+1
%endrep
%undef i
%undef j
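
; e.g. (added illustration): q3210 expands to 0xE4, the identity shuffle, and
; q0123 to 0x1B, so "pshufd m0, m1, q0123" reverses the order of the dwords.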

%macro FMA_INSTR 3
    %macro %1 5-8 %1, %2, %3
        %if cpuflag(xop) || cpuflag(fma4)
            v%6 %1, %2, %3, %4
        %else
            %ifidn %1, %4
                %7 %5, %2, %3
                %8 %1, %4, %5
            %else
                %7 %1, %2, %3
                %8 %1, %4
            %endif
        %endif
    %endmacro
%endmacro

FMA_INSTR fmaddps, mulps, addps
FMA_INSTR pmacsdd, pmulld, paddd
FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
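
; e.g. (added illustration): "fmaddps m0, m1, m2, m3, m4" computes
; m0 = m1*m2 + m3, as one vfmaddps under XOP/FMA4 and as mulps+addps
; otherwise; m4 is scratch for the case where the destination is also m3.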

; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf