x86inc: Rename "program_name" to "private_prefix"
[libav.git] / libavutil / x86 / x86inc.asm
bafad220 1;*****************************************************************************
2f7f2e4b 2;* x86inc.asm: x264asm abstraction layer
bafad220 3;*****************************************************************************
729f90e2 4;* Copyright (C) 2005-2012 x264 project
bafad220 5;*
2966cc18
JGG
6;* Authors: Loren Merritt <lorenm@u.washington.edu>
7;* Anton Mitrofanov <BugMaster@narod.ru>
33cbfa6f 8;* Jason Garrett-Glaser <darkshikari@gmail.com>
729f90e2 9;* Henrik Gramner <hengar-6@student.ltu.se>
bafad220 10;*
2966cc18
JGG
11;* Permission to use, copy, modify, and/or distribute this software for any
12;* purpose with or without fee is hereby granted, provided that the above
13;* copyright notice and this permission notice appear in all copies.
bafad220 14;*
2966cc18
JGG
15;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
16;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
17;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
18;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
21;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
bafad220
LM
22;*****************************************************************************
23
2966cc18
JGG
24; This is a header file for the x264ASM assembly language, which uses
25; NASM/YASM syntax combined with a large number of macros to provide easy
26; abstraction between different calling conventions (x86_32, win64, linux64).
27; It also has various other useful features to simplify writing the kind of
28; DSP functions that are most often used in x264.
29
30; Unlike the rest of x264, this file is available under an ISC license, as it
31; has significant usefulness outside of x264 and we want it to be available
32; to the largest audience possible. Of course, if you modify it for your own
33; purposes to add a new feature, we strongly encourage contributing a patch
34; as this feature might be useful for others as well. Send patches or ideas
35; to x264-devel@videolan.org .
36
ef5d41a5
DB
37%ifndef private_prefix
38 %define private_prefix x264
012f73e2 39%endif
2966cc18 40
3b15a6d7 41%define WIN64 0
96c9cc10 42%define UNIX64 0
3b15a6d7 43%if ARCH_X86_64
3f87f39c 44 %ifidn __OUTPUT_FORMAT__,win32
3b15a6d7 45 %define WIN64 1
166f3993
HY
46 %elifidn __OUTPUT_FORMAT__,win64
47 %define WIN64 1
3f87f39c 48 %else
3b15a6d7 49 %define UNIX64 1
3f87f39c
JA
50 %endif
51%endif
52
2966cc18
JGG
53%ifdef PREFIX
54 %define mangle(x) _ %+ x
55%else
56 %define mangle(x) x
57%endif
58
bafad220
LM
59; Name of the .rodata section.
60; Kludge: Something on OS X fails to align .rodata even given an align attribute,
61; so use a different read-only section.
3f87f39c 62%macro SECTION_RODATA 0-1 16
bafad220 63 %ifidn __OUTPUT_FORMAT__,macho64
3f87f39c 64 SECTION .text align=%1
bafad220 65 %elifidn __OUTPUT_FORMAT__,macho
3f87f39c 66 SECTION .text align=%1
bafad220 67 fakegot:
d69f9a42
DY
68 %elifidn __OUTPUT_FORMAT__,aout
69 section .text
bafad220 70 %else
3f87f39c 71 SECTION .rodata align=%1
bafad220
LM
72 %endif
73%endmacro
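; A minimal usage sketch (illustrative only; the constant name below is made up):
;   SECTION_RODATA 32
;   pw_1: times 16 dw 1   ; 32-byte aligned, so it can be loaded with mova even under INIT_YMM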
74
d69f9a42
DY
75; aout does not support align=
76%macro SECTION_TEXT 0-1 16
77 %ifidn __OUTPUT_FORMAT__,aout
78 SECTION .text
79 %else
80 SECTION .text align=%1
81 %endif
82%endmacro
83
3b15a6d7 84%if WIN64
3f87f39c 85 %define PIC
412b248e 86%elif ARCH_X86_64 == 0
2966cc18
JGG
87; x86_32 doesn't require PIC.
88; Some distros prefer shared objects to be PIC, but nothing breaks if
89; the code contains a few textrels, so we'll skip that complexity.
3f87f39c
JA
90 %undef PIC
91%endif
92%ifdef PIC
2966cc18 93 default rel
bafad220
LM
94%endif
95
180d43bc
MR
96%macro CPUNOP 1
97 %if HAVE_CPUNOP
98 CPU %1
99 %endif
100%endmacro
101
729f90e2 102; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
180d43bc 103CPUNOP amdnop
729f90e2 104
bafad220
LM
105; Macros to eliminate most code duplication between x86_32 and x86_64:
106; Currently this works only for leaf functions which load all their arguments
107; into registers at the start, and make no other use of the stack. Luckily that
108; covers most of x264's asm.
109
110; PROLOGUE:
111; %1 = number of arguments. loads them from stack if needed.
3f87f39c
JA
112; %2 = number of registers used. pushes callee-saved regs if needed.
113; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
6f40e9f0
RB
 114; %4 = (optional) stack size to be allocated. If the stack is not known to be
 115;      aligned (x86-32 with ICC 10.x or MSVC, or when YMM regs are used), it is
 116;      aligned manually (to 16 or 32 bytes), and an extra register is allocated to
 117;      hold the original stack pointer (so that r0m etc. stay valid). To prevent
 118;      the use of an extra register as stack pointer, request a negative stack size.
119; %4+/%5+ = list of names to define to registers
bafad220
LM
120; PROLOGUE can also be invoked by adding the same options to cglobal
121
122; e.g.
29e4edbb 123; cglobal foo, 2,3,0, dst, src, tmp
3f87f39c 124; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
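; A slightly fuller sketch (illustrative only; the function, its argument names and
; the sse2 target are made up, and the loop assumes len is a positive multiple of
; mmsize with 16-byte-aligned buffers):
;   INIT_XMM sse2
;   cglobal add_u8, 3,4,2, dst, src, len, off
;       xor    offq, offq
;   .loop:
;       mova   m0, [srcq+offq]
;       paddb  m0, [dstq+offq]
;       mova   [dstq+offq], m0
;       add    offq, mmsize
;       sub    lend, mmsize
;       jg     .loop
;       RET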
bafad220
LM
125
126; TODO Some functions can use some args directly from the stack. If they're the
127; last args then you can just not declare them, but if they're in the middle
 128; we need a more flexible macro.
129
130; RET:
2f7f2e4b 131; Pops anything that was pushed by PROLOGUE, and returns.
bafad220
LM
132
133; REP_RET:
134; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
135; which are slow when a normal ret follows a branch.
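; e.g. (illustrative) the common loop-tail pattern:
;   .loop:
;       sub   r2d, mmsize
;       jg    .loop
;       REP_RET            ; the return directly follows a branch, so prefer REP_RET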
136
3f87f39c
JA
137; registers:
138; rN and rNq are the native-size register holding function argument N
139; rNd, rNw, rNb are dword, word, and byte size
96c9cc10 140; rNh is the high 8 bits of the word size
3f87f39c
JA
141; rNm is the original location of arg N (a register or on the stack), dword
142; rNmp is native size
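; e.g. (illustrative, inside a function declared with cglobal and at least 3 args):
;   mov   r0d, r1d     ; 32-bit views of args 0 and 1
;   movzx r2d, r1b     ; low byte of arg 1
;   mov   r2, r2mp     ; reload arg 2 from its original home (register or stack slot)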
143
96c9cc10 144%macro DECLARE_REG 2-3
bafad220 145 %define r%1q %2
96c9cc10
RB
146 %define r%1d %2d
147 %define r%1w %2w
148 %define r%1b %2b
149 %define r%1h %2h
7a1944b9 150 %define %2q %2
96c9cc10
RB
151 %if %0 == 2
152 %define r%1m %2d
3f87f39c 153 %define r%1mp %2
3b15a6d7 154 %elif ARCH_X86_64 ; memory
6f40e9f0 155 %define r%1m [rstk + stack_offset + %3]
0995ad8d 156 %define r%1mp qword r %+ %1 %+ m
3f87f39c 157 %else
6f40e9f0 158 %define r%1m [rstk + stack_offset + %3]
0995ad8d 159 %define r%1mp dword r %+ %1 %+ m
3f87f39c 160 %endif
bafad220
LM
161 %define r%1 %2
162%endmacro
163
96c9cc10 164%macro DECLARE_REG_SIZE 3
bafad220
LM
165 %define r%1q r%1
166 %define e%1q r%1
167 %define r%1d e%1
168 %define e%1d e%1
169 %define r%1w %1
170 %define e%1w %1
96c9cc10
RB
171 %define r%1h %3
172 %define e%1h %3
bafad220
LM
173 %define r%1b %2
174 %define e%1b %2
3b15a6d7 175%if ARCH_X86_64 == 0
bafad220
LM
176 %define r%1 e%1
177%endif
178%endmacro
179
96c9cc10
RB
180DECLARE_REG_SIZE ax, al, ah
181DECLARE_REG_SIZE bx, bl, bh
182DECLARE_REG_SIZE cx, cl, ch
183DECLARE_REG_SIZE dx, dl, dh
184DECLARE_REG_SIZE si, sil, null
185DECLARE_REG_SIZE di, dil, null
186DECLARE_REG_SIZE bp, bpl, null
bafad220 187
3f87f39c
JA
188; t# defines for when per-arch register allocation is more complex than just function arguments
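; e.g. (illustrative): DECLARE_REG_TMP 2,1,0 makes t0=r2, t1=r1 and t2=r0, so shared
; code can refer to t# while the mapping to real registers differs per architecture
; or code path.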
189
190%macro DECLARE_REG_TMP 1-*
191 %assign %%i 0
192 %rep %0
193 CAT_XDEFINE t, %%i, r%1
194 %assign %%i %%i+1
195 %rotate 1
196 %endrep
197%endmacro
198
199%macro DECLARE_REG_TMP_SIZE 0-*
200 %rep %0
201 %define t%1q t%1 %+ q
202 %define t%1d t%1 %+ d
203 %define t%1w t%1 %+ w
96c9cc10 204 %define t%1h t%1 %+ h
3f87f39c
JA
205 %define t%1b t%1 %+ b
206 %rotate 1
207 %endrep
208%endmacro
209
729f90e2 210DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
3f87f39c 211
3b15a6d7 212%if ARCH_X86_64
bafad220
LM
213 %define gprsize 8
214%else
215 %define gprsize 4
216%endif
217
218%macro PUSH 1
219 push %1
6f40e9f0
RB
220 %ifidn rstk, rsp
221 %assign stack_offset stack_offset+gprsize
222 %endif
bafad220
LM
223%endmacro
224
225%macro POP 1
226 pop %1
6f40e9f0
RB
227 %ifidn rstk, rsp
228 %assign stack_offset stack_offset-gprsize
229 %endif
bafad220
LM
230%endmacro
231
729f90e2
HG
232%macro PUSH_IF_USED 1-*
233 %rep %0
234 %if %1 < regs_used
235 PUSH r%1
236 %endif
237 %rotate 1
238 %endrep
239%endmacro
240
241%macro POP_IF_USED 1-*
242 %rep %0
243 %if %1 < regs_used
244 pop r%1
245 %endif
246 %rotate 1
247 %endrep
248%endmacro
249
250%macro LOAD_IF_USED 1-*
251 %rep %0
252 %if %1 < num_args
253 mov r%1, r %+ %1 %+ mp
254 %endif
255 %rotate 1
256 %endrep
257%endmacro
258
bafad220
LM
259%macro SUB 2
260 sub %1, %2
6f40e9f0 261 %ifidn %1, rstk
bafad220
LM
262 %assign stack_offset stack_offset+(%2)
263 %endif
264%endmacro
265
266%macro ADD 2
267 add %1, %2
6f40e9f0 268 %ifidn %1, rstk
bafad220
LM
269 %assign stack_offset stack_offset-(%2)
270 %endif
271%endmacro
272
273%macro movifnidn 2
274 %ifnidn %1, %2
275 mov %1, %2
276 %endif
277%endmacro
278
279%macro movsxdifnidn 2
280 %ifnidn %1, %2
281 movsxd %1, %2
282 %endif
283%endmacro
284
285%macro ASSERT 1
286 %if (%1) == 0
287 %error assert failed
288 %endif
289%endmacro
290
291%macro DEFINE_ARGS 0-*
292 %ifdef n_arg_names
293 %assign %%i 0
294 %rep n_arg_names
295 CAT_UNDEF arg_name %+ %%i, q
296 CAT_UNDEF arg_name %+ %%i, d
297 CAT_UNDEF arg_name %+ %%i, w
96c9cc10 298 CAT_UNDEF arg_name %+ %%i, h
bafad220 299 CAT_UNDEF arg_name %+ %%i, b
2f77923d 300 CAT_UNDEF arg_name %+ %%i, m
98b9da2a 301 CAT_UNDEF arg_name %+ %%i, mp
bafad220
LM
302 CAT_UNDEF arg_name, %%i
303 %assign %%i %%i+1
304 %endrep
305 %endif
306
0f53d0cf
LM
307 %xdefine %%stack_offset stack_offset
308 %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
bafad220
LM
309 %assign %%i 0
310 %rep %0
311 %xdefine %1q r %+ %%i %+ q
312 %xdefine %1d r %+ %%i %+ d
313 %xdefine %1w r %+ %%i %+ w
96c9cc10 314 %xdefine %1h r %+ %%i %+ h
bafad220 315 %xdefine %1b r %+ %%i %+ b
2f77923d 316 %xdefine %1m r %+ %%i %+ m
98b9da2a 317 %xdefine %1mp r %+ %%i %+ mp
bafad220
LM
318 CAT_XDEFINE arg_name, %%i, %1
319 %assign %%i %%i+1
320 %rotate 1
321 %endrep
0f53d0cf
LM
322 %xdefine stack_offset %%stack_offset
323 %assign n_arg_names %0
bafad220
LM
324%endmacro
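; e.g. (illustrative): a bare
;   DEFINE_ARGS dst, stride, h
; can be issued mid-function to define dstq/strideq/hq (and their d/w/b/m variants)
; as aliases for r0/r1/r2 once the original argument names are no longer meaningful.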
325
6f40e9f0
RB
326%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
327 %ifnum %1
328 %if %1 != 0
329 %assign %%stack_alignment ((mmsize + 15) & ~15)
330 %assign stack_size %1
331 %if stack_size < 0
332 %assign stack_size -stack_size
333 %endif
a34d9ad9
RB
334 %if mmsize != 8
335 %assign xmm_regs_used %2
336 %endif
6f40e9f0
RB
337 %if mmsize <= 16 && HAVE_ALIGNED_STACK
338 %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
339 %if xmm_regs_used > 6
340 %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
341 %endif
342 SUB rsp, stack_size_padded
343 %else
a34d9ad9
RB
344 %assign %%reg_num (regs_used - 1)
345 %xdefine rstk r %+ %%reg_num
6f40e9f0
RB
346 ; align stack, and save original stack location directly above
347 ; it, i.e. in [rsp+stack_size_padded], so we can restore the
348 ; stack in a single instruction (i.e. mov rsp, rstk or mov
349 ; rsp, [rsp+stack_size_padded])
350 mov rstk, rsp
351 %assign stack_size_padded stack_size
352 %if xmm_regs_used > 6
353 %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16
a34d9ad9
RB
354 %if mmsize == 32 && xmm_regs_used & 1
355 ; re-align to 32 bytes
356 %assign stack_size_padded (stack_size_padded + 16)
357 %endif
6f40e9f0
RB
358 %endif
359 %if %1 < 0 ; need to store rsp on stack
360 sub rsp, gprsize+stack_size_padded
361 and rsp, ~(%%stack_alignment-1)
362 %xdefine rstkm [rsp+stack_size_padded]
363 mov rstkm, rstk
364 %else ; can keep rsp in rstk during whole function
365 sub rsp, stack_size_padded
366 and rsp, ~(%%stack_alignment-1)
367 %xdefine rstkm rstk
368 %endif
369 %endif
370 %if xmm_regs_used > 6
371 WIN64_PUSH_XMM
372 %endif
373 %endif
374 %endif
375%endmacro
376
377%macro SETUP_STACK_POINTER 1
378 %ifnum %1
379 %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32)
380 %if %1 > 0
381 %assign regs_used (regs_used + 1)
382 %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
383 %warning "Stack pointer will overwrite register argument"
384 %endif
385 %endif
386 %endif
387%endmacro
388
389%macro DEFINE_ARGS_INTERNAL 3+
390 %ifnum %2
391 DEFINE_ARGS %3
392 %elif %1 == 4
393 DEFINE_ARGS %2
394 %elif %1 > 4
395 DEFINE_ARGS %2, %3
396 %endif
397%endmacro
398
3b15a6d7 399%if WIN64 ; Windows x64 ;=================================================
bafad220 400
96c9cc10
RB
401DECLARE_REG 0, rcx
402DECLARE_REG 1, rdx
403DECLARE_REG 2, R8
404DECLARE_REG 3, R9
405DECLARE_REG 4, R10, 40
406DECLARE_REG 5, R11, 48
407DECLARE_REG 6, rax, 56
408DECLARE_REG 7, rdi, 64
409DECLARE_REG 8, rsi, 72
410DECLARE_REG 9, rbx, 80
411DECLARE_REG 10, rbp, 88
412DECLARE_REG 11, R12, 96
413DECLARE_REG 12, R13, 104
414DECLARE_REG 13, R14, 112
415DECLARE_REG 14, R15, 120
3f87f39c 416
6f40e9f0 417%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
729f90e2 418 %assign num_args %1
3f87f39c 419 %assign regs_used %2
729f90e2 420 ASSERT regs_used >= num_args
a34d9ad9 421 SETUP_STACK_POINTER %4
729f90e2
HG
422 ASSERT regs_used <= 15
423 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
6f40e9f0
RB
424 ALLOC_STACK %4, %3
425 %if mmsize != 8 && stack_size == 0
9cf73853
HG
426 WIN64_SPILL_XMM %3
427 %endif
729f90e2 428 LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
6f40e9f0
RB
429 DEFINE_ARGS_INTERNAL %0, %4, %5
430%endmacro
431
432%macro WIN64_PUSH_XMM 0
433 %assign %%i xmm_regs_used
434 %rep (xmm_regs_used-6)
435 %assign %%i %%i-1
140367af 436 movdqa [rsp + (%%i-6)*16 + stack_size + (~stack_offset&8)], xmm %+ %%i
6f40e9f0 437 %endrep
532e7697
LM
438%endmacro
439
440%macro WIN64_SPILL_XMM 1
441 %assign xmm_regs_used %1
442 ASSERT xmm_regs_used <= 16
3f87f39c 443 %if xmm_regs_used > 6
140367af 444 SUB rsp, (xmm_regs_used-6)*16+16
6f40e9f0 445 WIN64_PUSH_XMM
3f87f39c 446 %endif
3f87f39c
JA
447%endmacro
448
532e7697 449%macro WIN64_RESTORE_XMM_INTERNAL 1
3f87f39c
JA
450 %if xmm_regs_used > 6
451 %assign %%i xmm_regs_used
452 %rep (xmm_regs_used-6)
453 %assign %%i %%i-1
140367af 454 movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size+(~stack_offset&8)]
3f87f39c 455 %endrep
140367af
RB
456 %if stack_size_padded == 0
457 add %1, (xmm_regs_used-6)*16+16
458 %endif
6f40e9f0
RB
459 %endif
460 %if stack_size_padded > 0
461 %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
462 mov rsp, rstkm
463 %else
464 add %1, stack_size_padded
465 %endif
3f87f39c
JA
466 %endif
467%endmacro
468
532e7697
LM
469%macro WIN64_RESTORE_XMM 1
470 WIN64_RESTORE_XMM_INTERNAL %1
6f40e9f0 471 %assign stack_offset (stack_offset-stack_size_padded)
3f87f39c
JA
472 %assign xmm_regs_used 0
473%endmacro
474
6f40e9f0 475%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
96c9cc10 476
3f87f39c 477%macro RET 0
532e7697 478 WIN64_RESTORE_XMM_INTERNAL rsp
729f90e2 479 POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
30b45d9c
RB
480%if mmsize == 32
481 vzeroupper
482%endif
3f87f39c 483 ret
bafad220
LM
484%endmacro
485
3b15a6d7 486%elif ARCH_X86_64 ; *nix x64 ;=============================================
bafad220 487
96c9cc10
RB
488DECLARE_REG 0, rdi
489DECLARE_REG 1, rsi
490DECLARE_REG 2, rdx
491DECLARE_REG 3, rcx
492DECLARE_REG 4, R8
493DECLARE_REG 5, R9
494DECLARE_REG 6, rax, 8
495DECLARE_REG 7, R10, 16
496DECLARE_REG 8, R11, 24
497DECLARE_REG 9, rbx, 32
498DECLARE_REG 10, rbp, 40
499DECLARE_REG 11, R12, 48
500DECLARE_REG 12, R13, 56
501DECLARE_REG 13, R14, 64
502DECLARE_REG 14, R15, 72
bafad220 503
6f40e9f0 504%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
729f90e2
HG
505 %assign num_args %1
506 %assign regs_used %2
507 ASSERT regs_used >= num_args
a34d9ad9 508 SETUP_STACK_POINTER %4
729f90e2
HG
509 ASSERT regs_used <= 15
510 PUSH_IF_USED 9, 10, 11, 12, 13, 14
6f40e9f0 511 ALLOC_STACK %4
729f90e2 512 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
6f40e9f0 513 DEFINE_ARGS_INTERNAL %0, %4, %5
bafad220
LM
514%endmacro
515
6f40e9f0 516%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
96c9cc10 517
bafad220 518%macro RET 0
6f40e9f0
RB
519%if stack_size_padded > 0
520%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
521 mov rsp, rstkm
522%else
523 add rsp, stack_size_padded
524%endif
525%endif
729f90e2 526 POP_IF_USED 14, 13, 12, 11, 10, 9
30b45d9c
RB
527%if mmsize == 32
528 vzeroupper
529%endif
bafad220
LM
530 ret
531%endmacro
532
bafad220
LM
533%else ; X86_32 ;==============================================================
534
96c9cc10
RB
535DECLARE_REG 0, eax, 4
536DECLARE_REG 1, ecx, 8
537DECLARE_REG 2, edx, 12
538DECLARE_REG 3, ebx, 16
539DECLARE_REG 4, esi, 20
540DECLARE_REG 5, edi, 24
541DECLARE_REG 6, ebp, 28
bafad220
LM
542%define rsp esp
543
729f90e2
HG
544%macro DECLARE_ARG 1-*
545 %rep %0
6f40e9f0 546 %define r%1m [rstk + stack_offset + 4*%1 + 4]
729f90e2
HG
547 %define r%1mp dword r%1m
548 %rotate 1
549 %endrep
bafad220
LM
550%endmacro
551
729f90e2 552DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
bafad220 553
6f40e9f0 554%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
729f90e2 555 %assign num_args %1
bafad220 556 %assign regs_used %2
a34d9ad9
RB
557 ASSERT regs_used >= num_args
558 %if num_args > 7
559 %assign num_args 7
560 %endif
729f90e2
HG
561 %if regs_used > 7
562 %assign regs_used 7
563 %endif
6f40e9f0
RB
564 SETUP_STACK_POINTER %4
565 ASSERT regs_used <= 7
729f90e2 566 PUSH_IF_USED 3, 4, 5, 6
6f40e9f0 567 ALLOC_STACK %4
729f90e2 568 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
6f40e9f0 569 DEFINE_ARGS_INTERNAL %0, %4, %5
bafad220
LM
570%endmacro
571
6f40e9f0 572%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
96c9cc10 573
bafad220 574%macro RET 0
6f40e9f0
RB
575%if stack_size_padded > 0
576%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
577 mov rsp, rstkm
578%else
579 add rsp, stack_size_padded
580%endif
581%endif
729f90e2 582 POP_IF_USED 6, 5, 4, 3
30b45d9c
RB
583%if mmsize == 32
584 vzeroupper
585%endif
bafad220
LM
586 ret
587%endmacro
588
bafad220
LM
589%endif ;======================================================================
590
3b15a6d7 591%if WIN64 == 0
532e7697
LM
592%macro WIN64_SPILL_XMM 1
593%endmacro
594%macro WIN64_RESTORE_XMM 1
595%endmacro
6f40e9f0
RB
596%macro WIN64_PUSH_XMM 0
597%endmacro
532e7697
LM
598%endif
599
96c9cc10
RB
600%macro REP_RET 0
601 %if has_epilogue
602 RET
603 %else
604 rep ret
605 %endif
606%endmacro
607
608%macro TAIL_CALL 2 ; callee, is_nonadjacent
609 %if has_epilogue
610 call %1
611 RET
612 %elif %2
613 jmp %1
614 %endif
615%endmacro
616
bafad220
LM
617;=============================================================================
618; arch-independent part
619;=============================================================================
620
621%assign function_align 16
622
2f7f2e4b
LM
623; Begin a function.
624; Applies any symbol mangling needed for C linkage, and sets up a define such that
625; subsequent uses of the function name automatically refer to the mangled version.
626; Appends cpuflags to the function name if cpuflags has been specified.
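; e.g. (illustrative):
;   INIT_XMM sse2
;   cglobal scale, 2,2   ; emits <private_prefix>_scale_sse2 (with a leading '_' if PREFIX is set)
; and a later 'call scale' resolves to the _sse2 variant via the call macro further down.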
a34d9ad9
RB
627%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
628 ; the "" is a workaround for nasm, which fails if SUFFIX is empty
629 ; and we call cglobal_internal with just %1 %+ SUFFIX (without %2)
2f7f2e4b 630 cglobal_internal %1 %+ SUFFIX, %2
2f7f2e4b
LM
631%endmacro
632%macro cglobal_internal 1-2+
633 %ifndef cglobaled_%1
ef5d41a5 634 %xdefine %1 mangle(private_prefix %+ _ %+ %1)
2f7f2e4b
LM
635 %xdefine %1.skip_prologue %1 %+ .skip_prologue
636 CAT_XDEFINE cglobaled_, %1, 1
637 %endif
638 %xdefine current_function %1
bafad220 639 %ifidn __OUTPUT_FORMAT__,elf
40c7d0ae 640 global %1:function hidden
bafad220 641 %else
40c7d0ae 642 global %1
bafad220
LM
643 %endif
644 align function_align
645 %1:
646 RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
6f40e9f0 647 %xdefine rstk rsp
3f87f39c 648 %assign stack_offset 0
6f40e9f0
RB
649 %assign stack_size 0
650 %assign stack_size_padded 0
a34d9ad9
RB
651 %assign xmm_regs_used 0
652 %ifnidn %2, ""
bafad220
LM
653 PROLOGUE %2
654 %endif
655%endmacro
656
657%macro cextern 1
ef5d41a5 658 %xdefine %1 mangle(private_prefix %+ _ %+ %1)
2f7f2e4b 659 CAT_XDEFINE cglobaled_, %1, 1
2966cc18
JGG
660 extern %1
661%endmacro
662
2f7f2e4b 663; like cextern, but without the prefix
2966cc18
JGG
664%macro cextern_naked 1
665 %xdefine %1 mangle(%1)
2f7f2e4b 666 CAT_XDEFINE cglobaled_, %1, 1
3f87f39c 667 extern %1
bafad220
LM
668%endmacro
669
2966cc18 670%macro const 2+
ef5d41a5 671 %xdefine %1 mangle(private_prefix %+ _ %+ %1)
2966cc18
JGG
672 global %1
673 %1: %2
674%endmacro
675
bafad220
LM
676; This is needed for ELF, otherwise the GNU linker assumes the stack is
677; executable by default.
678%ifidn __OUTPUT_FORMAT__,elf
679SECTION .note.GNU-stack noalloc noexec nowrite progbits
680%endif
681
2f7f2e4b
LM
682; cpuflags
683
684%assign cpuflags_mmx (1<<0)
685%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
686%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
ca844b7b 687%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
2f7f2e4b
LM
688%assign cpuflags_sse (1<<4) | cpuflags_mmx2
689%assign cpuflags_sse2 (1<<5) | cpuflags_sse
690%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
691%assign cpuflags_sse3 (1<<7) | cpuflags_sse2
692%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3
693%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3
694%assign cpuflags_sse42 (1<<10)| cpuflags_sse4
695%assign cpuflags_avx (1<<11)| cpuflags_sse42
696%assign cpuflags_xop (1<<12)| cpuflags_avx
697%assign cpuflags_fma4 (1<<13)| cpuflags_avx
96c9cc10
RB
698%assign cpuflags_avx2 (1<<14)| cpuflags_avx
699%assign cpuflags_fma3 (1<<15)| cpuflags_avx
2f7f2e4b
LM
700
701%assign cpuflags_cache32 (1<<16)
702%assign cpuflags_cache64 (1<<17)
703%assign cpuflags_slowctz (1<<18)
704%assign cpuflags_lzcnt (1<<19)
705%assign cpuflags_misalign (1<<20)
706%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
707%assign cpuflags_atom (1<<22)
96c9cc10
RB
708%assign cpuflags_bmi1 (1<<23)
709%assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1
710%assign cpuflags_tbm (1<<25)|cpuflags_bmi1
2f7f2e4b
LM
711
712%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
713%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
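; e.g. (illustrative) inside a function body:
;   %if cpuflag(ssse3)
;       pabsw  m0, m0
;   %else
;       pxor   m1, m1
;       psubw  m1, m0
;       pmaxsw m0, m1      ; |m0| without SSSE3
;   %endif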
714
715; Takes up to 2 cpuflags from the above list.
 716; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
 717; You shouldn't need to invoke this macro directly; it's a subroutine for INIT_MMX & co.
718%macro INIT_CPUFLAGS 0-2
180d43bc 719 CPUNOP amdnop
2f7f2e4b
LM
720 %if %0 >= 1
721 %xdefine cpuname %1
722 %assign cpuflags cpuflags_%1
723 %if %0 >= 2
724 %xdefine cpuname %1_%2
725 %assign cpuflags cpuflags | cpuflags_%2
726 %endif
727 %xdefine SUFFIX _ %+ cpuname
728 %if cpuflag(avx)
729 %assign avx_enabled 1
730 %endif
f2bd8a07
JR
731 %if mmsize == 16 && notcpuflag(sse2)
732 %define mova movaps
733 %define movu movups
734 %define movnta movntps
735 %endif
2f7f2e4b
LM
736 %if cpuflag(aligned)
737 %define movu mova
738 %elifidn %1, sse3
739 %define movu lddqu
740 %endif
2cd1f5ca 741 %if notcpuflag(mmx2)
180d43bc 742 CPUNOP basicnop
2cd1f5ca 743 %endif
2f7f2e4b
LM
744 %else
745 %xdefine SUFFIX
746 %undef cpuname
747 %undef cpuflags
748 %endif
749%endmacro
750
bafad220
LM
751; merge mmx and sse*
752
753%macro CAT_XDEFINE 3
754 %xdefine %1%2 %3
755%endmacro
756
757%macro CAT_UNDEF 2
758 %undef %1%2
759%endmacro
760
2f7f2e4b 761%macro INIT_MMX 0-1+
33cbfa6f 762 %assign avx_enabled 0
2f7f2e4b 763 %define RESET_MM_PERMUTATION INIT_MMX %1
bafad220
LM
764 %define mmsize 8
765 %define num_mmregs 8
766 %define mova movq
767 %define movu movq
768 %define movh movd
532e7697 769 %define movnta movntq
bafad220
LM
770 %assign %%i 0
771 %rep 8
772 CAT_XDEFINE m, %%i, mm %+ %%i
773 CAT_XDEFINE nmm, %%i, %%i
774 %assign %%i %%i+1
775 %endrep
776 %rep 8
777 CAT_UNDEF m, %%i
778 CAT_UNDEF nmm, %%i
779 %assign %%i %%i+1
780 %endrep
2f7f2e4b 781 INIT_CPUFLAGS %1
bafad220
LM
782%endmacro
783
2f7f2e4b 784%macro INIT_XMM 0-1+
33cbfa6f 785 %assign avx_enabled 0
2f7f2e4b 786 %define RESET_MM_PERMUTATION INIT_XMM %1
bafad220
LM
787 %define mmsize 16
788 %define num_mmregs 8
3b15a6d7 789 %if ARCH_X86_64
bafad220
LM
790 %define num_mmregs 16
791 %endif
792 %define mova movdqa
793 %define movu movdqu
794 %define movh movq
532e7697 795 %define movnta movntdq
bafad220
LM
796 %assign %%i 0
797 %rep num_mmregs
798 CAT_XDEFINE m, %%i, xmm %+ %%i
799 CAT_XDEFINE nxmm, %%i, %%i
800 %assign %%i %%i+1
801 %endrep
2f7f2e4b 802 INIT_CPUFLAGS %1
bafad220
LM
803%endmacro
804
2f7f2e4b 805%macro INIT_YMM 0-1+
33cbfa6f 806 %assign avx_enabled 1
2f7f2e4b 807 %define RESET_MM_PERMUTATION INIT_YMM %1
33cbfa6f
VS
808 %define mmsize 32
809 %define num_mmregs 8
3b15a6d7 810 %if ARCH_X86_64
33cbfa6f
VS
811 %define num_mmregs 16
812 %endif
813 %define mova vmovaps
814 %define movu vmovups
2f7f2e4b
LM
815 %undef movh
816 %define movnta vmovntps
33cbfa6f
VS
817 %assign %%i 0
818 %rep num_mmregs
819 CAT_XDEFINE m, %%i, ymm %+ %%i
820 CAT_XDEFINE nymm, %%i, %%i
821 %assign %%i %%i+1
822 %endrep
2f7f2e4b 823 INIT_CPUFLAGS %1
33cbfa6f
VS
824%endmacro
825
2f7f2e4b 826INIT_XMM
bafad220
LM
827
828; I often want to use macros that permute their arguments. e.g. there's no
829; efficient way to implement butterfly or transpose or dct without swapping some
830; arguments.
831;
832; I would like to not have to manually keep track of the permutations:
833; If I insert a permutation in the middle of a function, it should automatically
834; change everything that follows. For more complex macros I may also have multiple
835; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
836;
837; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
838; permutes its arguments. It's equivalent to exchanging the contents of the
839; registers, except that this way you exchange the register names instead, so it
840; doesn't cost any cycles.
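; e.g. (illustrative): after
;   SWAP 0, 1
; the name m0 refers to the register previously known as m1 and vice versa, so a
; following 'mova m2, m0' reads what used to be m1; SWAP itself emits no instructions.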
841
842%macro PERMUTE 2-* ; takes a list of pairs to swap
843%rep %0/2
844 %xdefine tmp%2 m%2
845 %xdefine ntmp%2 nm%2
846 %rotate 2
847%endrep
848%rep %0/2
849 %xdefine m%1 tmp%2
850 %xdefine nm%1 ntmp%2
851 %undef tmp%2
852 %undef ntmp%2
853 %rotate 2
854%endrep
855%endmacro
856
857%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
858%rep %0-1
859%ifdef m%1
860 %xdefine tmp m%1
861 %xdefine m%1 m%2
862 %xdefine m%2 tmp
863 CAT_XDEFINE n, m%1, %1
864 CAT_XDEFINE n, m%2, %2
865%else
866 ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
867 ; Be careful using this mode in nested macros though, as in some cases there may be
868 ; other copies of m# that have already been dereferenced and don't get updated correctly.
869 %xdefine %%n1 n %+ %1
870 %xdefine %%n2 n %+ %2
871 %xdefine tmp m %+ %%n1
872 CAT_XDEFINE m, %%n1, m %+ %%n2
873 CAT_XDEFINE m, %%n2, tmp
874 CAT_XDEFINE n, m %+ %%n1, %%n1
875 CAT_XDEFINE n, m %+ %%n2, %%n2
876%endif
877 %undef tmp
878 %rotate 1
879%endrep
880%endmacro
881
2f7f2e4b
LM
882; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
883; calls to that function will automatically load the permutation, so values can
884; be returned in mmregs.
885%macro SAVE_MM_PERMUTATION 0-1
886 %if %0
887 %xdefine %%f %1_m
888 %else
889 %xdefine %%f current_function %+ _m
890 %endif
bafad220
LM
891 %assign %%i 0
892 %rep num_mmregs
2f7f2e4b 893 CAT_XDEFINE %%f, %%i, m %+ %%i
bafad220
LM
894 %assign %%i %%i+1
895 %endrep
896%endmacro
897
2966cc18 898%macro LOAD_MM_PERMUTATION 1 ; name to load from
2f7f2e4b
LM
899 %ifdef %1_m0
900 %assign %%i 0
901 %rep num_mmregs
902 CAT_XDEFINE m, %%i, %1_m %+ %%i
903 CAT_XDEFINE n, m %+ %%i, %%i
904 %assign %%i %%i+1
905 %endrep
906 %endif
bafad220
LM
907%endmacro
908
2f7f2e4b 909; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
bafad220 910%macro call 1
edd82267 911 call_internal %1 %+ SUFFIX, %1
2f7f2e4b
LM
912%endmacro
913%macro call_internal 2
edd82267
MR
914 %xdefine %%i %2
915 %ifndef cglobaled_%2
916 %ifdef cglobaled_%1
917 %xdefine %%i %1
2f7f2e4b 918 %endif
bafad220 919 %endif
2f7f2e4b
LM
920 call %%i
921 LOAD_MM_PERMUTATION %%i
bafad220
LM
922%endmacro
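; e.g. (illustrative, with a made-up helper): if a helper ends with
;   cglobal dct_helper
;       SWAP 0, 2
;       SAVE_MM_PERMUTATION
;       ret
; then a later 'call dct_helper' reloads that permutation in the caller, so the
; caller sees the helper's results under the same m# names the helper used.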
923
2966cc18 924; Substitutions that reduce instruction size but are functionally equivalent (+128 has no sign-extended imm8 encoding, but -128 does)
3f87f39c
JA
925%macro add 2
926 %ifnum %2
927 %if %2==128
928 sub %1, -128
929 %else
930 add %1, %2
931 %endif
932 %else
933 add %1, %2
934 %endif
935%endmacro
936
937%macro sub 2
938 %ifnum %2
939 %if %2==128
940 add %1, -128
941 %else
942 sub %1, %2
943 %endif
944 %else
945 sub %1, %2
946 %endif
947%endmacro
33cbfa6f
VS
948
949;=============================================================================
950; AVX abstraction layer
951;=============================================================================
952
953%assign i 0
954%rep 16
955 %if i < 8
956 CAT_XDEFINE sizeofmm, i, 8
957 %endif
958 CAT_XDEFINE sizeofxmm, i, 16
959 CAT_XDEFINE sizeofymm, i, 32
960%assign i i+1
961%endrep
962%undef i
963
96c9cc10
RB
964%macro CHECK_AVX_INSTR_EMU 3-*
965 %xdefine %%opcode %1
966 %xdefine %%dst %2
967 %rep %0-2
968 %ifidn %%dst, %3
969 %error non-avx emulation of ``%%opcode'' is not supported
970 %endif
971 %rotate 1
972 %endrep
973%endmacro
974
33cbfa6f
VS
975;%1 == instruction
976;%2 == 1 if float, 0 if int
705f3d47 977;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
33cbfa6f
VS
978;%4 == number of operands given
979;%5+: operands
980%macro RUN_AVX_INSTR 6-7+
96c9cc10
RB
981 %ifid %6
982 %define %%sizeofreg sizeof%6
983 %elifid %5
984 %define %%sizeofreg sizeof%5
2f7f2e4b 985 %else
96c9cc10 986 %define %%sizeofreg mmsize
2f7f2e4b 987 %endif
96c9cc10
RB
988 %if %%sizeofreg==32
989 %if %4>=3
705f3d47
LM
990 v%1 %5, %6, %7
991 %else
992 v%1 %5, %6
993 %endif
33cbfa6f 994 %else
96c9cc10 995 %if %%sizeofreg==8
33cbfa6f
VS
996 %define %%regmov movq
997 %elif %2
998 %define %%regmov movaps
999 %else
1000 %define %%regmov movdqa
1001 %endif
1002
1003 %if %4>=3+%3
1004 %ifnidn %5, %6
96c9cc10 1005 %if avx_enabled && %%sizeofreg==16
33cbfa6f
VS
1006 v%1 %5, %6, %7
1007 %else
96c9cc10 1008 CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
33cbfa6f
VS
1009 %%regmov %5, %6
1010 %1 %5, %7
1011 %endif
1012 %else
1013 %1 %5, %7
1014 %endif
96c9cc10 1015 %elif %4>=3
33cbfa6f
VS
1016 %1 %5, %6, %7
1017 %else
1018 %1 %5, %6
1019 %endif
1020 %endif
1021%endmacro
1022
2f7f2e4b
LM
1023; 3arg AVX ops with a memory arg can only have it in src2,
1024; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
1025; So, if the op is symmetric and the wrong one is memory, swap them.
1026%macro RUN_AVX_INSTR1 8
1027 %assign %%swap 0
1028 %if avx_enabled
1029 %ifnid %6
1030 %assign %%swap 1
1031 %endif
1032 %elifnidn %5, %6
1033 %ifnid %7
1034 %assign %%swap 1
1035 %endif
1036 %endif
1037 %if %%swap && %3 == 0 && %8 == 1
1038 RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
1039 %else
1040 RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
1041 %endif
1042%endmacro
1043
33cbfa6f
VS
1044;%1 == instruction
1045;%2 == 1 if float, 0 if int
96c9cc10 1046;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
2f7f2e4b
LM
1047;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
1048%macro AVX_INSTR 4
1049 %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
33cbfa6f
VS
1050 %ifidn %3, fnord
1051 RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
1052 %elifidn %4, fnord
2f7f2e4b 1053 RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
33cbfa6f
VS
1054 %elifidn %5, fnord
1055 RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
1056 %else
1057 RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
1058 %endif
1059 %endmacro
1060%endmacro
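; e.g. (illustrative): with plain INIT_XMM,
;   addps m0, m1, m2
; expands to 'movaps xmm0, xmm1' followed by 'addps xmm0, xmm2', while under
; INIT_XMM avx the same line assembles to 'vaddps xmm0, xmm1, xmm2'.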
1061
2f7f2e4b
LM
1062AVX_INSTR addpd, 1, 0, 1
1063AVX_INSTR addps, 1, 0, 1
1064AVX_INSTR addsd, 1, 0, 1
1065AVX_INSTR addss, 1, 0, 1
1066AVX_INSTR addsubpd, 1, 0, 0
1067AVX_INSTR addsubps, 1, 0, 0
1068AVX_INSTR andpd, 1, 0, 1
1069AVX_INSTR andps, 1, 0, 1
1070AVX_INSTR andnpd, 1, 0, 0
1071AVX_INSTR andnps, 1, 0, 0
1072AVX_INSTR blendpd, 1, 0, 0
1073AVX_INSTR blendps, 1, 0, 0
1074AVX_INSTR blendvpd, 1, 0, 0
1075AVX_INSTR blendvps, 1, 0, 0
1076AVX_INSTR cmppd, 1, 0, 0
1077AVX_INSTR cmpps, 1, 0, 0
1078AVX_INSTR cmpsd, 1, 0, 0
1079AVX_INSTR cmpss, 1, 0, 0
705f3d47 1080AVX_INSTR cvtdq2ps, 1, 0, 0
b30a3633 1081AVX_INSTR cvtpd2dq, 1, 0, 0
705f3d47 1082AVX_INSTR cvtps2dq, 1, 0, 0
2f7f2e4b
LM
1083AVX_INSTR divpd, 1, 0, 0
1084AVX_INSTR divps, 1, 0, 0
1085AVX_INSTR divsd, 1, 0, 0
1086AVX_INSTR divss, 1, 0, 0
1087AVX_INSTR dppd, 1, 1, 0
1088AVX_INSTR dpps, 1, 1, 0
1089AVX_INSTR haddpd, 1, 0, 0
1090AVX_INSTR haddps, 1, 0, 0
1091AVX_INSTR hsubpd, 1, 0, 0
1092AVX_INSTR hsubps, 1, 0, 0
1093AVX_INSTR maxpd, 1, 0, 1
1094AVX_INSTR maxps, 1, 0, 1
1095AVX_INSTR maxsd, 1, 0, 1
1096AVX_INSTR maxss, 1, 0, 1
1097AVX_INSTR minpd, 1, 0, 1
1098AVX_INSTR minps, 1, 0, 1
1099AVX_INSTR minsd, 1, 0, 1
1100AVX_INSTR minss, 1, 0, 1
39df0c43
VS
1101AVX_INSTR movhlps, 1, 0, 0
1102AVX_INSTR movlhps, 1, 0, 0
2f7f2e4b
LM
1103AVX_INSTR movsd, 1, 0, 0
1104AVX_INSTR movss, 1, 0, 0
1105AVX_INSTR mpsadbw, 0, 1, 0
1106AVX_INSTR mulpd, 1, 0, 1
1107AVX_INSTR mulps, 1, 0, 1
1108AVX_INSTR mulsd, 1, 0, 1
1109AVX_INSTR mulss, 1, 0, 1
1110AVX_INSTR orpd, 1, 0, 1
1111AVX_INSTR orps, 1, 0, 1
96c9cc10
RB
1112AVX_INSTR pabsb, 0, 0, 0
1113AVX_INSTR pabsw, 0, 0, 0
1114AVX_INSTR pabsd, 0, 0, 0
2f7f2e4b
LM
1115AVX_INSTR packsswb, 0, 0, 0
1116AVX_INSTR packssdw, 0, 0, 0
1117AVX_INSTR packuswb, 0, 0, 0
1118AVX_INSTR packusdw, 0, 0, 0
1119AVX_INSTR paddb, 0, 0, 1
1120AVX_INSTR paddw, 0, 0, 1
1121AVX_INSTR paddd, 0, 0, 1
1122AVX_INSTR paddq, 0, 0, 1
1123AVX_INSTR paddsb, 0, 0, 1
1124AVX_INSTR paddsw, 0, 0, 1
1125AVX_INSTR paddusb, 0, 0, 1
1126AVX_INSTR paddusw, 0, 0, 1
1127AVX_INSTR palignr, 0, 1, 0
1128AVX_INSTR pand, 0, 0, 1
1129AVX_INSTR pandn, 0, 0, 0
1130AVX_INSTR pavgb, 0, 0, 1
1131AVX_INSTR pavgw, 0, 0, 1
1132AVX_INSTR pblendvb, 0, 0, 0
1133AVX_INSTR pblendw, 0, 1, 0
1134AVX_INSTR pcmpestri, 0, 0, 0
1135AVX_INSTR pcmpestrm, 0, 0, 0
1136AVX_INSTR pcmpistri, 0, 0, 0
1137AVX_INSTR pcmpistrm, 0, 0, 0
1138AVX_INSTR pcmpeqb, 0, 0, 1
1139AVX_INSTR pcmpeqw, 0, 0, 1
1140AVX_INSTR pcmpeqd, 0, 0, 1
1141AVX_INSTR pcmpeqq, 0, 0, 1
1142AVX_INSTR pcmpgtb, 0, 0, 0
1143AVX_INSTR pcmpgtw, 0, 0, 0
1144AVX_INSTR pcmpgtd, 0, 0, 0
1145AVX_INSTR pcmpgtq, 0, 0, 0
1146AVX_INSTR phaddw, 0, 0, 0
1147AVX_INSTR phaddd, 0, 0, 0
1148AVX_INSTR phaddsw, 0, 0, 0
1149AVX_INSTR phsubw, 0, 0, 0
1150AVX_INSTR phsubd, 0, 0, 0
1151AVX_INSTR phsubsw, 0, 0, 0
1152AVX_INSTR pmaddwd, 0, 0, 1
1153AVX_INSTR pmaddubsw, 0, 0, 0
1154AVX_INSTR pmaxsb, 0, 0, 1
1155AVX_INSTR pmaxsw, 0, 0, 1
1156AVX_INSTR pmaxsd, 0, 0, 1
1157AVX_INSTR pmaxub, 0, 0, 1
1158AVX_INSTR pmaxuw, 0, 0, 1
1159AVX_INSTR pmaxud, 0, 0, 1
1160AVX_INSTR pminsb, 0, 0, 1
1161AVX_INSTR pminsw, 0, 0, 1
1162AVX_INSTR pminsd, 0, 0, 1
1163AVX_INSTR pminub, 0, 0, 1
1164AVX_INSTR pminuw, 0, 0, 1
1165AVX_INSTR pminud, 0, 0, 1
96c9cc10 1166AVX_INSTR pmovmskb, 0, 0, 0
2f7f2e4b
LM
1167AVX_INSTR pmulhuw, 0, 0, 1
1168AVX_INSTR pmulhrsw, 0, 0, 1
1169AVX_INSTR pmulhw, 0, 0, 1
1170AVX_INSTR pmullw, 0, 0, 1
1171AVX_INSTR pmulld, 0, 0, 1
1172AVX_INSTR pmuludq, 0, 0, 1
1173AVX_INSTR pmuldq, 0, 0, 1
1174AVX_INSTR por, 0, 0, 1
1175AVX_INSTR psadbw, 0, 0, 1
1176AVX_INSTR pshufb, 0, 0, 0
96c9cc10
RB
1177AVX_INSTR pshufd, 0, 1, 0
1178AVX_INSTR pshufhw, 0, 1, 0
1179AVX_INSTR pshuflw, 0, 1, 0
2f7f2e4b
LM
1180AVX_INSTR psignb, 0, 0, 0
1181AVX_INSTR psignw, 0, 0, 0
1182AVX_INSTR psignd, 0, 0, 0
1183AVX_INSTR psllw, 0, 0, 0
1184AVX_INSTR pslld, 0, 0, 0
1185AVX_INSTR psllq, 0, 0, 0
1186AVX_INSTR pslldq, 0, 0, 0
1187AVX_INSTR psraw, 0, 0, 0
1188AVX_INSTR psrad, 0, 0, 0
1189AVX_INSTR psrlw, 0, 0, 0
1190AVX_INSTR psrld, 0, 0, 0
1191AVX_INSTR psrlq, 0, 0, 0
1192AVX_INSTR psrldq, 0, 0, 0
1193AVX_INSTR psubb, 0, 0, 0
1194AVX_INSTR psubw, 0, 0, 0
1195AVX_INSTR psubd, 0, 0, 0
1196AVX_INSTR psubq, 0, 0, 0
1197AVX_INSTR psubsb, 0, 0, 0
1198AVX_INSTR psubsw, 0, 0, 0
1199AVX_INSTR psubusb, 0, 0, 0
1200AVX_INSTR psubusw, 0, 0, 0
96c9cc10 1201AVX_INSTR ptest, 0, 0, 0
2f7f2e4b
LM
1202AVX_INSTR punpckhbw, 0, 0, 0
1203AVX_INSTR punpckhwd, 0, 0, 0
1204AVX_INSTR punpckhdq, 0, 0, 0
1205AVX_INSTR punpckhqdq, 0, 0, 0
1206AVX_INSTR punpcklbw, 0, 0, 0
1207AVX_INSTR punpcklwd, 0, 0, 0
1208AVX_INSTR punpckldq, 0, 0, 0
1209AVX_INSTR punpcklqdq, 0, 0, 0
1210AVX_INSTR pxor, 0, 0, 1
6b6ee582 1211AVX_INSTR shufps, 1, 1, 0
2f7f2e4b
LM
1212AVX_INSTR subpd, 1, 0, 0
1213AVX_INSTR subps, 1, 0, 0
1214AVX_INSTR subsd, 1, 0, 0
1215AVX_INSTR subss, 1, 0, 0
1216AVX_INSTR unpckhpd, 1, 0, 0
1217AVX_INSTR unpckhps, 1, 0, 0
1218AVX_INSTR unpcklpd, 1, 0, 0
1219AVX_INSTR unpcklps, 1, 0, 0
1220AVX_INSTR xorpd, 1, 0, 1
1221AVX_INSTR xorps, 1, 0, 1
33cbfa6f
VS
1222
 1223; 3DNow instructions, for sharing code between AVX, SSE and 3DNow!
2f7f2e4b
LM
1224AVX_INSTR pfadd, 1, 0, 1
1225AVX_INSTR pfsub, 1, 0, 0
1226AVX_INSTR pfmul, 1, 0, 1
1227
1228; base-4 constants for shuffles
1229%assign i 0
1230%rep 256
1231 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
1232 %if j < 10
1233 CAT_XDEFINE q000, j, i
1234 %elif j < 100
1235 CAT_XDEFINE q00, j, i
1236 %elif j < 1000
1237 CAT_XDEFINE q0, j, i
1238 %else
1239 CAT_XDEFINE q, j, i
1240 %endif
1241%assign i i+1
1242%endrep
1243%undef i
1244%undef j
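; e.g. (illustrative): q3120 expands to 0xD8, so
;   pshufd m0, m1, q3120
; picks source dwords 3,1,2,0 (digits read from most- to least-significant
; destination element); the identity shuffle is q3210 (0xE4).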
1245
1246%macro FMA_INSTR 3
79687079
JR
1247 %macro %1 5-8 %1, %2, %3
1248 %if cpuflag(xop) || cpuflag(fma4)
1249 v%6 %1, %2, %3, %4
2f7f2e4b 1250 %else
79687079
JR
1251 %ifidn %1, %4
1252 %7 %5, %2, %3
1253 %8 %1, %4, %5
1254 %else
1255 %7 %1, %2, %3
1256 %8 %1, %4
1257 %endif
2f7f2e4b
LM
1258 %endif
1259 %endmacro
1260%endmacro
1261
79687079 1262FMA_INSTR fmaddps, mulps, addps
2f7f2e4b
LM
1263FMA_INSTR pmacsdd, pmulld, paddd
1264FMA_INSTR pmacsww, pmullw, paddw
1265FMA_INSTR pmadcswd, pmaddwd, paddd
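; e.g. (illustrative):
;   pmacsww m0, m1, m2, m0, m3
; computes m0 = m1*m2 + m0 in one op on XOP, and elsewhere falls back to
; 'pmullw m3, m1, m2' followed by 'paddw m0, m0, m3' (m3 is just a scratch reg).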
96c9cc10
RB
1266
1267; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
1268; This lets us use tzcnt without bumping the yasm version requirement yet.
1269%define tzcnt rep bsf
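; e.g. (illustrative): 'tzcnt eax, ecx' is encoded as 'rep bsf eax, ecx'; CPUs without
; BMI1 simply ignore the rep prefix and execute a plain bsf.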