;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Henrik Gramner <hengar-6@student.ltu.se>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org.

%ifndef program_name
    %define program_name x264
%endif

%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

; aout does not support align=
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

%macro CPUNOP 1
    %if HAVE_CPUNOP
        CPU %1
    %endif
%endmacro

; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
CPUNOP amdnop

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

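; A purely illustrative (hypothetical) body for the foo example above; the
; declared names expand to the proper registers for each calling convention:
;     mov  tmpd, [srcq]   ; tmp/src resolve to r2d/r1
;     mov  [dstq], tmpd   ; dst resolves to r0
;     RET
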
; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for Athlons
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
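; For instance, if arg 0 arrived in a register, r0m aliases that register's
; dword form; if it arrived on the stack, r0m is its stack slot. Either way
; the arg can be re-read through r0m after r0 itself has been clobbered.
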
%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rsp + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [esp + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1  %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
%if ARCH_X86_64 == 0
    %define r%1 e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
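
; e.g. "DECLARE_REG_TMP 2, 0" (hypothetical usage) makes t0 an alias for r2
; and t1 an alias for r0, so shared code can be written in terms of t0/t1
; while each code path picks the underlying registers once.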

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro
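
; PUSH/POP keep stack_offset in sync so rNm references (which add
; stack_offset) keep pointing at the right slots; an illustrative sketch:
;     PUSH r5             ; stack_offset += gprsize
;     mov  r0d, r6m       ; still loads arg 6 from the correct slot
;     POP  r5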

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro
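
; e.g. "DEFINE_ARGS dst, src, len" makes dstq/dstd/dstw/dsth/dstb/dstm/dstmp
; aliases for the r0 forms, the src* names for r1, and the len* names for r2.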

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    %if mmsize == 8
        %assign xmm_regs_used 0
    %else
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        SUB rsp, (xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
        %endrep
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
%if mmsize == 32
    vzeroupper
%endif
    ret
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro

%define has_epilogue regs_used > 9 || mmsize == 32

%macro RET 0
    POP_IF_USED 14, 13, 12, 11, 10, 9
%if mmsize == 32
    vzeroupper
%endif
    ret
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [esp + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    ASSERT regs_used >= num_args
    PUSH_IF_USED 3, 4, 5, 6
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS %4
%endmacro

%define has_epilogue regs_used > 3 || mmsize == 32

%macro RET 0
    POP_IF_USED 6, 5, 4, 3
%if mmsize == 32
    vzeroupper
%endif
    ret
%endmacro

%endif ;======================================================================

%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%endmacro
%endif

%macro REP_RET 0
    %if has_epilogue
        RET
    %else
        rep ret
    %endif
%endmacro

%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
%endmacro
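
; e.g. a thin wrapper can forward to a shared implementation (foo_main is a
; hypothetical name); with no epilogue and a nonadjacent callee this becomes
; a bare jmp, otherwise a call followed by RET:
;     TAIL_CALL foo_main, 1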

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 1-2+
    %ifndef cglobaled_%1
        %xdefine %1 mangle(program_name %+ _ %+ %1)
        %xdefine %1.skip_prologue %1 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %1, 1
    %endif
    %xdefine current_function %1
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %ifnidn %2, ""
        PROLOGUE %2
    %endif
%endmacro
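
; e.g. after "INIT_XMM sse2", "cglobal foo, 2,2" emits the mangled label
; x264_foo_sse2 (assuming the default program_name and no PREFIX) and runs
; its PROLOGUE.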

%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx
%assign cpuflags_avx2     (1<<14)| cpuflags_avx
%assign cpuflags_fma3     (1<<15)| cpuflags_avx

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<22)
%assign cpuflags_bmi1     (1<<23)
%assign cpuflags_bmi2     (1<<24)|cpuflags_bmi1
%assign cpuflags_tbm      (1<<25)|cpuflags_bmi1

%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
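
; e.g. after "INIT_XMM ssse3", cpuflag(sse2) is true (the ssse3 flag set
; includes sse2) while cpuflag(sse4) is false.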

; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-2
    CPUNOP amdnop
    %if %0 >= 1
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if mmsize == 16 && notcpuflag(sse2)
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, sse3
            %define movu lddqu
        %endif
        %if notcpuflag(mmx2)
            CPUNOP basicnop
        %endif
    %else
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %endif
%endmacro

; merge mmx and sse*

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
    CAT_XDEFINE m, %%i, mm %+ %%i
    CAT_XDEFINE nmm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    %rep 8
    CAT_UNDEF m, %%i
    CAT_UNDEF nmm, %%i
    %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if ARCH_X86_64
    %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, xmm %+ %%i
    CAT_XDEFINE nxmm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if ARCH_X86_64
    %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %undef movh
    %define movnta vmovntps
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE m, %%i, ymm %+ %%i
    CAT_XDEFINE nymm, %%i, %%i
    %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine tmp%2 m%2
    %xdefine ntmp%2 nm%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 tmp%2
    %xdefine nm%1 ntmp%2
    %undef tmp%2
    %undef ntmp%2
    %rotate 2
%endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
%ifdef m%1
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%n1 n %+ %1
    %xdefine %%n2 n %+ %2
    %xdefine tmp m %+ %%n1
    CAT_XDEFINE m, %%n1, m %+ %%n2
    CAT_XDEFINE m, %%n2, tmp
    CAT_XDEFINE n, m %+ %%n1, %%n1
    CAT_XDEFINE n, m %+ %%n2, %%n2
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro
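
; e.g. "SWAP 0, 1" makes the names m0 and m1 refer to each other's registers
; from that point on; it emits no instructions.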

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
    CAT_XDEFINE %%f, %%i, m %+ %%i
    %assign %%i %%i+1
    %endrep
%endmacro
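
; e.g. ending a function body with SAVE_MM_PERMUTATION lets the call macro
; below (via LOAD_MM_PERMUTATION) restore that function's final register
; names in the caller.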

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
        CAT_XDEFINE m, %%i, %1_m %+ %%i
        CAT_XDEFINE n, m %+ %%i, %%i
        %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1 %+ SUFFIX, %1
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro
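
; e.g. "add r0, 128" assembles as "sub r0, -128": -128 fits in a
; sign-extended imm8 whereas +128 needs a full imm32, saving 3 bytes on a
; typical encoding.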

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
%assign i i+1
%endrep
%undef i

%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %ifid %6
        %define %%sizeofreg sizeof%6
    %elifid %5
        %define %%sizeofreg sizeof%5
    %else
        %define %%sizeofreg mmsize
    %endif
    %if %%sizeofreg==32
        %if %4>=3
            v%1 %5, %6, %7
        %else
            v%1 %5, %6
        %endif
    %else
        %if %%sizeofreg==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if avx_enabled && %%sizeofreg==16
                    v%1 %5, %6, %7
                %else
                    CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %4>=3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
%macro RUN_AVX_INSTR1 8
    %assign %%swap 0
    %if avx_enabled
        %ifnid %6
            %assign %%swap 1
        %endif
    %elifnidn %5, %6
        %ifnid %7
            %assign %%swap 1
        %endif
    %endif
    %if %%swap && %3 == 0 && %8 == 1
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
    %else
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 4
    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
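
; e.g. once "AVX_INSTR paddw, 0, 0, 1" below has run, "paddw m0, m1, m2"
; works everywhere: with INIT_XMM avx it emits vpaddw, and with
; INIT_XMM sse2 it is emulated as "movdqa m0, m1" + "paddw m0, m2".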

AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 1
AVX_INSTR addss, 1, 0, 1
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR blendpd, 1, 0, 0
AVX_INSTR blendps, 1, 0, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 0, 0
AVX_INSTR cmpps, 1, 0, 0
AVX_INSTR cmpsd, 1, 0, 0
AVX_INSTR cmpss, 1, 0, 0
AVX_INSTR cvtdq2ps, 1, 0, 0
AVX_INSTR cvtpd2dq, 1, 0, 0
AVX_INSTR cvtps2dq, 1, 0, 0
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 1
AVX_INSTR maxss, 1, 0, 1
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 1
AVX_INSTR minss, 1, 0, 1
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movss, 1, 0, 0
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR pabsb, 0, 0, 0
AVX_INSTR pabsw, 0, 0, 0
AVX_INSTR pabsd, 0, 0, 0
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pcmpestri, 0, 0, 0
AVX_INSTR pcmpestrm, 0, 0, 0
AVX_INSTR pcmpistri, 0, 0, 0
AVX_INSTR pcmpistrm, 0, 0, 0
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmovmskb, 0, 0, 0
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR pshufd, 0, 1, 0
AVX_INSTR pshufhw, 0, 1, 0
AVX_INSTR pshuflw, 0, 1, 0
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR ptest, 0, 0, 0
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DNow!
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
%assign i i+1
%endrep
%undef i
%undef j

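; e.g. q3120 expands to 0xD8, so "pshufd m0, m1, q3120" selects source
; dwords 3,1,2,0 (listed high to low), which is easier to read than the
; raw immediate.
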
%macro FMA_INSTR 3
    %macro %1 5-8 %1, %2, %3
        %if cpuflag(xop) || cpuflag(fma4)
            v%6 %1, %2, %3, %4
        %else
            %ifidn %1, %4
                %7 %5, %2, %3
                %8 %1, %4, %5
            %else
                %7 %1, %2, %3
                %8 %1, %4
            %endif
        %endif
    %endmacro
%endmacro

FMA_INSTR  fmaddps,   mulps, addps
FMA_INSTR  pmacsdd,  pmulld, paddd
FMA_INSTR  pmacsww,  pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
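
; e.g. "fmaddps m0, m1, m2, m3, m4" computes m0 = m1*m2 + m3: a single
; vfmaddps on XOP/FMA4 cpus, otherwise a mulps/addps pair, with m4 used as
; scratch only when the destination aliases the addend.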

; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf
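
; e.g. "tzcnt eax, ecx" assembles as "rep bsf eax, ecx"; cpus without BMI1
; ignore the rep prefix and execute plain bsf.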