fft: rename "z" to "zc" to prevent name collision.
[libav.git] / libavutil / x86 / x86inc.asm
CommitLineData
bafad220 1;*****************************************************************************
2f7f2e4b 2;* x86inc.asm: x264asm abstraction layer
bafad220 3;*****************************************************************************
729f90e2 4;* Copyright (C) 2005-2012 x264 project
bafad220 5;*
2966cc18
JGG
6;* Authors: Loren Merritt <lorenm@u.washington.edu>
7;* Anton Mitrofanov <BugMaster@narod.ru>
33cbfa6f 8;* Jason Garrett-Glaser <darkshikari@gmail.com>
729f90e2 9;* Henrik Gramner <hengar-6@student.ltu.se>
bafad220 10;*
2966cc18
JGG
11;* Permission to use, copy, modify, and/or distribute this software for any
12;* purpose with or without fee is hereby granted, provided that the above
13;* copyright notice and this permission notice appear in all copies.
bafad220 14;*
2966cc18
JGG
15;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
16;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
17;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
18;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
20;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
21;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
bafad220
LM
22;*****************************************************************************
23
2966cc18
JGG
24; This is a header file for the x264ASM assembly language, which uses
25; NASM/YASM syntax combined with a large number of macros to provide easy
26; abstraction between different calling conventions (x86_32, win64, linux64).
27; It also has various other useful features to simplify writing the kind of
28; DSP functions that are most often used in x264.
29
30; Unlike the rest of x264, this file is available under an ISC license, as it
31; has significant usefulness outside of x264 and we want it to be available
32; to the largest audience possible. Of course, if you modify it for your own
33; purposes to add a new feature, we strongly encourage contributing a patch
34; as this feature might be useful for others as well. Send patches or ideas
35; to x264-devel@videolan.org .
36
37%define program_name ff
38
3b15a6d7
RB
39%define UNIX64 0
40%define WIN64 0
41%if ARCH_X86_64
3f87f39c 42 %ifidn __OUTPUT_FORMAT__,win32
3b15a6d7 43 %define WIN64 1
166f3993
HY
44 %elifidn __OUTPUT_FORMAT__,win64
45 %define WIN64 1
3f87f39c 46 %else
3b15a6d7 47 %define UNIX64 1
3f87f39c
JA
48 %endif
49%endif
50
2966cc18
JGG
51%ifdef PREFIX
52 %define mangle(x) _ %+ x
53%else
54 %define mangle(x) x
55%endif
56
bafad220
LM
57; FIXME: All of the 64bit asm functions that take a stride as an argument
58; via register, assume that the high dword of that register is filled with 0.
59; This is true in practice (since we never do any 64bit arithmetic on strides,
60; and x264's strides are all positive), but is not guaranteed by the ABI.
61
62; Name of the .rodata section.
63; Kludge: Something on OS X fails to align .rodata even given an align attribute,
64; so use a different read-only section.
3f87f39c 65%macro SECTION_RODATA 0-1 16
bafad220 66 %ifidn __OUTPUT_FORMAT__,macho64
3f87f39c 67 SECTION .text align=%1
bafad220 68 %elifidn __OUTPUT_FORMAT__,macho
3f87f39c 69 SECTION .text align=%1
bafad220 70 fakegot:
d69f9a42
DY
71 %elifidn __OUTPUT_FORMAT__,aout
72 section .text
bafad220 73 %else
3f87f39c 74 SECTION .rodata align=%1
bafad220
LM
75 %endif
76%endmacro
77
d69f9a42
DY
78; aout does not support align=
79%macro SECTION_TEXT 0-1 16
80 %ifidn __OUTPUT_FORMAT__,aout
81 SECTION .text
82 %else
83 SECTION .text align=%1
84 %endif
85%endmacro
86
3b15a6d7 87%if WIN64
3f87f39c 88 %define PIC
412b248e 89%elif ARCH_X86_64 == 0
2966cc18
JGG
90; x86_32 doesn't require PIC.
91; Some distros prefer shared objects to be PIC, but nothing breaks if
92; the code contains a few textrels, so we'll skip that complexity.
3f87f39c
JA
93 %undef PIC
94%endif
95%ifdef PIC
2966cc18 96 default rel
bafad220
LM
97%endif
98
729f90e2
HG
99; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
100CPU amdnop
101
bafad220
LM
102; Macros to eliminate most code duplication between x86_32 and x86_64:
103; Currently this works only for leaf functions which load all their arguments
104; into registers at the start, and make no other use of the stack. Luckily that
105; covers most of x264's asm.
106
107; PROLOGUE:
108; %1 = number of arguments. loads them from stack if needed.
3f87f39c
JA
109; %2 = number of registers used. pushes callee-saved regs if needed.
110; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
bafad220
LM
111; %4 = list of names to define to registers
112; PROLOGUE can also be invoked by adding the same options to cglobal
113
114; e.g.
29e4edbb 115; cglobal foo, 2,3,0, dst, src, tmp
3f87f39c 116; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
bafad220
LM
117
118; TODO Some functions can use some args directly from the stack. If they're the
119; last args then you can just not declare them, but if they're in the middle
120; we need a more flexible macro.
121
122; RET:
2f7f2e4b 123; Pops anything that was pushed by PROLOGUE, and returns.
bafad220
LM
124
125; REP_RET:
126; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
127; which are slow when a normal ret follows a branch, on Athlons.
128
3f87f39c
JA
129; registers:
130; rN and rNq are the native-size register holding function argument N
131; rNd, rNw, rNb are dword, word, and byte size
132; rNm is the original location of arg N (a register or on the stack), dword
133; rNmp is native size
134
729f90e2 135%macro DECLARE_REG 5-6
bafad220
LM
136 %define r%1q %2
137 %define r%1d %3
138 %define r%1w %4
139 %define r%1b %5
729f90e2
HG
140 %if %0 == 5
141 %define r%1m %3
3f87f39c 142 %define r%1mp %2
3b15a6d7 143 %elif ARCH_X86_64 ; memory
729f90e2
HG
144 %define r%1m [rsp + stack_offset + %6]
145 %define r%1mp qword r %+ %1m
3f87f39c 146 %else
729f90e2
HG
147 %define r%1m [esp + stack_offset + %6]
148 %define r%1mp dword r %+ %1m
3f87f39c 149 %endif
bafad220
LM
150 %define r%1 %2
151%endmacro
152
153%macro DECLARE_REG_SIZE 2
154 %define r%1q r%1
155 %define e%1q r%1
156 %define r%1d e%1
157 %define e%1d e%1
158 %define r%1w %1
159 %define e%1w %1
160 %define r%1b %2
161 %define e%1b %2
3b15a6d7 162%if ARCH_X86_64 == 0
bafad220
LM
163 %define r%1 e%1
164%endif
165%endmacro
166
167DECLARE_REG_SIZE ax, al
168DECLARE_REG_SIZE bx, bl
169DECLARE_REG_SIZE cx, cl
170DECLARE_REG_SIZE dx, dl
171DECLARE_REG_SIZE si, sil
172DECLARE_REG_SIZE di, dil
173DECLARE_REG_SIZE bp, bpl
174
3f87f39c
JA
175; t# defines for when per-arch register allocation is more complex than just function arguments
176
177%macro DECLARE_REG_TMP 1-*
178 %assign %%i 0
179 %rep %0
180 CAT_XDEFINE t, %%i, r%1
181 %assign %%i %%i+1
182 %rotate 1
183 %endrep
184%endmacro
185
186%macro DECLARE_REG_TMP_SIZE 0-*
187 %rep %0
188 %define t%1q t%1 %+ q
189 %define t%1d t%1 %+ d
190 %define t%1w t%1 %+ w
191 %define t%1b t%1 %+ b
192 %rotate 1
193 %endrep
194%endmacro
195
729f90e2 196DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
3f87f39c 197
3b15a6d7 198%if ARCH_X86_64
bafad220
LM
199 %define gprsize 8
200%else
201 %define gprsize 4
202%endif
203
204%macro PUSH 1
205 push %1
206 %assign stack_offset stack_offset+gprsize
207%endmacro
208
209%macro POP 1
210 pop %1
211 %assign stack_offset stack_offset-gprsize
212%endmacro
213
729f90e2
HG
214%macro PUSH_IF_USED 1-*
215 %rep %0
216 %if %1 < regs_used
217 PUSH r%1
218 %endif
219 %rotate 1
220 %endrep
221%endmacro
222
223%macro POP_IF_USED 1-*
224 %rep %0
225 %if %1 < regs_used
226 pop r%1
227 %endif
228 %rotate 1
229 %endrep
230%endmacro
231
232%macro LOAD_IF_USED 1-*
233 %rep %0
234 %if %1 < num_args
235 mov r%1, r %+ %1 %+ mp
236 %endif
237 %rotate 1
238 %endrep
239%endmacro
240
bafad220
LM
241%macro SUB 2
242 sub %1, %2
243 %ifidn %1, rsp
244 %assign stack_offset stack_offset+(%2)
245 %endif
246%endmacro
247
248%macro ADD 2
249 add %1, %2
250 %ifidn %1, rsp
251 %assign stack_offset stack_offset-(%2)
252 %endif
253%endmacro
254
255%macro movifnidn 2
256 %ifnidn %1, %2
257 mov %1, %2
258 %endif
259%endmacro
260
261%macro movsxdifnidn 2
262 %ifnidn %1, %2
263 movsxd %1, %2
264 %endif
265%endmacro
266
267%macro ASSERT 1
268 %if (%1) == 0
269 %error assert failed
270 %endif
271%endmacro
272
273%macro DEFINE_ARGS 0-*
274 %ifdef n_arg_names
275 %assign %%i 0
276 %rep n_arg_names
277 CAT_UNDEF arg_name %+ %%i, q
278 CAT_UNDEF arg_name %+ %%i, d
279 CAT_UNDEF arg_name %+ %%i, w
280 CAT_UNDEF arg_name %+ %%i, b
2f77923d 281 CAT_UNDEF arg_name %+ %%i, m
98b9da2a 282 CAT_UNDEF arg_name %+ %%i, mp
bafad220
LM
283 CAT_UNDEF arg_name, %%i
284 %assign %%i %%i+1
285 %endrep
286 %endif
287
0f53d0cf
LM
288 %xdefine %%stack_offset stack_offset
289 %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
bafad220
LM
290 %assign %%i 0
291 %rep %0
292 %xdefine %1q r %+ %%i %+ q
293 %xdefine %1d r %+ %%i %+ d
294 %xdefine %1w r %+ %%i %+ w
295 %xdefine %1b r %+ %%i %+ b
2f77923d 296 %xdefine %1m r %+ %%i %+ m
98b9da2a 297 %xdefine %1mp r %+ %%i %+ mp
bafad220
LM
298 CAT_XDEFINE arg_name, %%i, %1
299 %assign %%i %%i+1
300 %rotate 1
301 %endrep
0f53d0cf
LM
302 %xdefine stack_offset %%stack_offset
303 %assign n_arg_names %0
bafad220
LM
304%endmacro
305
3b15a6d7 306%if WIN64 ; Windows x64 ;=================================================
bafad220 307
729f90e2
HG
308DECLARE_REG 0, rcx, ecx, cx, cl
309DECLARE_REG 1, rdx, edx, dx, dl
310DECLARE_REG 2, R8, R8D, R8W, R8B
311DECLARE_REG 3, R9, R9D, R9W, R9B
312DECLARE_REG 4, R10, R10D, R10W, R10B, 40
313DECLARE_REG 5, R11, R11D, R11W, R11B, 48
314DECLARE_REG 6, rax, eax, ax, al, 56
315DECLARE_REG 7, rdi, edi, di, dil, 64
316DECLARE_REG 8, rsi, esi, si, sil, 72
317DECLARE_REG 9, rbx, ebx, bx, bl, 80
318DECLARE_REG 10, rbp, ebp, bp, bpl, 88
319DECLARE_REG 11, R12, R12D, R12W, R12B, 96
320DECLARE_REG 12, R13, R13D, R13W, R13B, 104
321DECLARE_REG 13, R14, R14D, R14W, R14B, 112
322DECLARE_REG 14, R15, R15D, R15W, R15B, 120
3f87f39c 323
2966cc18 324%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
729f90e2 325 %assign num_args %1
3f87f39c 326 %assign regs_used %2
729f90e2
HG
327 ASSERT regs_used >= num_args
328 ASSERT regs_used <= 15
329 PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
9cf73853
HG
330 %if mmsize == 8
331 %assign xmm_regs_used 0
332 %else
333 WIN64_SPILL_XMM %3
334 %endif
729f90e2 335 LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
532e7697
LM
336 DEFINE_ARGS %4
337%endmacro
338
339%macro WIN64_SPILL_XMM 1
340 %assign xmm_regs_used %1
341 ASSERT xmm_regs_used <= 16
3f87f39c 342 %if xmm_regs_used > 6
729f90e2 343 SUB rsp, (xmm_regs_used-6)*16+16
3f87f39c
JA
344 %assign %%i xmm_regs_used
345 %rep (xmm_regs_used-6)
346 %assign %%i %%i-1
729f90e2 347 movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
3f87f39c
JA
348 %endrep
349 %endif
3f87f39c
JA
350%endmacro
351
532e7697 352%macro WIN64_RESTORE_XMM_INTERNAL 1
3f87f39c
JA
353 %if xmm_regs_used > 6
354 %assign %%i xmm_regs_used
355 %rep (xmm_regs_used-6)
356 %assign %%i %%i-1
729f90e2 357 movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
3f87f39c
JA
358 %endrep
359 add %1, (xmm_regs_used-6)*16+16
360 %endif
361%endmacro
362
532e7697
LM
363%macro WIN64_RESTORE_XMM 1
364 WIN64_RESTORE_XMM_INTERNAL %1
3f87f39c
JA
365 %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
366 %assign xmm_regs_used 0
367%endmacro
368
369%macro RET 0
532e7697 370 WIN64_RESTORE_XMM_INTERNAL rsp
729f90e2 371 POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
30b45d9c
RB
372%if mmsize == 32
373 vzeroupper
374%endif
3f87f39c 375 ret
bafad220
LM
376%endmacro
377
3f87f39c 378%macro REP_RET 0
30b45d9c 379 %if regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
3f87f39c
JA
380 RET
381 %else
382 rep ret
383 %endif
384%endmacro
385
3b15a6d7 386%elif ARCH_X86_64 ; *nix x64 ;=============================================
bafad220 387
729f90e2
HG
388DECLARE_REG 0, rdi, edi, di, dil
389DECLARE_REG 1, rsi, esi, si, sil
390DECLARE_REG 2, rdx, edx, dx, dl
391DECLARE_REG 3, rcx, ecx, cx, cl
392DECLARE_REG 4, R8, R8D, R8W, R8B
393DECLARE_REG 5, R9, R9D, R9W, R9B
394DECLARE_REG 6, rax, eax, ax, al, 8
395DECLARE_REG 7, R10, R10D, R10W, R10B, 16
396DECLARE_REG 8, R11, R11D, R11W, R11B, 24
397DECLARE_REG 9, rbx, ebx, bx, bl, 32
398DECLARE_REG 10, rbp, ebp, bp, bpl, 40
399DECLARE_REG 11, R12, R12D, R12W, R12B, 48
400DECLARE_REG 12, R13, R13D, R13W, R13B, 56
401DECLARE_REG 13, R14, R14D, R14W, R14B, 64
402DECLARE_REG 14, R15, R15D, R15W, R15B, 72
bafad220 403
3f87f39c 404%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
729f90e2
HG
405 %assign num_args %1
406 %assign regs_used %2
407 ASSERT regs_used >= num_args
408 ASSERT regs_used <= 15
409 PUSH_IF_USED 9, 10, 11, 12, 13, 14
410 LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
bafad220
LM
411 DEFINE_ARGS %4
412%endmacro
413
414%macro RET 0
729f90e2 415 POP_IF_USED 14, 13, 12, 11, 10, 9
30b45d9c
RB
416%if mmsize == 32
417 vzeroupper
418%endif
bafad220
LM
419 ret
420%endmacro
421
422%macro REP_RET 0
30b45d9c 423 %if regs_used > 9 || mmsize == 32
729f90e2
HG
424 RET
425 %else
426 rep ret
427 %endif
bafad220
LM
428%endmacro
429
430%else ; X86_32 ;==============================================================
431
729f90e2
HG
432DECLARE_REG 0, eax, eax, ax, al, 4
433DECLARE_REG 1, ecx, ecx, cx, cl, 8
434DECLARE_REG 2, edx, edx, dx, dl, 12
435DECLARE_REG 3, ebx, ebx, bx, bl, 16
436DECLARE_REG 4, esi, esi, si, null, 20
437DECLARE_REG 5, edi, edi, di, null, 24
438DECLARE_REG 6, ebp, ebp, bp, null, 28
bafad220
LM
439%define rsp esp
440
729f90e2
HG
441%macro DECLARE_ARG 1-*
442 %rep %0
443 %define r%1m [esp + stack_offset + 4*%1 + 4]
444 %define r%1mp dword r%1m
445 %rotate 1
446 %endrep
bafad220
LM
447%endmacro
448
729f90e2 449DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
bafad220 450
2966cc18 451%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
729f90e2 452 %assign num_args %1
bafad220 453 %assign regs_used %2
729f90e2
HG
454 %if regs_used > 7
455 %assign regs_used 7
456 %endif
457 ASSERT regs_used >= num_args
458 PUSH_IF_USED 3, 4, 5, 6
459 LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
bafad220
LM
460 DEFINE_ARGS %4
461%endmacro
462
463%macro RET 0
729f90e2 464 POP_IF_USED 6, 5, 4, 3
30b45d9c
RB
465%if mmsize == 32
466 vzeroupper
467%endif
bafad220
LM
468 ret
469%endmacro
470
471%macro REP_RET 0
30b45d9c 472 %if regs_used > 3 || mmsize == 32
bafad220
LM
473 RET
474 %else
475 rep ret
476 %endif
477%endmacro
478
479%endif ;======================================================================
480
3b15a6d7 481%if WIN64 == 0
532e7697
LM
482%macro WIN64_SPILL_XMM 1
483%endmacro
484%macro WIN64_RESTORE_XMM 1
485%endmacro
486%endif
487
bafad220
LM
488;=============================================================================
489; arch-independent part
490;=============================================================================
491
492%assign function_align 16
493
2f7f2e4b
LM
494; Begin a function.
495; Applies any symbol mangling needed for C linkage, and sets up a define such that
496; subsequent uses of the function name automatically refer to the mangled version.
497; Appends cpuflags to the function name if cpuflags has been specified.
498%macro cglobal 1-2+ ; name, [PROLOGUE args]
499%if %0 == 1
500 cglobal_internal %1 %+ SUFFIX
501%else
502 cglobal_internal %1 %+ SUFFIX, %2
503%endif
504%endmacro
505%macro cglobal_internal 1-2+
506 %ifndef cglobaled_%1
507 %xdefine %1 mangle(program_name %+ _ %+ %1)
508 %xdefine %1.skip_prologue %1 %+ .skip_prologue
509 CAT_XDEFINE cglobaled_, %1, 1
510 %endif
511 %xdefine current_function %1
bafad220 512 %ifidn __OUTPUT_FORMAT__,elf
40c7d0ae 513 global %1:function hidden
bafad220 514 %else
40c7d0ae 515 global %1
bafad220
LM
516 %endif
517 align function_align
518 %1:
519 RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
3f87f39c 520 %assign stack_offset 0
bafad220
LM
521 %if %0 > 1
522 PROLOGUE %2
523 %endif
524%endmacro
525
526%macro cextern 1
2966cc18 527 %xdefine %1 mangle(program_name %+ _ %+ %1)
2f7f2e4b 528 CAT_XDEFINE cglobaled_, %1, 1
2966cc18
JGG
529 extern %1
530%endmacro
531
2f7f2e4b 532; like cextern, but without the prefix
2966cc18
JGG
533%macro cextern_naked 1
534 %xdefine %1 mangle(%1)
2f7f2e4b 535 CAT_XDEFINE cglobaled_, %1, 1
3f87f39c 536 extern %1
bafad220
LM
537%endmacro
538
2966cc18
JGG
539%macro const 2+
540 %xdefine %1 mangle(program_name %+ _ %+ %1)
541 global %1
542 %1: %2
543%endmacro
544
bafad220
LM
545; This is needed for ELF, otherwise the GNU linker assumes the stack is
546; executable by default.
547%ifidn __OUTPUT_FORMAT__,elf
548SECTION .note.GNU-stack noalloc noexec nowrite progbits
549%endif
550
2f7f2e4b
LM
551; cpuflags
552
553%assign cpuflags_mmx (1<<0)
554%assign cpuflags_mmx2 (1<<1) | cpuflags_mmx
555%assign cpuflags_3dnow (1<<2) | cpuflags_mmx
556%assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow
557%assign cpuflags_sse (1<<4) | cpuflags_mmx2
558%assign cpuflags_sse2 (1<<5) | cpuflags_sse
559%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
560%assign cpuflags_sse3 (1<<7) | cpuflags_sse2
561%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3
562%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3
563%assign cpuflags_sse42 (1<<10)| cpuflags_sse4
564%assign cpuflags_avx (1<<11)| cpuflags_sse42
565%assign cpuflags_xop (1<<12)| cpuflags_avx
566%assign cpuflags_fma4 (1<<13)| cpuflags_avx
567
568%assign cpuflags_cache32 (1<<16)
569%assign cpuflags_cache64 (1<<17)
570%assign cpuflags_slowctz (1<<18)
571%assign cpuflags_lzcnt (1<<19)
572%assign cpuflags_misalign (1<<20)
573%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
574%assign cpuflags_atom (1<<22)
575
576%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
577%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
578
579; Takes up to 2 cpuflags from the above list.
580; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
581; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
582%macro INIT_CPUFLAGS 0-2
2cd1f5ca 583 CPU amdnop
2f7f2e4b
LM
584 %if %0 >= 1
585 %xdefine cpuname %1
586 %assign cpuflags cpuflags_%1
587 %if %0 >= 2
588 %xdefine cpuname %1_%2
589 %assign cpuflags cpuflags | cpuflags_%2
590 %endif
591 %xdefine SUFFIX _ %+ cpuname
592 %if cpuflag(avx)
593 %assign avx_enabled 1
594 %endif
f2bd8a07
JR
595 %if mmsize == 16 && notcpuflag(sse2)
596 %define mova movaps
597 %define movu movups
598 %define movnta movntps
599 %endif
2f7f2e4b
LM
600 %if cpuflag(aligned)
601 %define movu mova
602 %elifidn %1, sse3
603 %define movu lddqu
604 %endif
2cd1f5ca
LM
605 %if notcpuflag(mmx2)
606 CPU basicnop
607 %endif
2f7f2e4b
LM
608 %else
609 %xdefine SUFFIX
610 %undef cpuname
611 %undef cpuflags
612 %endif
613%endmacro
614
bafad220
LM
615; merge mmx and sse*
616
617%macro CAT_XDEFINE 3
618 %xdefine %1%2 %3
619%endmacro
620
621%macro CAT_UNDEF 2
622 %undef %1%2
623%endmacro
624
2f7f2e4b 625%macro INIT_MMX 0-1+
33cbfa6f 626 %assign avx_enabled 0
2f7f2e4b 627 %define RESET_MM_PERMUTATION INIT_MMX %1
bafad220
LM
628 %define mmsize 8
629 %define num_mmregs 8
630 %define mova movq
631 %define movu movq
632 %define movh movd
532e7697 633 %define movnta movntq
bafad220
LM
634 %assign %%i 0
635 %rep 8
636 CAT_XDEFINE m, %%i, mm %+ %%i
637 CAT_XDEFINE nmm, %%i, %%i
638 %assign %%i %%i+1
639 %endrep
640 %rep 8
641 CAT_UNDEF m, %%i
642 CAT_UNDEF nmm, %%i
643 %assign %%i %%i+1
644 %endrep
2f7f2e4b 645 INIT_CPUFLAGS %1
bafad220
LM
646%endmacro
647
2f7f2e4b 648%macro INIT_XMM 0-1+
33cbfa6f 649 %assign avx_enabled 0
2f7f2e4b 650 %define RESET_MM_PERMUTATION INIT_XMM %1
bafad220
LM
651 %define mmsize 16
652 %define num_mmregs 8
3b15a6d7 653 %if ARCH_X86_64
bafad220
LM
654 %define num_mmregs 16
655 %endif
656 %define mova movdqa
657 %define movu movdqu
658 %define movh movq
532e7697 659 %define movnta movntdq
bafad220
LM
660 %assign %%i 0
661 %rep num_mmregs
662 CAT_XDEFINE m, %%i, xmm %+ %%i
663 CAT_XDEFINE nxmm, %%i, %%i
664 %assign %%i %%i+1
665 %endrep
2f7f2e4b 666 INIT_CPUFLAGS %1
bafad220
LM
667%endmacro
668
2f7f2e4b 669; FIXME: INIT_AVX can be replaced by INIT_XMM avx
33cbfa6f
VS
670%macro INIT_AVX 0
671 INIT_XMM
672 %assign avx_enabled 1
673 %define PALIGNR PALIGNR_SSSE3
674 %define RESET_MM_PERMUTATION INIT_AVX
675%endmacro
676
2f7f2e4b 677%macro INIT_YMM 0-1+
33cbfa6f 678 %assign avx_enabled 1
2f7f2e4b 679 %define RESET_MM_PERMUTATION INIT_YMM %1
33cbfa6f
VS
680 %define mmsize 32
681 %define num_mmregs 8
3b15a6d7 682 %if ARCH_X86_64
33cbfa6f
VS
683 %define num_mmregs 16
684 %endif
685 %define mova vmovaps
686 %define movu vmovups
2f7f2e4b
LM
687 %undef movh
688 %define movnta vmovntps
33cbfa6f
VS
689 %assign %%i 0
690 %rep num_mmregs
691 CAT_XDEFINE m, %%i, ymm %+ %%i
692 CAT_XDEFINE nymm, %%i, %%i
693 %assign %%i %%i+1
694 %endrep
2f7f2e4b 695 INIT_CPUFLAGS %1
33cbfa6f
VS
696%endmacro
697
2f7f2e4b 698INIT_XMM
bafad220
LM
699
700; I often want to use macros that permute their arguments. e.g. there's no
701; efficient way to implement butterfly or transpose or dct without swapping some
702; arguments.
703;
704; I would like to not have to manually keep track of the permutations:
705; If I insert a permutation in the middle of a function, it should automatically
706; change everything that follows. For more complex macros I may also have multiple
707; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
708;
709; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
710; permutes its arguments. It's equivalent to exchanging the contents of the
711; registers, except that this way you exchange the register names instead, so it
712; doesn't cost any cycles.
713
714%macro PERMUTE 2-* ; takes a list of pairs to swap
715%rep %0/2
716 %xdefine tmp%2 m%2
717 %xdefine ntmp%2 nm%2
718 %rotate 2
719%endrep
720%rep %0/2
721 %xdefine m%1 tmp%2
722 %xdefine nm%1 ntmp%2
723 %undef tmp%2
724 %undef ntmp%2
725 %rotate 2
726%endrep
727%endmacro
728
729%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
730%rep %0-1
731%ifdef m%1
732 %xdefine tmp m%1
733 %xdefine m%1 m%2
734 %xdefine m%2 tmp
735 CAT_XDEFINE n, m%1, %1
736 CAT_XDEFINE n, m%2, %2
737%else
738 ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
739 ; Be careful using this mode in nested macros though, as in some cases there may be
740 ; other copies of m# that have already been dereferenced and don't get updated correctly.
741 %xdefine %%n1 n %+ %1
742 %xdefine %%n2 n %+ %2
743 %xdefine tmp m %+ %%n1
744 CAT_XDEFINE m, %%n1, m %+ %%n2
745 CAT_XDEFINE m, %%n2, tmp
746 CAT_XDEFINE n, m %+ %%n1, %%n1
747 CAT_XDEFINE n, m %+ %%n2, %%n2
748%endif
749 %undef tmp
750 %rotate 1
751%endrep
752%endmacro
753
2f7f2e4b
LM
754; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
755; calls to that function will automatically load the permutation, so values can
756; be returned in mmregs.
757%macro SAVE_MM_PERMUTATION 0-1
758 %if %0
759 %xdefine %%f %1_m
760 %else
761 %xdefine %%f current_function %+ _m
762 %endif
bafad220
LM
763 %assign %%i 0
764 %rep num_mmregs
2f7f2e4b 765 CAT_XDEFINE %%f, %%i, m %+ %%i
bafad220
LM
766 %assign %%i %%i+1
767 %endrep
768%endmacro
769
2966cc18 770%macro LOAD_MM_PERMUTATION 1 ; name to load from
2f7f2e4b
LM
771 %ifdef %1_m0
772 %assign %%i 0
773 %rep num_mmregs
774 CAT_XDEFINE m, %%i, %1_m %+ %%i
775 CAT_XDEFINE n, m %+ %%i, %%i
776 %assign %%i %%i+1
777 %endrep
778 %endif
bafad220
LM
779%endmacro
780
2f7f2e4b 781; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
bafad220 782%macro call 1
2f7f2e4b
LM
783 call_internal %1, %1 %+ SUFFIX
784%endmacro
785%macro call_internal 2
786 %xdefine %%i %1
787 %ifndef cglobaled_%1
788 %ifdef cglobaled_%2
789 %xdefine %%i %2
790 %endif
bafad220 791 %endif
2f7f2e4b
LM
792 call %%i
793 LOAD_MM_PERMUTATION %%i
bafad220
LM
794%endmacro
795
2966cc18 796; Substitutions that reduce instruction size but are functionally equivalent
3f87f39c
JA
797%macro add 2
798 %ifnum %2
799 %if %2==128
800 sub %1, -128
801 %else
802 add %1, %2
803 %endif
804 %else
805 add %1, %2
806 %endif
807%endmacro
808
809%macro sub 2
810 %ifnum %2
811 %if %2==128
812 add %1, -128
813 %else
814 sub %1, %2
815 %endif
816 %else
817 sub %1, %2
818 %endif
819%endmacro
33cbfa6f
VS
820
821;=============================================================================
822; AVX abstraction layer
823;=============================================================================
824
825%assign i 0
826%rep 16
827 %if i < 8
828 CAT_XDEFINE sizeofmm, i, 8
829 %endif
830 CAT_XDEFINE sizeofxmm, i, 16
831 CAT_XDEFINE sizeofymm, i, 32
832%assign i i+1
833%endrep
834%undef i
835
836;%1 == instruction
837;%2 == 1 if float, 0 if int
705f3d47 838;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
33cbfa6f
VS
839;%4 == number of operands given
840;%5+: operands
841%macro RUN_AVX_INSTR 6-7+
2f7f2e4b
LM
842 %ifid %5
843 %define %%size sizeof%5
844 %else
845 %define %%size mmsize
846 %endif
847 %if %%size==32
705f3d47
LM
848 %if %0 >= 7
849 v%1 %5, %6, %7
850 %else
851 v%1 %5, %6
852 %endif
33cbfa6f 853 %else
2f7f2e4b 854 %if %%size==8
33cbfa6f
VS
855 %define %%regmov movq
856 %elif %2
857 %define %%regmov movaps
858 %else
859 %define %%regmov movdqa
860 %endif
861
862 %if %4>=3+%3
863 %ifnidn %5, %6
864 %if avx_enabled && sizeof%5==16
865 v%1 %5, %6, %7
866 %else
867 %%regmov %5, %6
868 %1 %5, %7
869 %endif
870 %else
871 %1 %5, %7
872 %endif
873 %elif %3
874 %1 %5, %6, %7
875 %else
876 %1 %5, %6
877 %endif
878 %endif
879%endmacro
880
2f7f2e4b
LM
881; 3arg AVX ops with a memory arg can only have it in src2,
882; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
883; So, if the op is symmetric and the wrong one is memory, swap them.
884%macro RUN_AVX_INSTR1 8
885 %assign %%swap 0
886 %if avx_enabled
887 %ifnid %6
888 %assign %%swap 1
889 %endif
890 %elifnidn %5, %6
891 %ifnid %7
892 %assign %%swap 1
893 %endif
894 %endif
895 %if %%swap && %3 == 0 && %8 == 1
896 RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
897 %else
898 RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
899 %endif
900%endmacro
901
33cbfa6f
VS
902;%1 == instruction
903;%2 == 1 if float, 0 if int
2f7f2e4b
LM
904;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm)
905;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
906%macro AVX_INSTR 4
907 %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
33cbfa6f
VS
908 %ifidn %3, fnord
909 RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
910 %elifidn %4, fnord
2f7f2e4b 911 RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
33cbfa6f
VS
912 %elifidn %5, fnord
913 RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
914 %else
915 RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
916 %endif
917 %endmacro
918%endmacro
919
2f7f2e4b
LM
920AVX_INSTR addpd, 1, 0, 1
921AVX_INSTR addps, 1, 0, 1
922AVX_INSTR addsd, 1, 0, 1
923AVX_INSTR addss, 1, 0, 1
924AVX_INSTR addsubpd, 1, 0, 0
925AVX_INSTR addsubps, 1, 0, 0
926AVX_INSTR andpd, 1, 0, 1
927AVX_INSTR andps, 1, 0, 1
928AVX_INSTR andnpd, 1, 0, 0
929AVX_INSTR andnps, 1, 0, 0
930AVX_INSTR blendpd, 1, 0, 0
931AVX_INSTR blendps, 1, 0, 0
932AVX_INSTR blendvpd, 1, 0, 0
933AVX_INSTR blendvps, 1, 0, 0
934AVX_INSTR cmppd, 1, 0, 0
935AVX_INSTR cmpps, 1, 0, 0
936AVX_INSTR cmpsd, 1, 0, 0
937AVX_INSTR cmpss, 1, 0, 0
705f3d47
LM
938AVX_INSTR cvtdq2ps, 1, 0, 0
939AVX_INSTR cvtps2dq, 1, 0, 0
2f7f2e4b
LM
940AVX_INSTR divpd, 1, 0, 0
941AVX_INSTR divps, 1, 0, 0
942AVX_INSTR divsd, 1, 0, 0
943AVX_INSTR divss, 1, 0, 0
944AVX_INSTR dppd, 1, 1, 0
945AVX_INSTR dpps, 1, 1, 0
946AVX_INSTR haddpd, 1, 0, 0
947AVX_INSTR haddps, 1, 0, 0
948AVX_INSTR hsubpd, 1, 0, 0
949AVX_INSTR hsubps, 1, 0, 0
950AVX_INSTR maxpd, 1, 0, 1
951AVX_INSTR maxps, 1, 0, 1
952AVX_INSTR maxsd, 1, 0, 1
953AVX_INSTR maxss, 1, 0, 1
954AVX_INSTR minpd, 1, 0, 1
955AVX_INSTR minps, 1, 0, 1
956AVX_INSTR minsd, 1, 0, 1
957AVX_INSTR minss, 1, 0, 1
39df0c43
VS
958AVX_INSTR movhlps, 1, 0, 0
959AVX_INSTR movlhps, 1, 0, 0
2f7f2e4b
LM
960AVX_INSTR movsd, 1, 0, 0
961AVX_INSTR movss, 1, 0, 0
962AVX_INSTR mpsadbw, 0, 1, 0
963AVX_INSTR mulpd, 1, 0, 1
964AVX_INSTR mulps, 1, 0, 1
965AVX_INSTR mulsd, 1, 0, 1
966AVX_INSTR mulss, 1, 0, 1
967AVX_INSTR orpd, 1, 0, 1
968AVX_INSTR orps, 1, 0, 1
969AVX_INSTR packsswb, 0, 0, 0
970AVX_INSTR packssdw, 0, 0, 0
971AVX_INSTR packuswb, 0, 0, 0
972AVX_INSTR packusdw, 0, 0, 0
973AVX_INSTR paddb, 0, 0, 1
974AVX_INSTR paddw, 0, 0, 1
975AVX_INSTR paddd, 0, 0, 1
976AVX_INSTR paddq, 0, 0, 1
977AVX_INSTR paddsb, 0, 0, 1
978AVX_INSTR paddsw, 0, 0, 1
979AVX_INSTR paddusb, 0, 0, 1
980AVX_INSTR paddusw, 0, 0, 1
981AVX_INSTR palignr, 0, 1, 0
982AVX_INSTR pand, 0, 0, 1
983AVX_INSTR pandn, 0, 0, 0
984AVX_INSTR pavgb, 0, 0, 1
985AVX_INSTR pavgw, 0, 0, 1
986AVX_INSTR pblendvb, 0, 0, 0
987AVX_INSTR pblendw, 0, 1, 0
988AVX_INSTR pcmpestri, 0, 0, 0
989AVX_INSTR pcmpestrm, 0, 0, 0
990AVX_INSTR pcmpistri, 0, 0, 0
991AVX_INSTR pcmpistrm, 0, 0, 0
992AVX_INSTR pcmpeqb, 0, 0, 1
993AVX_INSTR pcmpeqw, 0, 0, 1
994AVX_INSTR pcmpeqd, 0, 0, 1
995AVX_INSTR pcmpeqq, 0, 0, 1
996AVX_INSTR pcmpgtb, 0, 0, 0
997AVX_INSTR pcmpgtw, 0, 0, 0
998AVX_INSTR pcmpgtd, 0, 0, 0
999AVX_INSTR pcmpgtq, 0, 0, 0
1000AVX_INSTR phaddw, 0, 0, 0
1001AVX_INSTR phaddd, 0, 0, 0
1002AVX_INSTR phaddsw, 0, 0, 0
1003AVX_INSTR phsubw, 0, 0, 0
1004AVX_INSTR phsubd, 0, 0, 0
1005AVX_INSTR phsubsw, 0, 0, 0
1006AVX_INSTR pmaddwd, 0, 0, 1
1007AVX_INSTR pmaddubsw, 0, 0, 0
1008AVX_INSTR pmaxsb, 0, 0, 1
1009AVX_INSTR pmaxsw, 0, 0, 1
1010AVX_INSTR pmaxsd, 0, 0, 1
1011AVX_INSTR pmaxub, 0, 0, 1
1012AVX_INSTR pmaxuw, 0, 0, 1
1013AVX_INSTR pmaxud, 0, 0, 1
1014AVX_INSTR pminsb, 0, 0, 1
1015AVX_INSTR pminsw, 0, 0, 1
1016AVX_INSTR pminsd, 0, 0, 1
1017AVX_INSTR pminub, 0, 0, 1
1018AVX_INSTR pminuw, 0, 0, 1
1019AVX_INSTR pminud, 0, 0, 1
1020AVX_INSTR pmulhuw, 0, 0, 1
1021AVX_INSTR pmulhrsw, 0, 0, 1
1022AVX_INSTR pmulhw, 0, 0, 1
1023AVX_INSTR pmullw, 0, 0, 1
1024AVX_INSTR pmulld, 0, 0, 1
1025AVX_INSTR pmuludq, 0, 0, 1
1026AVX_INSTR pmuldq, 0, 0, 1
1027AVX_INSTR por, 0, 0, 1
1028AVX_INSTR psadbw, 0, 0, 1
1029AVX_INSTR pshufb, 0, 0, 0
1030AVX_INSTR psignb, 0, 0, 0
1031AVX_INSTR psignw, 0, 0, 0
1032AVX_INSTR psignd, 0, 0, 0
1033AVX_INSTR psllw, 0, 0, 0
1034AVX_INSTR pslld, 0, 0, 0
1035AVX_INSTR psllq, 0, 0, 0
1036AVX_INSTR pslldq, 0, 0, 0
1037AVX_INSTR psraw, 0, 0, 0
1038AVX_INSTR psrad, 0, 0, 0
1039AVX_INSTR psrlw, 0, 0, 0
1040AVX_INSTR psrld, 0, 0, 0
1041AVX_INSTR psrlq, 0, 0, 0
1042AVX_INSTR psrldq, 0, 0, 0
1043AVX_INSTR psubb, 0, 0, 0
1044AVX_INSTR psubw, 0, 0, 0
1045AVX_INSTR psubd, 0, 0, 0
1046AVX_INSTR psubq, 0, 0, 0
1047AVX_INSTR psubsb, 0, 0, 0
1048AVX_INSTR psubsw, 0, 0, 0
1049AVX_INSTR psubusb, 0, 0, 0
1050AVX_INSTR psubusw, 0, 0, 0
1051AVX_INSTR punpckhbw, 0, 0, 0
1052AVX_INSTR punpckhwd, 0, 0, 0
1053AVX_INSTR punpckhdq, 0, 0, 0
1054AVX_INSTR punpckhqdq, 0, 0, 0
1055AVX_INSTR punpcklbw, 0, 0, 0
1056AVX_INSTR punpcklwd, 0, 0, 0
1057AVX_INSTR punpckldq, 0, 0, 0
1058AVX_INSTR punpcklqdq, 0, 0, 0
1059AVX_INSTR pxor, 0, 0, 1
6b6ee582 1060AVX_INSTR shufps, 1, 1, 0
2f7f2e4b
LM
1061AVX_INSTR subpd, 1, 0, 0
1062AVX_INSTR subps, 1, 0, 0
1063AVX_INSTR subsd, 1, 0, 0
1064AVX_INSTR subss, 1, 0, 0
1065AVX_INSTR unpckhpd, 1, 0, 0
1066AVX_INSTR unpckhps, 1, 0, 0
1067AVX_INSTR unpcklpd, 1, 0, 0
1068AVX_INSTR unpcklps, 1, 0, 0
1069AVX_INSTR xorpd, 1, 0, 1
1070AVX_INSTR xorps, 1, 0, 1
33cbfa6f
VS
1071
1072; 3DNow instructions, for sharing code between AVX, SSE and 3DNow
2f7f2e4b
LM
1073AVX_INSTR pfadd, 1, 0, 1
1074AVX_INSTR pfsub, 1, 0, 0
1075AVX_INSTR pfmul, 1, 0, 1
1076
1077; base-4 constants for shuffles
1078%assign i 0
1079%rep 256
1080 %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
1081 %if j < 10
1082 CAT_XDEFINE q000, j, i
1083 %elif j < 100
1084 CAT_XDEFINE q00, j, i
1085 %elif j < 1000
1086 CAT_XDEFINE q0, j, i
1087 %else
1088 CAT_XDEFINE q, j, i
1089 %endif
1090%assign i i+1
1091%endrep
1092%undef i
1093%undef j
1094
1095%macro FMA_INSTR 3
79687079
JR
1096 %macro %1 5-8 %1, %2, %3
1097 %if cpuflag(xop) || cpuflag(fma4)
1098 v%6 %1, %2, %3, %4
2f7f2e4b 1099 %else
79687079
JR
1100 %ifidn %1, %4
1101 %7 %5, %2, %3
1102 %8 %1, %4, %5
1103 %else
1104 %7 %1, %2, %3
1105 %8 %1, %4
1106 %endif
2f7f2e4b
LM
1107 %endif
1108 %endmacro
1109%endmacro
1110
79687079 1111FMA_INSTR fmaddps, mulps, addps
2f7f2e4b
LM
1112FMA_INSTR pmacsdd, pmulld, paddd
1113FMA_INSTR pmacsww, pmullw, paddw
1114FMA_INSTR pmadcswd, pmaddwd, paddd