;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Henrik Gramner <hengar-6@student.ltu.se>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .

%ifndef program_name
    %define program_name x264
%endif

%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
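; Usage sketch (illustrative; pw_64 is a made-up constant name):
;     SECTION_RODATA
;     pw_64: times 8 dw 64
; puts pw_64 in a 16-byte-aligned read-only section (or the substitute
; section chosen above on platforms where .rodata is problematic).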

; aout does not support align=
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

%macro CPUNOP 1
    %if HAVE_CPUNOP
        CPU %1
    %endif
%endmacro

; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
CPUNOP amdnop

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for Athlons
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size
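; e.g. (illustrative) in a function declared with "PROLOGUE 3,4":
;     mov   r0d, r2d    ; dword views of args 0 and 2
;     movzx r1d, r2b    ; low byte of arg 2
;     mov   r3, r2mp    ; native-size reload of arg 2 from its original location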

%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %define %2q %2
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rsp + stack_offset + %3]
        %define r%1mp qword r %+ %1 %+ m
    %else
        %define r%1m [esp + stack_offset + %3]
        %define r%1mp dword r %+ %1 %+ m
    %endif
    %define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro
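; e.g. (illustrative): "DECLARE_REG_TMP 2,0,1" aliases t0 to r2, t1 to r0 and
; t2 to r1, so shared code can be written in terms of t# while each arch
; supplies its own mapping.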

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro
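; Sketch of why these wrappers exist (illustrative): stack-argument defines
; are rsp-relative, so any rsp adjustment must be mirrored in stack_offset
; for them to keep resolving correctly:
;     SUB rsp, 32      ; stack_offset += 32
;     mov r0, r7mp     ; arg 7 lives on the stack on all three ABIs
;     ADD rsp, 32      ; stack_offset -= 32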

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    %if mmsize == 8
        %assign xmm_regs_used 0
    %else
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        SUB rsp, (xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
        %endrep
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro

%define has_epilogue regs_used > 9 || mmsize == 32

%macro RET 0
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [esp + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    %if num_args > 7
        %assign num_args 7
    %endif
    %if regs_used > 7
        %assign regs_used 7
    %endif
    ASSERT regs_used >= num_args
    PUSH_IF_USED 3, 4, 5, 6
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS %4
%endmacro

%define has_epilogue regs_used > 3 || mmsize == 32

%macro RET 0
    POP_IF_USED 6, 5, 4, 3
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro

%endif ;======================================================================

%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1
    %endmacro
    %macro WIN64_RESTORE_XMM 1
    %endmacro
%endif

%macro REP_RET 0
    %if has_epilogue
        RET
    %else
        rep ret
    %endif
%endmacro

%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
%endmacro

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
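; e.g. (illustrative): with the default program_name and no PREFIX, after
; "INIT_XMM sse2",
;     cglobal foo, 2,2
; emits the symbol x264_foo_sse2, and a later "call foo" resolves to it via
; the call macro defined further down.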
%macro cglobal 1-2+ "" ; name, [PROLOGUE args]
    cglobal_internal %1 %+ SUFFIX, %2
%endmacro
%macro cglobal_internal 1-2+
    %ifndef cglobaled_%1
        %xdefine %1 mangle(program_name %+ _ %+ %1)
        %xdefine %1.skip_prologue %1 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %1, 1
    %endif
    %xdefine current_function %1
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
%1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %ifnidn %2, ""
        PROLOGUE %2
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
%1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnowext (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx
%assign cpuflags_avx2     (1<<14)| cpuflags_avx
%assign cpuflags_fma3     (1<<15)| cpuflags_avx

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<22)
%assign cpuflags_bmi1     (1<<23)
%assign cpuflags_bmi2     (1<<24)|cpuflags_bmi1
%assign cpuflags_tbm      (1<<25)|cpuflags_bmi1

%define    cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
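; e.g. (illustrative): in a function built with "INIT_XMM ssse3", the flag
; hierarchy above also makes cpuflag(sse2) true:
;     %if cpuflag(ssse3)
;         pshufb m0, m1
;     %endif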

; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-2
    CPUNOP amdnop
    %if %0 >= 1
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if mmsize == 16 && notcpuflag(sse2)
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, sse3
            %define movu lddqu
        %endif
        %if notcpuflag(mmx2)
            CPUNOP basicnop
        %endif
    %else
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %endif
%endmacro

; merge mmx and sse*

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nmm, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %undef movh
    %define movnta vmovntps
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
    %rep %0/2
        %xdefine tmp%2 m%2
        %xdefine ntmp%2 nm%2
        %rotate 2
    %endrep
    %rep %0/2
        %xdefine m%1 tmp%2
        %xdefine nm%1 ntmp%2
        %undef tmp%2
        %undef ntmp%2
        %rotate 2
    %endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
    %rep %0-1
        %ifdef m%1
            %xdefine tmp m%1
            %xdefine m%1 m%2
            %xdefine m%2 tmp
            CAT_XDEFINE n, m%1, %1
            CAT_XDEFINE n, m%2, %2
        %else
            ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
            ; Be careful using this mode in nested macros though, as in some cases there may be
            ; other copies of m# that have already been dereferenced and don't get updated correctly.
            %xdefine %%n1 n %+ %1
            %xdefine %%n2 n %+ %2
            %xdefine tmp m %+ %%n1
            CAT_XDEFINE m, %%n1, m %+ %%n2
            CAT_XDEFINE m, %%n2, tmp
            CAT_XDEFINE n, m %+ %%n1, %%n1
            CAT_XDEFINE n, m %+ %%n2, %%n2
        %endif
        %undef tmp
        %rotate 1
    %endrep
%endmacro
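; e.g. (illustrative): if a computation leaves its result in m2 but the
; caller expects it in m0, the rename is free:
;     SWAP 0, 2    ; m0 and m2 exchange names; no instruction is emitted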

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro
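; Sketch (illustrative):
;     cglobal helper
;         ...
;         SAVE_MM_PERMUTATION
;         ret
; A later "call helper" (via the call macro below) reloads that permutation,
; so helper can hand results back in mmregs.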

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE n, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1 %+ SUFFIX, %1
%endmacro
%macro call_internal 2
    %xdefine %%i %2
    %ifndef cglobaled_%2
        %ifdef cglobaled_%1
            %xdefine %%i %1
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro
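; e.g. +128 doesn't fit in a sign-extended imm8 but -128 does, so
; "add r0, 128" assembles as the shorter "sub r0, -128" (and vice versa).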

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    %assign i i+1
%endrep
%undef i

%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-avx emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %ifid %6
        %define %%sizeofreg sizeof%6
    %elifid %5
        %define %%sizeofreg sizeof%5
    %else
        %define %%sizeofreg mmsize
    %endif
    %if %%sizeofreg==32
        %if %4>=3
            v%1 %5, %6, %7
        %else
            v%1 %5, %6
        %endif
    %else
        %if %%sizeofreg==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if avx_enabled && %%sizeofreg==16
                    v%1 %5, %6, %7
                %else
                    CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %4>=3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
%macro RUN_AVX_INSTR1 8
    %assign %%swap 0
    %if avx_enabled
        %ifnid %6
            %assign %%swap 1
        %endif
    %elifnidn %5, %6
        %ifnid %7
            %assign %%swap 1
        %endif
    %endif
    %if %%swap && %3 == 0 && %8 == 1
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
    %else
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 4
    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
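; e.g. (illustrative): given the definitions below, "mulps m0, m1, m2"
; expands under plain INIT_XMM to
;     movaps m0, m1
;     mulps  m0, m2
; and under "INIT_XMM avx" to the single instruction "vmulps m0, m1, m2".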

AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 1
AVX_INSTR addss, 1, 0, 1
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR blendpd, 1, 0, 0
AVX_INSTR blendps, 1, 0, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 0, 0
AVX_INSTR cmpps, 1, 0, 0
AVX_INSTR cmpsd, 1, 0, 0
AVX_INSTR cmpss, 1, 0, 0
AVX_INSTR cvtdq2ps, 1, 0, 0
AVX_INSTR cvtpd2dq, 1, 0, 0
AVX_INSTR cvtps2dq, 1, 0, 0
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 1
AVX_INSTR maxss, 1, 0, 1
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 1
AVX_INSTR minss, 1, 0, 1
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movss, 1, 0, 0
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR pabsb, 0, 0, 0
AVX_INSTR pabsw, 0, 0, 0
AVX_INSTR pabsd, 0, 0, 0
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pcmpestri, 0, 0, 0
AVX_INSTR pcmpestrm, 0, 0, 0
AVX_INSTR pcmpistri, 0, 0, 0
AVX_INSTR pcmpistrm, 0, 0, 0
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmovmskb, 0, 0, 0
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR pshufd, 0, 1, 0
AVX_INSTR pshufhw, 0, 1, 0
AVX_INSTR pshuflw, 0, 1, 0
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR ptest, 0, 0, 0
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j
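; e.g. q0123 evaluates to 0x1B, so (illustrative):
;     pshufd m0, m0, q0123    ; reverse the order of the four dwords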

%macro FMA_INSTR 3
    %macro %1 5-8 %1, %2, %3
        %if cpuflag(xop) || cpuflag(fma4)
            v%6 %1, %2, %3, %4
        %else
            %ifidn %1, %4
                %7 %5, %2, %3
                %8 %1, %4, %5
            %else
                %7 %1, %2, %3
                %8 %1, %4
            %endif
        %endif
    %endmacro
%endmacro

FMA_INSTR fmaddps, mulps, addps
FMA_INSTR pmacsdd, pmulld, paddd
FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
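; e.g. (illustrative):
;     pmacsww m0, m1, m2, m0, m3    ; m0 = m1*m2 + m0
; assembles to the single XOP instruction vpmacsww when available, and
; otherwise to pmullw (clobbering the scratch register m3) plus paddw.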

; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf