;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Henrik Gramner <hengar-6@student.ltu.se>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org.

%define program_name ff

%define UNIX64 0
%define WIN64  0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; FIXME: All of the 64-bit asm functions that take a stride as an argument
; via register assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64-bit arithmetic on strides,
; and x264's strides are all positive), but it is not guaranteed by the ABI.

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align
; attribute, so use a different read-only section.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

; aout does not support align=
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

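; e.g. (hypothetical usage) "SECTION_RODATA 32" starts a read-only data
; section with 32-byte alignment on every object format handled above.
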
%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
CPU amdnop

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we'd need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for Athlons
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

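; As an illustrative sketch (hypothetical, not part of this file), a leaf
; function written against these macros might look like:
;
;     INIT_XMM sse2
;     cglobal add_ints, 3,3,1, dst, src, n  ; 3 args, 3 gprs, 1 xmm reg
;     .loop:
;         mova  m0, [srcq+nq*4-16]
;         paddd m0, [dstq+nq*4-16]
;         mova  [dstq+nq*4-16], m0
;         sub   nd, 4
;         jg .loop
;         REP_RET
;
; dstq/srcq/nq are the names DEFINE_ARGS creates from the cglobal line, and
; REP_RET pops whatever PROLOGUE pushed (falling back to a plain RET when
; anything was actually pushed).
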
%macro DECLARE_REG 5-6
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %if %0 == 5
        %define r%1m  %3
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rsp + stack_offset + %6]
        %define r%1mp qword r %+ %1m
    %else
        %define r%1m [esp + stack_offset + %6]
        %define r%1mp dword r %+ %1m
    %endif
    %define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 2
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14

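; e.g. (hypothetical) "DECLARE_REG_TMP 4,1" makes t0 an alias for r4 and t1
; an alias for r1, letting one function body target different registers per arch.
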
%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

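; DEFINE_ARGS can also be re-invoked mid-function to relabel registers once
; the original args are no longer needed, e.g. (hypothetical):
;     cglobal bar, 2,3,0, dst, src
;     ...
;     DEFINE_ARGS dst, stride, tmp   ; r1 is now named strideq instead of srcq
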
%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx, ecx,  cx,   cl
DECLARE_REG 1,  rdx, edx,  dx,   dl
DECLARE_REG 2,  R8,  R8D,  R8W,  R8B
DECLARE_REG 3,  R9,  R9D,  R9W,  R9B
DECLARE_REG 4,  R10, R10D, R10W, R10B, 40
DECLARE_REG 5,  R11, R11D, R11W, R11B, 48
DECLARE_REG 6,  rax, eax,  ax,   al,   56
DECLARE_REG 7,  rdi, edi,  di,   dil,  64
DECLARE_REG 8,  rsi, esi,  si,   sil,  72
DECLARE_REG 9,  rbx, ebx,  bx,   bl,   80
DECLARE_REG 10, rbp, ebp,  bp,   bpl,  88
DECLARE_REG 11, R12, R12D, R12W, R12B, 96
DECLARE_REG 12, R13, R13D, R13W, R13B, 104
DECLARE_REG 13, R14, R14D, R14W, R14B, 112
DECLARE_REG 14, R15, R15D, R15W, R15B, 120

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    %if mmsize == 8
        %assign xmm_regs_used 0
    %else
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        SUB rsp, (xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
        %endrep
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
    %assign xmm_regs_used 0
%endmacro

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
        RET
    %else
        rep ret
    %endif
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi, edi,  di,   dil
DECLARE_REG 1,  rsi, esi,  si,   sil
DECLARE_REG 2,  rdx, edx,  dx,   dl
DECLARE_REG 3,  rcx, ecx,  cx,   cl
DECLARE_REG 4,  R8,  R8D,  R8W,  R8B
DECLARE_REG 5,  R9,  R9D,  R9W,  R9B
DECLARE_REG 6,  rax, eax,  ax,   al,   8
DECLARE_REG 7,  R10, R10D, R10W, R10B, 16
DECLARE_REG 8,  R11, R11D, R11W, R11B, 24
DECLARE_REG 9,  rbx, ebx,  bx,   bl,   32
DECLARE_REG 10, rbp, ebp,  bp,   bpl,  40
DECLARE_REG 11, R12, R12D, R12W, R12B, 48
DECLARE_REG 12, R13, R13D, R13W, R13B, 56
DECLARE_REG 13, R14, R14D, R14W, R14B, 64
DECLARE_REG 14, R15, R15D, R15W, R15B, 72

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 9 || mmsize == 32
        RET
    %else
        rep ret
    %endif
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, eax, ax, al,   4
DECLARE_REG 1, ecx, ecx, cx, cl,   8
DECLARE_REG 2, edx, edx, dx, dl,   12
DECLARE_REG 3, ebx, ebx, bx, bl,   16
DECLARE_REG 4, esi, esi, si, null, 20
DECLARE_REG 5, edi, edi, di, null, 24
DECLARE_REG 6, ebp, ebp, bp, null, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [esp + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    %if regs_used > 7
        %assign regs_used 7
    %endif
    ASSERT regs_used >= num_args
    PUSH_IF_USED 3, 4, 5, 6
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    POP_IF_USED 6, 5, 4, 3
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 3 || mmsize == 32
        RET
    %else
        rep ret
    %endif
%endmacro

%endif ;======================================================================

%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1
    %endmacro
    %macro WIN64_RESTORE_XMM 1
    %endmacro
%endif

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
%macro cglobal 1-2+ ; name, [PROLOGUE args]
    %if %0 == 1
        cglobal_internal %1 %+ SUFFIX
    %else
        cglobal_internal %1 %+ SUFFIX, %2
    %endif
%endmacro
%macro cglobal_internal 1-2+
    %ifndef cglobaled_%1
        %xdefine %1 mangle(program_name %+ _ %+ %1)
        %xdefine %1.skip_prologue %1 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %1, 1
    %endif
    %xdefine current_function %1
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
    SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<22)

%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))

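; Since each flag implies the ones it depends on, cpuflag() can test for any
; capability subset, e.g. (hypothetical) in a function built with "INIT_XMM ssse3":
;     %if cpuflag(sse4)
;         pminud m0, m1     ; only taken in an sse4 or newer build
;     %endif
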
; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly; it's a subroutine for INIT_MMX &co.
%macro INIT_CPUFLAGS 0-2
    CPU amdnop
    %if %0 >= 1
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if mmsize == 16 && notcpuflag(sse2)
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, sse3
            %define movu lddqu
        %endif
        %if notcpuflag(mmx2)
            CPU basicnop
        %endif
    %else
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %endif
%endmacro

; merge mmx and sse*

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %rep 8 ; clear any m8..m15 definitions left over from INIT_XMM/INIT_YMM on x86_64
        CAT_UNDEF m, %%i
        CAT_UNDEF nmm, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

; FIXME: INIT_AVX can be replaced by INIT_XMM avx
%macro INIT_AVX 0
    INIT_XMM
    %assign avx_enabled 1
    %define PALIGNR PALIGNR_SSSE3
    %define RESET_MM_PERMUTATION INIT_AVX
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %undef movh
    %define movnta vmovntps
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM

; I often want to use macros that permute their arguments. E.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
    %rep %0/2
        %xdefine tmp%2 m%2
        %xdefine ntmp%2 nm%2
        %rotate 2
    %endrep
    %rep %0/2
        %xdefine m%1 tmp%2
        %xdefine nm%1 ntmp%2
        %undef tmp%2
        %undef ntmp%2
        %rotate 2
    %endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
    %rep %0-1
        %ifdef m%1
            %xdefine tmp m%1
            %xdefine m%1 m%2
            %xdefine m%2 tmp
            CAT_XDEFINE n, m%1, %1
            CAT_XDEFINE n, m%2, %2
        %else
            ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1", infer the original numbers here.
            ; Be careful using this mode in nested macros though, as in some cases there may be
            ; other copies of m# that have already been dereferenced and don't get updated correctly.
            %xdefine %%n1 n %+ %1
            %xdefine %%n2 n %+ %2
            %xdefine tmp m %+ %%n1
            CAT_XDEFINE m, %%n1, m %+ %%n2
            CAT_XDEFINE m, %%n2, tmp
            CAT_XDEFINE n, m %+ %%n1, %%n1
            CAT_XDEFINE n, m %+ %%n2, %%n2
        %endif
        %undef tmp
        %rotate 1
    %endrep
%endmacro

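; e.g. (hypothetical) a butterfly whose outputs land in the "wrong" registers
; can relabel them instead of moving data:
;     paddw m2, m0, m1   ; sum  -> m2
;     psubw m0, m1       ; diff -> m0
;     SWAP 0, 2          ; now m0 names the sum and m2 the diff, at zero cost
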
; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE n, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1, %1 %+ SUFFIX
%endmacro
%macro call_internal 2
    %xdefine %%i %1
    %ifndef cglobaled_%1
        %ifdef cglobaled_%2
            %xdefine %%i %2
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

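; e.g. (hypothetical) a helper that returns its results in mmregs:
;     cglobal helper
;         ...
;         SAVE_MM_PERMUTATION  ; record which m# maps to which hard register
;         ret
; A later "call helper" then reloads that permutation at the call site, so
; the caller's m# names line up with the helper's outputs.
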
; Substitutions that reduce instruction size but are functionally equivalent:
; x86 sign-extends 8-bit immediates, so -128 fits in an imm8 while +128
; requires a full 32-bit immediate.
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    %assign i i+1
%endrep
%undef i

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %ifid %5
        %define %%size sizeof%5
    %else
        %define %%size mmsize
    %endif
    %if %%size==32
        %if %0 >= 7
            v%1 %5, %6, %7
        %else
            v%1 %5, %6
        %endif
    %else
        %if %%size==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if avx_enabled && sizeof%5==16
                    v%1 %5, %6, %7
                %else
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
%macro RUN_AVX_INSTR1 8
    %assign %%swap 0
    %if avx_enabled
        %ifnid %6
            %assign %%swap 1
        %endif
    %elifnidn %5, %6
        %ifnid %7
            %assign %%swap 1
        %endif
    %endif
    %if %%swap && %3 == 0 && %8 == 1
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
    %else
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 4
    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro

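; e.g. after "AVX_INSTR paddw, 0, 0, 1" below, writing (illustration only)
;     paddw m0, m1, m2
; emits "vpaddw xmm0, xmm1, xmm2" when avx_enabled, and
; "movdqa xmm0, xmm1" + "paddw xmm0, xmm2" otherwise.
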
AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 1
AVX_INSTR addss, 1, 0, 1
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR blendpd, 1, 0, 0
AVX_INSTR blendps, 1, 0, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 0, 0
AVX_INSTR cmpps, 1, 0, 0
AVX_INSTR cmpsd, 1, 0, 0
AVX_INSTR cmpss, 1, 0, 0
AVX_INSTR cvtdq2ps, 1, 0, 0
AVX_INSTR cvtps2dq, 1, 0, 0
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 1
AVX_INSTR maxss, 1, 0, 1
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 1
AVX_INSTR minss, 1, 0, 1
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movss, 1, 0, 0
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pcmpestri, 0, 0, 0
AVX_INSTR pcmpestrm, 0, 0, 0
AVX_INSTR pcmpistri, 0, 0, 0
AVX_INSTR pcmpistrm, 0, 0, 0
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j

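; Each base-4 digit selects the source dword for one destination dword, with
; the rightmost digit being dword 0. e.g. q1032 expands to the immediate 78
; (01001110b), so a (hypothetical)
;     pshufd m0, m1, q1032
; swaps the two 64-bit halves of m1.
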
%macro FMA_INSTR 3
    %macro %1 5-8 %1, %2, %3
        %if cpuflag(xop) || cpuflag(fma4)
            v%6 %1, %2, %3, %4
        %else
            %ifidn %1, %4
                %7 %5, %2, %3
                %8 %1, %4, %5
            %else
                %7 %1, %2, %3
                %8 %1, %4
            %endif
        %endif
    %endmacro
%endmacro

FMA_INSTR  fmaddps,   mulps, addps
FMA_INSTR  pmacsdd,  pmulld, paddd
FMA_INSTR  pmacsww,  pmullw, paddw
FMA_INSTR  pmadcswd, pmaddwd, paddd
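
; e.g. (hypothetical) "fmaddps m0, m1, m2, m0, m4" computes m0 = m1*m2 + m0:
; it emits a single vfmaddps on XOP/FMA4 builds, and a mulps+addps pair
; (using m4 as scratch) everywhere else.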