;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org.

%define program_name ff

%define UNIX64 0
%define WIN64  0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

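; For instance, on targets whose C symbols carry a leading underscore (builds
; that define PREFIX, e.g. win32 or OS X), mangle(ff_foo) expands to _ff_foo;
; elsewhere it stays ff_foo.
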
; FIXME: All of the 64bit asm functions that take a stride as an argument
; via register, assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64bit arithmetic on strides,
; and x264's strides are all positive), but is not guaranteed by the ABI.

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

; aout does not support align=
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

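; Typical use (an illustrative sketch, not code from this header): start a
; constant block with SECTION_RODATA and declare aligned data in it, where
; pw_32 is a hypothetical constant name:
;
; SECTION_RODATA 16
; pw_32: times 8 dw 32
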
%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for Athlons,
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

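; As a concrete, purely illustrative example, a complete function written with
; these macros might look as follows. The function name, argument names and
; the loop are hypothetical; only cglobal, REP_RET and the m#/r# names come
; from this header. It assumes 16-byte-aligned buffers and a word count that
; is a multiple of 8:
;
; INIT_XMM sse2
; cglobal add_words, 3,4,1, dst, src, n, i
;     xor   id, id
; .loop:
;     mova  m0, [dstq+iq*2]
;     paddw m0, [srcq+iq*2]
;     mova  [dstq+iq*2], m0
;     add   id, 8
;     cmp   id, nd
;     jl .loop
;     REP_RET
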
%macro DECLARE_REG 6
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %define r%1m %6
    %ifid %6 ; i.e. it's a register
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1mp qword %6
    %else
        %define r%1mp dword %6
    %endif
    %define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 2
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl

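; e.g. DECLARE_REG_SIZE ax, al makes raxq/eaxq aliases for rax, raxd/eaxd for
; eax, raxw/eaxw for ax, and raxb/eaxb for al; on x86_32 it additionally maps
; rax itself to eax, so size-generic code can use the r-names on both arches.
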
; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

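; The PUSH/POP/SUB/ADD wrappers exist so that stack_offset always tracks the
; distance between rsp and the stack pointer at function entry; the rNm stack
; references above are defined relative to stack_offset, so they keep pointing
; at the right argument slots after e.g. "PUSH r6" or "SUB rsp, 16".
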
%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro

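; e.g. "DEFINE_ARGS dst, src, n" (names here are just an illustration) makes
; dstq/dstd/dstw/dstb/dstm/dstmp aliases for the r0 family, srcq etc. for r1,
; and nq etc. for r2. cglobal passes its trailing name list to this macro.
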
%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 1, rdx, edx, dx,  dl,  edx
DECLARE_REG 2, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 3, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di,  dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si,  sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]
%define r8m [rsp + stack_offset + 72]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp + stack_offset + 8 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    %if regs_used > 4
        push r4
        push r5
        %assign stack_offset stack_offset+16
    %endif
    %if mmsize == 8
        %assign xmm_regs_used 0
    %else
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        sub rsp, (xmm_regs_used-6)*16+16
        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
        %endrep
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    ; undo the stack_offset adjustment made by WIN64_SPILL_XMM
    %if xmm_regs_used > 6
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    %if regs_used > 4
        pop r5
        pop r4
    %endif
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 4 || xmm_regs_used > 6
        RET
    %else
        rep ret
    %endif
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0, rdi, edi, di,  dil, edi
DECLARE_REG 1, rsi, esi, si,  sil, esi
DECLARE_REG 2, rdx, edx, dx,  dl,  edx
DECLARE_REG 3, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 4, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 5, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp - 40 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    ASSERT %2 <= 7
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    ret
%endmacro

%macro REP_RET 0
    rep ret
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, eax, ax, al,   [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl,   [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl,   [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl,   [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]
%define rsp esp

%macro PUSH_IF_USED 1 ; reg_id
    %if %1 < regs_used
        push r%1
        %assign stack_offset stack_offset+4
    %endif
%endmacro

%macro POP_IF_USED 1 ; reg_id
    %if %1 < regs_used
        pop r%1
    %endif
%endmacro

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [esp + stack_offset + 4 + %1*4]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    PUSH_IF_USED 3
    PUSH_IF_USED 4
    PUSH_IF_USED 5
    PUSH_IF_USED 6
    LOAD_IF_USED 0, %1
    LOAD_IF_USED 1, %1
    LOAD_IF_USED 2, %1
    LOAD_IF_USED 3, %1
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    POP_IF_USED 6
    POP_IF_USED 5
    POP_IF_USED 4
    POP_IF_USED 3
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 3
        RET
    %else
        rep ret
    %endif
%endmacro

%endif ;======================================================================

%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1
    %endmacro
    %macro WIN64_RESTORE_XMM 1
    %endmacro
%endif



;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such that
; subsequent uses of the function name automatically refer to the mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
%macro cglobal 1-2+ ; name, [PROLOGUE args]
    %if %0 == 1
        cglobal_internal %1 %+ SUFFIX
    %else
        cglobal_internal %1 %+ SUFFIX, %2
    %endif
%endmacro
%macro cglobal_internal 1-2+
    %ifndef cglobaled_%1
        %xdefine %1 mangle(program_name %+ _ %+ %1)
        %xdefine %1.skip_prologue %1 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %1, 1
    %endif
    %xdefine current_function %1
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
    %1: %2
%endmacro

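; e.g. with program_name ff, "const pw_16, times 8 dw 16" (a hypothetical
; constant) emits a global symbol ff_pw_16 (plus a leading underscore where
; PREFIX is defined), and later references to pw_16 resolve to that name.
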
; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags

%assign cpuflags_mmx      (1<<0)
%assign cpuflags_mmx2     (1<<1) | cpuflags_mmx
%assign cpuflags_3dnow    (1<<2) | cpuflags_mmx
%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
%assign cpuflags_sse      (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2     (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3     (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3    (1<<8) | cpuflags_sse3
%assign cpuflags_sse4     (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42    (1<<10)| cpuflags_sse4
%assign cpuflags_avx      (1<<11)| cpuflags_sse42
%assign cpuflags_xop      (1<<12)| cpuflags_avx
%assign cpuflags_fma4     (1<<13)| cpuflags_avx

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<22)

%define cpuflag(x)    ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))

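; cpuflag() enables compile-time dispatch inside function bodies, e.g.
; (an illustrative sketch):
;
; %if cpuflag(ssse3)
;     pshufb m0, m1
; %else
;     ; some slower SSE2 shuffle sequence
; %endif
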
; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX & co.
%macro INIT_CPUFLAGS 0-2
    %if %0 >= 1
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(avx)
            %assign avx_enabled 1
        %endif
        %if mmsize == 16 && notcpuflag(sse2)
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, sse3
            %define movu lddqu
        %endif
    %else
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %endif
%endmacro

; merge mmx and sse*

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nmm, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

; FIXME: INIT_AVX can be replaced by INIT_XMM avx
%macro INIT_AVX 0
    INIT_XMM
    %assign avx_enabled 1
    %define PALIGNR PALIGNR_SSSE3
    %define RESET_MM_PERMUTATION INIT_AVX
%endmacro

%macro INIT_YMM 0-1+
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %undef movh
    %define movnta vmovntps
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM

; I often want to use macros that permute their arguments, e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
    %rep %0/2
        %xdefine tmp%2 m%2
        %xdefine ntmp%2 nm%2
        %rotate 2
    %endrep
    %rep %0/2
        %xdefine m%1 tmp%2
        %xdefine nm%1 ntmp%2
        %undef tmp%2
        %undef ntmp%2
        %rotate 2
    %endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
    %rep %0-1
        %ifdef m%1
            %xdefine tmp m%1
            %xdefine m%1 m%2
            %xdefine m%2 tmp
            CAT_XDEFINE n, m%1, %1
            CAT_XDEFINE n, m%2, %2
        %else
            ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
            ; Be careful using this mode in nested macros though, as in some cases there may be
            ; other copies of m# that have already been dereferenced and don't get updated correctly.
            %xdefine %%n1 n %+ %1
            %xdefine %%n2 n %+ %2
            %xdefine tmp m %+ %%n1
            CAT_XDEFINE m, %%n1, m %+ %%n2
            CAT_XDEFINE m, %%n2, tmp
            CAT_XDEFINE n, m %+ %%n1, %%n1
            CAT_XDEFINE n, m %+ %%n2, %%n2
        %endif
        %undef tmp
        %rotate 1
    %endrep
%endmacro

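; A small hypothetical example of a sum/difference butterfly using SWAP:
; the difference lands in m2, then renaming makes it answer to m1, so callers
; can rely on "m0 = a+b, m1 = a-b" without a register-to-register move:
;
;     mova  m2, m0
;     paddw m0, m1   ; m0 = a+b
;     psubw m2, m1   ; m2 = a-b
;     SWAP 1, 2      ; now the name m1 refers to the difference
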
; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values can
; be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE n, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro

; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
    call_internal %1, %1 %+ SUFFIX
%endmacro
%macro call_internal 2
    %xdefine %%i %1
    %ifndef cglobaled_%1
        %ifdef cglobaled_%2
            %xdefine %%i %2
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent.
; +128 has no signed-imm8 encoding, but -128 does, so adding 128 encodes in
; fewer bytes when expressed as subtracting -128 (and vice versa).
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    %assign i i+1
%endrep
%undef i

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %ifid %5
        %define %%size sizeof%5
    %else
        %define %%size mmsize
    %endif
    %if %%size==32
        v%1 %5, %6, %7
    %else
        %if %%size==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if avx_enabled && sizeof%5==16
                    v%1 %5, %6, %7
                %else
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
%macro RUN_AVX_INSTR1 8
    %assign %%swap 0
    %if avx_enabled
        %ifnid %6
            %assign %%swap 1
        %endif
    %elifnidn %5, %6
        %ifnid %7
            %assign %%swap 1
        %endif
    %endif
    %if %%swap && %3 == 0 && %8 == 1
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
    %else
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 4
    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro

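; For instance, once "AVX_INSTR paddw, 0, 0, 1" has run, writing
; "paddw m0, m1, m2" assembles as "vpaddw xmm0, xmm1, xmm2" when avx_enabled,
; and otherwise as "movdqa xmm0, xmm1" followed by "paddw xmm0, xmm2"
; (a hand-written sketch of the expansion, not assembler output).
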
AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 1
AVX_INSTR addss, 1, 0, 1
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR blendpd, 1, 0, 0
AVX_INSTR blendps, 1, 0, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 0, 0
AVX_INSTR cmpps, 1, 0, 0
AVX_INSTR cmpsd, 1, 0, 0
AVX_INSTR cmpss, 1, 0, 0
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 1
AVX_INSTR maxss, 1, 0, 1
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 1
AVX_INSTR minss, 1, 0, 1
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movss, 1, 0, 0
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pcmpestri, 0, 0, 0
AVX_INSTR pcmpestrm, 0, 0, 0
AVX_INSTR pcmpistri, 0, 0, 0
AVX_INSTR pcmpistrm, 0, 0, 0
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j

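; e.g. q1032 expands to 0x4E (base-4 digits 1,0,3,2, listed from the highest
; destination element down), so "pshufd m0, m0, q1032" swaps the two 64-bit
; halves of m0.
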
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %else
            %6 %1, %2, %3
            %7 %1, %4
        %endif
    %endmacro
%endmacro

FMA_INSTR  pmacsdd,  pmulld, paddd
FMA_INSTR  pmacsww,  pmullw, paddw
FMA_INSTR  pmadcswd, pmaddwd, paddd
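
; e.g. "pmacsww m3, m1, m2, m0" computes m3 = m1*m2 + m0 per word: a single
; vpmacsww on XOP cpus, otherwise "pmullw m3, m1, m2" (via the AVX_INSTR
; wrapper above) followed by "paddw m3, m0". Note that in the fallback path
; the destination must be distinct from the addend.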