http: Remove the custom function for disabling chunked posts
[libav.git] / libavutil / x86 / x86inc.asm
CommitLineData
bafad220
LM
1;*****************************************************************************
2;* x86inc.asm
3;*****************************************************************************
33cbfa6f 4;* Copyright (C) 2005-2011 x264 project
bafad220 5;*
2966cc18
JGG
6;* Authors: Loren Merritt <lorenm@u.washington.edu>
7;* Anton Mitrofanov <BugMaster@narod.ru>
33cbfa6f 8;* Jason Garrett-Glaser <darkshikari@gmail.com>
bafad220 9;*
2966cc18
JGG
10;* Permission to use, copy, modify, and/or distribute this software for any
11;* purpose with or without fee is hereby granted, provided that the above
12;* copyright notice and this permission notice appear in all copies.
bafad220 13;*
2966cc18
JGG
14;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
15;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
16;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
17;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
19;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
20;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
bafad220
LM
21;*****************************************************************************
22
2966cc18
JGG
23; This is a header file for the x264ASM assembly language, which uses
24; NASM/YASM syntax combined with a large number of macros to provide easy
25; abstraction between different calling conventions (x86_32, win64, linux64).
26; It also has various other useful features to simplify writing the kind of
27; DSP functions that are most often used in x264.
28
29; Unlike the rest of x264, this file is available under an ISC license, as it
30; has significant usefulness outside of x264 and we want it to be available
31; to the largest audience possible. Of course, if you modify it for your own
32; purposes to add a new feature, we strongly encourage contributing a patch
33; as this feature might be useful for others as well. Send patches or ideas
34; to x264-devel@videolan.org .
35
36%define program_name ff
37
3f87f39c
JA
38%ifdef ARCH_X86_64
39 %ifidn __OUTPUT_FORMAT__,win32
40 %define WIN64
41 %else
42 %define UNIX64
43 %endif
44%endif
45
2966cc18
JGG
46%ifdef PREFIX
47 %define mangle(x) _ %+ x
48%else
49 %define mangle(x) x
50%endif
51
bafad220
LM
52; FIXME: All of the 64bit asm functions that take a stride as an argument
53; via register, assume that the high dword of that register is filled with 0.
54; This is true in practice (since we never do any 64bit arithmetic on strides,
55; and x264's strides are all positive), but is not guaranteed by the ABI.
56
57; Name of the .rodata section.
58; Kludge: Something on OS X fails to align .rodata even given an align attribute,
59; so use a different read-only section.
; SECTION_RODATA [align]  (default align=16)
; Switch to a read-only data section. Mach-O targets fail to honor the
; align attribute on .rodata, and a.out has no .rodata at all, so those
; formats fall back to .text; everything else gets an aligned .rodata.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:                ; dummy label -- presumably a GOT stand-in for macho PIC; TODO confirm
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
72
d69f9a42
DY
; aout does not support align=
; SECTION_TEXT [align]  (default align=16)
; Switch to the code section, aligned where the output format allows it.
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro
81
3f87f39c
JA
82%ifdef WIN64
83 %define PIC
84%elifndef ARCH_X86_64
2966cc18
JGG
85; x86_32 doesn't require PIC.
86; Some distros prefer shared objects to be PIC, but nothing breaks if
87; the code contains a few textrels, so we'll skip that complexity.
3f87f39c
JA
88 %undef PIC
89%endif
90%ifdef PIC
2966cc18 91 default rel
bafad220
LM
92%endif
93
94; Macros to eliminate most code duplication between x86_32 and x86_64:
95; Currently this works only for leaf functions which load all their arguments
96; into registers at the start, and make no other use of the stack. Luckily that
97; covers most of x264's asm.
98
99; PROLOGUE:
100; %1 = number of arguments. loads them from stack if needed.
3f87f39c
JA
101; %2 = number of registers used. pushes callee-saved regs if needed.
102; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
bafad220
LM
103; %4 = list of names to define to registers
104; PROLOGUE can also be invoked by adding the same options to cglobal
105
106; e.g.
29e4edbb 107; cglobal foo, 2,3,0, dst, src, tmp
3f87f39c 108; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
bafad220
LM
109
110; TODO Some functions can use some args directly from the stack. If they're the
111; last args then you can just not declare them, but if they're in the middle
112; we need more flexible macro.
113
114; RET:
115; Pops anything that was pushed by PROLOGUE
116
117; REP_RET:
118; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
119; which are slow when a normal ret follows a branch.
120
3f87f39c
JA
121; registers:
122; rN and rNq are the native-size register holding function argument N
123; rNd, rNw, rNb are dword, word, and byte size
124; rNm is the original location of arg N (a register or on the stack), dword
125; rNmp is native size
126
bafad220
LM
; DECLARE_REG n, qreg, dreg, wreg, breg, mloc
; Defines the alias family for function argument n:
;   r<n>q / r<n>d / r<n>w / r<n>b = qword/dword/word/byte views (%2..%5)
;   r<n>m  = the argument's original location: a register or stack slot (dword)
;   r<n>mp = the same location at native pointer size
;   r<n>   = the native-size register itself
%macro DECLARE_REG 6
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %define r%1m %6
    %ifid %6 ; i.e. it's a register
        %define r%1mp %2
    %elifdef ARCH_X86_64 ; memory
        %define r%1mp qword %6
    %else
        %define r%1mp dword %6
    %endif
    %define r%1 %2
%endmacro
142
; DECLARE_REG_SIZE name, lowbyte
; Aliases the legacy GPR names so that the q/d/w/b suffixed forms of both
; the "r" and "e" spellings resolve to the correctly sized register
; (e.g. raxd -> eax, eaxb -> al). On x86_32 the bare "r" name maps to
; the 32-bit register, so 64-bit-looking code assembles on both arches.
%macro DECLARE_REG_SIZE 2
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1b %2
    %define e%1b %2
%ifndef ARCH_X86_64
    %define r%1 e%1
%endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl
164
3f87f39c
JA
; t# defines for when per-arch register allocation is more complex than just function arguments

; DECLARE_REG_TMP r#, r#, ...
; Maps the temporaries t0, t1, ... onto the listed argument registers.
%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

; DECLARE_REG_TMP_SIZE t#, t#, ...
; Creates the sized views t#q/t#d/t#w/t#b for each listed temporary.
; The %defines expand lazily, so this is safe to run up front before any
; DECLARE_REG_TMP call has assigned the underlying registers.
%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
3f87f39c 187
bafad220
LM
188%ifdef ARCH_X86_64
189 %define gprsize 8
190%else
191 %define gprsize 4
192%endif
193
; PUSH/POP/SUB/ADD wrap the raw instructions so that stack_offset -- the
; distance from the entry rsp to the current rsp -- stays accurate; the
; r#m stack-argument locations are all expressed relative to it.
%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp ; only stack-pointer adjustments affect stack_offset
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

; mov only if source and destination differ -- avoids emitting no-op movs
; when an argument already lives in the desired register on this arch.
%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

; movsxd (sign-extend dword to qword) only if the operands differ.
%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

; Compile-time assertion: aborts assembly if the condition is zero.
%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro
235
; DEFINE_ARGS name0, name1, ...
; Gives symbolic names to function arguments: each name gets q/d/w/b/m
; suffixed aliases onto the corresponding r# register family (dstq, dstd,
; dstm, ...). Names from a previous invocation are undefined first; the
; count is remembered in n_arg_names so the next call can clean up.
%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %assign n_arg_names %%i
%endmacro
263
3f87f39c 264%ifdef WIN64 ; Windows x64 ;=================================================
bafad220
LM
265
; Win64 argument registers: rcx, rdx, r8, r9. Regs 4-6 are mapped to
; rdi/rsi/rax (rdi/rsi are callee-saved on Win64; PROLOGUE saves them
; when used) and their values arrive on the stack above the return
; address and the caller-allocated 32-byte shadow space.
DECLARE_REG 0, rcx, ecx, cx, cl, ecx
DECLARE_REG 1, rdx, edx, dx, dl, edx
DECLARE_REG 2, r8, r8d, r8w, r8b, r8d
DECLARE_REG 3, r9, r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di, dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si, sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]
%define r8m [rsp + stack_offset + 72]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        ; return address (8 bytes) plus the four 8-byte shadow slots put
        ; stack argument N at rsp + 8 + N*8 relative to the entry rsp
        mov r%1, [rsp + stack_offset + 8 + %1*8]
    %endif
%endmacro
281
; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]
; Win64 prologue: pushes the callee-saved GPRs r4/r5 (rdi/rsi) when more
; than 4 registers are used, spills callee-saved xmm6+ via
; WIN64_SPILL_XMM, loads stack-passed args into their registers, and
; defines the symbolic argument names.
%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    %if regs_used > 4
        push r4
        push r5
        %assign stack_offset stack_offset+16
    %endif
    WIN64_SPILL_XMM %3
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro
297
; WIN64_SPILL_XMM #xmm_regs
; Save the callee-saved xmm registers (xmm6 and up in the Win64 ABI) to a
; freshly allocated stack area of (n-6)*16+16 bytes. The extra 16 with
; the +8 store offsets presumably keep rsp 16-aligned and the movdqa
; targets aligned given the ABI's entry alignment -- TODO confirm.
%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        sub rsp, (xmm_regs_used-6)*16+16
        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
        %endrep
    %endif
%endmacro
311
; WIN64_RESTORE_XMM_INTERNAL stack_reg
; Reload the spilled xmm registers from %1 (normally rsp) and release the
; spill area, WITHOUT touching stack_offset -- used directly by RET,
; which returns immediately and no longer needs the bookkeeping.
%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro
322
532e7697
LM
; WIN64_RESTORE_XMM stack_reg
; Public epilogue counterpart of WIN64_SPILL_XMM: reloads the callee-saved
; xmm registers, releases the spill area, and rewinds the bookkeeping so
; code may continue after the restore.
; In: %1 = register holding the spill base (normally rsp)
%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    ; Undo exactly what WIN64_SPILL_XMM added. Two fixes vs. the old code:
    ; 1) the whole spill size must be parenthesized: the unparenthesized
    ;    "stack_offset-(xmm_regs_used-6)*16+16" evaluated as
    ;    (stack_offset - N*16) + 16, leaving stack_offset off by 32;
    ; 2) only adjust when registers were actually spilled
    ;    (xmm_regs_used > 6), mirroring the guard in WIN64_SPILL_XMM --
    ;    otherwise nothing was added and subtracting corrupts the offset.
    %if xmm_regs_used > 6
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro
328
; Win64 epilogue: restore spilled xmm registers, pop the saved GPRs in
; reverse order, and return. Uses the _INTERNAL restore since the
; stack_offset bookkeeping is irrelevant after ret.
%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    %if regs_used > 4
        pop r5
        pop r4
    %endif
    ret
%endmacro

; Emit "rep ret" when the epilogue pops nothing: a bare ret directly
; after a branch is slow on some AMD cores (see the file-top comment),
; and the rep prefix is otherwise harmless.
%macro REP_RET 0
    %if regs_used > 4 || xmm_regs_used > 6
        RET
    %else
        rep ret
    %endif
%endmacro
345
346%elifdef ARCH_X86_64 ; *nix x64 ;=============================================
bafad220
LM
347
; SysV AMD64 argument registers: rdi, rsi, rdx, rcx, r8, r9. Reg 6 (rax)
; is the 7th argument and arrives in the first stack slot.
DECLARE_REG 0, rdi, edi, di, dil, edi
DECLARE_REG 1, rsi, esi, si, sil, esi
DECLARE_REG 2, rdx, edx, dx, dl, edx
DECLARE_REG 3, rcx, ecx, cx, cl, ecx
DECLARE_REG 4, r8, r8d, r8w, r8b, r8d
DECLARE_REG 5, r9, r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax, al, [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        ; rsp - 40 + 6*8 == rsp + 8: the first stack argument. Note the
        ; expression ignores stack_offset; that is only correct because
        ; the UNIX64 PROLOGUE pushes nothing before invoking this.
        mov r%1, [rsp - 40 + %1*8]
    %endif
%endmacro

; UNIX64 prologue: the first six args already arrive in registers, so
; only arg 7 (reg 6) may need loading. None of r0-r6 map to callee-saved
; GPRs here, so nothing is pushed.
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    ASSERT %2 <= 7
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

; Nothing was pushed by the prologue, so a bare return suffices.
%macro RET 0
    ret
%endmacro

; 2-byte return; avoids the ret-after-branch penalty described at the
; top of the file.
%macro REP_RET 0
    rep ret
%endmacro
378
379%else ; X86_32 ;==============================================================
380
; x86_32: every argument lives on the stack. Regs 0-6 map to
; eax, ecx, edx, ebx, esi, edi, ebp; regs 3-6 are callee-saved and are
; pushed by PROLOGUE when the function declares that many registers.
DECLARE_REG 0, eax, eax, ax, al, [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl, [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl, [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl, [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]
%define rsp esp

; Save callee-saved register %1 if the function uses that many registers,
; keeping stack_offset in sync so the r#m stack slots stay correct.
%macro PUSH_IF_USED 1 ; reg_id
    %if %1 < regs_used
        push r%1
        %assign stack_offset stack_offset+4
    %endif
%endmacro

%macro POP_IF_USED 1 ; reg_id
    %if %1 < regs_used
        pop r%1
    %endif
%endmacro

; Load argument %1 from the stack: arg N sits at esp + 4 + N*4 plus
; whatever has been pushed since entry (tracked by stack_offset).
%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [esp + stack_offset + 4 + %1*4]
    %endif
%endmacro

; x86_32 prologue: push the callee-saved regs that will be used, then
; load every declared argument into its register.
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    PUSH_IF_USED 3
    PUSH_IF_USED 4
    PUSH_IF_USED 5
    PUSH_IF_USED 6
    LOAD_IF_USED 0, %1
    LOAD_IF_USED 1, %1
    LOAD_IF_USED 2, %1
    LOAD_IF_USED 3, %1
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

; Pop the callee-saved registers in reverse order of the prologue pushes.
%macro RET 0
    POP_IF_USED 6
    POP_IF_USED 5
    POP_IF_USED 4
    POP_IF_USED 3
    ret
%endmacro

; 2-byte ret when nothing was pushed (regs 0-2 are caller-saved).
%macro REP_RET 0
    %if regs_used > 3
        RET
    %else
        rep ret
    %endif
%endmacro
444
445%endif ;======================================================================
446
532e7697
LM
447%ifndef WIN64
448%macro WIN64_SPILL_XMM 1
449%endmacro
450%macro WIN64_RESTORE_XMM 1
451%endmacro
452%endif
453
bafad220
LM
454
455
456;=============================================================================
457; arch-independent part
458;=============================================================================
459
460%assign function_align 16
461
462; Symbol prefix for C linkage
; cglobal name [, #args, #regs, #xmm_regs, arg_names...]
; Declare a C-callable global function: applies the program_name prefix
; and platform name mangling, uses hidden visibility on ELF (the symbol
; is exported but not preemptible), aligns the entry point, resets
; per-function state, and optionally runs PROLOGUE with the remaining
; parameters.
%macro cglobal 1-2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    %xdefine %1.skip_prologue %1 %+ .skip_prologue
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0 ; entry-point rsp is the reference for r#m slots
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro
479
; Declare an external symbol, applying the program_name prefix plus any
; platform mangling, so the rest of the file can use the bare name.
%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    extern %1
%endmacro

;like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    extern %1
%endmacro

; const name, data...
; Define a global, prefix-mangled data symbol with the given contents.
%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
    %1: %2
%endmacro
496
bafad220
LM
497; This is needed for ELF, otherwise the GNU linker assumes the stack is
498; executable by default.
499%ifidn __OUTPUT_FORMAT__,elf
500SECTION .note.GNU-stack noalloc noexec nowrite progbits
501%endif
502
bafad220
LM
503; merge mmx and sse*
504
; Token-pasting helpers: define or undefine the concatenated symbol %1%2.
%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

; INIT_MMX: target the 8 mmx registers. Sets mmsize/num_mmregs, maps the
; generic move names (mova/movu/movh/movnta) to their mmx forms, defines
; m0-m7 plus the reverse nmm# lookups, and then undefines m8-m15 (the
; second %rep continues %%i from 8) in case a previous INIT_XMM/INIT_YMM
; left them defined.
%macro INIT_MMX 0
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nmm, %%i
        %assign %%i %%i+1
    %endrep
%endmacro
534
; INIT_XMM: target the xmm registers (8 on x86_32, 16 on x86_64) with SSE
; encodings. Maps the generic move names to their sse2 forms and defines
; m# / nxmm# for every available register.
%macro INIT_XMM 0
    %assign avx_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM
    %define mmsize 16
    %define num_mmregs 8
    %ifdef ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
%endmacro
554
33cbfa6f
VS
; INIT_AVX: xmm register layout with AVX (VEX) encodings enabled; PALIGNR
; is pointed at the SSSE3 implementation.
%macro INIT_AVX 0
    INIT_XMM
    %assign avx_enabled 1
    %define PALIGNR PALIGNR_SSSE3
    %define RESET_MM_PERMUTATION INIT_AVX
%endmacro

; INIT_YMM: target the 256-bit ymm registers (8 on x86_32, 16 on x86_64).
; Note that no movh/movnta mappings are provided here.
%macro INIT_YMM 0
    %assign avx_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM
    %define mmsize 32
    %define num_mmregs 8
    %ifdef ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
%endmacro
579
bafad220
LM
580INIT_MMX
581
582; I often want to use macros that permute their arguments. e.g. there's no
583; efficient way to implement butterfly or transpose or dct without swapping some
584; arguments.
585;
586; I would like to not have to manually keep track of the permutations:
587; If I insert a permutation in the middle of a function, it should automatically
588; change everything that follows. For more complex macros I may also have multiple
589; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
590;
591; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
592; permutes its arguments. It's equivalent to exchanging the contents of the
593; registers, except that this way you exchange the register names instead, so it
594; doesn't cost any cycles.
595
; PERMUTE dst0, src0, dst1, src1, ...
; Renames register aliases so m<dst> refers to what m<src> named. All
; sources are captured into temporaries first, so cyclic permutations
; work; no instructions are emitted.
%macro PERMUTE 2-* ; takes a list of pairs to swap
%rep %0/2
    %xdefine tmp%2 m%2
    %xdefine ntmp%2 nm%2
    %rotate 2
%endrep
%rep %0/2
    %xdefine m%1 tmp%2
    %xdefine nm%1 ntmp%2
    %undef tmp%2
    %undef ntmp%2
    %rotate 2
%endrep
%endmacro

; SWAP rotates a chain of register aliases: SWAP 0,1,2 makes m0 take
; m1's register, m1 take m2's, and m2 take the original m0.
%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
%rep %0-1
%ifdef m%1
    %xdefine tmp m%1
    %xdefine m%1 m%2
    %xdefine m%2 tmp
    CAT_XDEFINE n, m%1, %1
    CAT_XDEFINE n, m%2, %2
%else
    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
    ; Be careful using this mode in nested macros though, as in some cases there may be
    ; other copies of m# that have already been dereferenced and don't get updated correctly.
    %xdefine %%n1 n %+ %1
    %xdefine %%n2 n %+ %2
    %xdefine tmp m %+ %%n1
    CAT_XDEFINE m, %%n1, m %+ %%n2
    CAT_XDEFINE m, %%n2, tmp
    CAT_XDEFINE n, m %+ %%n1, %%n1
    CAT_XDEFINE n, m %+ %%n2, %%n2
%endif
    %undef tmp
    %rotate 1
%endrep
%endmacro
635
2966cc18
JGG
636; If SAVE_MM_PERMUTATION is placed at the end of a function and given the
637; function name, then any later calls to that function will automatically
638; load the permutation, so values can be returned in mmregs.
; Record the current m# -> register mapping under %1_m#, so a later call
; to function %1 can re-adopt it (see the "call" wrapper below). This is
; how values can be returned in mmregs across the macro renaming layer.
%macro SAVE_MM_PERMUTATION 1 ; name to save as
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %1_m, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

; Restore the m# mapping previously saved under %1, rebuilding the
; reverse n<reg> lookups as well.
%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, %1_m %+ %%i
        CAT_XDEFINE n, m %+ %%i, %%i
        %assign %%i %%i+1
    %endrep
%endmacro

; call wrapper: after calling a function that saved a permutation (its
; %1_m0 is defined), automatically adopt that permutation.
%macro call 1
    call %1
    %ifdef %1_m0
        LOAD_MM_PERMUTATION %1
    %endif
%endmacro
662
2966cc18 663; Substitutions that reduce instruction size but are functionally equivalent
3f87f39c
JA
; Size-reducing substitutions (functionally equivalent): +128 does not
; fit in a signed imm8 but -128 does, so "add x, 128" is encoded as the
; shorter "sub x, -128", and vice versa. Non-numeric operands pass
; straight through.
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro
33cbfa6f
VS
687
688;=============================================================================
689; AVX abstraction layer
690;=============================================================================
691
692%assign i 0
693%rep 16
694 %if i < 8
695 CAT_XDEFINE sizeofmm, i, 8
696 %endif
697 CAT_XDEFINE sizeofxmm, i, 16
698 CAT_XDEFINE sizeofymm, i, 32
699%assign i i+1
700%endrep
701%undef i
702
703;%1 == instruction
704;%2 == 1 if float, 0 if int
705;%3 == 0 if 3-operand (xmm, xmm, xmm), 1 if 4-operand (xmm, xmm, xmm, imm)
706;%4 == number of operands given
707;%5+: operands
; RUN_AVX_INSTR instr, is_float, is_4op, nargs, operands...
; Emits the v-prefixed (VEX, non-destructive 3-operand) form for ymm
; operands or when avx_enabled on xmm; otherwise falls back to the legacy
; 2-operand SSE form, inserting a register move first when the
; destination differs from the first source.
%macro RUN_AVX_INSTR 6-7+
    %if sizeof%5==32
        v%1 %5, %6, %7
    %else
        ; pick the mov used to copy src1 into dst for the SSE fallback:
        ; mmx regs use movq; float data movaps; integer data movdqa
        %if sizeof%5==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if avx_enabled && sizeof%5==16
                    v%1 %5, %6, %7
                %else
                    %%regmov %5, %6 ; dst != src1: copy, then run the destructive 2-op form
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

; AVX_INSTR name, is_float, is_4op
; Redefines instruction "name" as a macro accepting 2-5 operands and
; forwarding them, together with the operand count, to RUN_AVX_INSTR.
; The three trailing "fnord" defaults are sentinels used to detect how
; many operands the caller actually passed.
%macro AVX_INSTR 3
    %macro %1 2-8 fnord, fnord, fnord, %1, %2, %3
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR %6, %7, %8, 3, %1, %2, %3
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
755
756AVX_INSTR addpd, 1, 0
757AVX_INSTR addps, 1, 0
758AVX_INSTR addsd, 1, 0
759AVX_INSTR addss, 1, 0
760AVX_INSTR addsubpd, 1, 0
761AVX_INSTR addsubps, 1, 0
762AVX_INSTR andpd, 1, 0
763AVX_INSTR andps, 1, 0
764AVX_INSTR andnpd, 1, 0
765AVX_INSTR andnps, 1, 0
766AVX_INSTR blendpd, 1, 0
767AVX_INSTR blendps, 1, 0
768AVX_INSTR blendvpd, 1, 0
769AVX_INSTR blendvps, 1, 0
770AVX_INSTR cmppd, 1, 0
771AVX_INSTR cmpps, 1, 0
772AVX_INSTR cmpsd, 1, 0
773AVX_INSTR cmpss, 1, 0
774AVX_INSTR divpd, 1, 0
775AVX_INSTR divps, 1, 0
776AVX_INSTR divsd, 1, 0
777AVX_INSTR divss, 1, 0
778AVX_INSTR dppd, 1, 0
779AVX_INSTR dpps, 1, 0
780AVX_INSTR haddpd, 1, 0
781AVX_INSTR haddps, 1, 0
782AVX_INSTR hsubpd, 1, 0
783AVX_INSTR hsubps, 1, 0
784AVX_INSTR maxpd, 1, 0
785AVX_INSTR maxps, 1, 0
786AVX_INSTR maxsd, 1, 0
787AVX_INSTR maxss, 1, 0
788AVX_INSTR minpd, 1, 0
789AVX_INSTR minps, 1, 0
790AVX_INSTR minsd, 1, 0
791AVX_INSTR minss, 1, 0
792AVX_INSTR mpsadbw, 0, 1
793AVX_INSTR mulpd, 1, 0
794AVX_INSTR mulps, 1, 0
795AVX_INSTR mulsd, 1, 0
796AVX_INSTR mulss, 1, 0
797AVX_INSTR orpd, 1, 0
798AVX_INSTR orps, 1, 0
799AVX_INSTR packsswb, 0, 0
800AVX_INSTR packssdw, 0, 0
801AVX_INSTR packuswb, 0, 0
802AVX_INSTR packusdw, 0, 0
803AVX_INSTR paddb, 0, 0
804AVX_INSTR paddw, 0, 0
805AVX_INSTR paddd, 0, 0
806AVX_INSTR paddq, 0, 0
807AVX_INSTR paddsb, 0, 0
808AVX_INSTR paddsw, 0, 0
809AVX_INSTR paddusb, 0, 0
810AVX_INSTR paddusw, 0, 0
811AVX_INSTR palignr, 0, 1
812AVX_INSTR pand, 0, 0
813AVX_INSTR pandn, 0, 0
814AVX_INSTR pavgb, 0, 0
815AVX_INSTR pavgw, 0, 0
816AVX_INSTR pblendvb, 0, 0
817AVX_INSTR pblendw, 0, 1
818AVX_INSTR pcmpestri, 0, 0
819AVX_INSTR pcmpestrm, 0, 0
820AVX_INSTR pcmpistri, 0, 0
821AVX_INSTR pcmpistrm, 0, 0
822AVX_INSTR pcmpeqb, 0, 0
823AVX_INSTR pcmpeqw, 0, 0
824AVX_INSTR pcmpeqd, 0, 0
825AVX_INSTR pcmpeqq, 0, 0
826AVX_INSTR pcmpgtb, 0, 0
827AVX_INSTR pcmpgtw, 0, 0
828AVX_INSTR pcmpgtd, 0, 0
829AVX_INSTR pcmpgtq, 0, 0
830AVX_INSTR phaddw, 0, 0
831AVX_INSTR phaddd, 0, 0
832AVX_INSTR phaddsw, 0, 0
833AVX_INSTR phsubw, 0, 0
834AVX_INSTR phsubd, 0, 0
835AVX_INSTR phsubsw, 0, 0
836AVX_INSTR pmaddwd, 0, 0
837AVX_INSTR pmaddubsw, 0, 0
838AVX_INSTR pmaxsb, 0, 0
839AVX_INSTR pmaxsw, 0, 0
840AVX_INSTR pmaxsd, 0, 0
841AVX_INSTR pmaxub, 0, 0
842AVX_INSTR pmaxuw, 0, 0
843AVX_INSTR pmaxud, 0, 0
844AVX_INSTR pminsb, 0, 0
845AVX_INSTR pminsw, 0, 0
846AVX_INSTR pminsd, 0, 0
847AVX_INSTR pminub, 0, 0
848AVX_INSTR pminuw, 0, 0
849AVX_INSTR pminud, 0, 0
850AVX_INSTR pmulhuw, 0, 0
851AVX_INSTR pmulhrsw, 0, 0
852AVX_INSTR pmulhw, 0, 0
853AVX_INSTR pmullw, 0, 0
854AVX_INSTR pmulld, 0, 0
855AVX_INSTR pmuludq, 0, 0
856AVX_INSTR pmuldq, 0, 0
857AVX_INSTR por, 0, 0
858AVX_INSTR psadbw, 0, 0
859AVX_INSTR pshufb, 0, 0
860AVX_INSTR psignb, 0, 0
861AVX_INSTR psignw, 0, 0
862AVX_INSTR psignd, 0, 0
863AVX_INSTR psllw, 0, 0
864AVX_INSTR pslld, 0, 0
865AVX_INSTR psllq, 0, 0
866AVX_INSTR pslldq, 0, 0
867AVX_INSTR psraw, 0, 0
868AVX_INSTR psrad, 0, 0
869AVX_INSTR psrlw, 0, 0
870AVX_INSTR psrld, 0, 0
871AVX_INSTR psrlq, 0, 0
872AVX_INSTR psrldq, 0, 0
873AVX_INSTR psubb, 0, 0
874AVX_INSTR psubw, 0, 0
875AVX_INSTR psubd, 0, 0
876AVX_INSTR psubq, 0, 0
877AVX_INSTR psubsb, 0, 0
878AVX_INSTR psubsw, 0, 0
879AVX_INSTR psubusb, 0, 0
880AVX_INSTR psubusw, 0, 0
881AVX_INSTR punpckhbw, 0, 0
882AVX_INSTR punpckhwd, 0, 0
883AVX_INSTR punpckhdq, 0, 0
884AVX_INSTR punpckhqdq, 0, 0
885AVX_INSTR punpcklbw, 0, 0
886AVX_INSTR punpcklwd, 0, 0
887AVX_INSTR punpckldq, 0, 0
888AVX_INSTR punpcklqdq, 0, 0
889AVX_INSTR pxor, 0, 0
890AVX_INSTR shufps, 0, 1
891AVX_INSTR subpd, 1, 0
892AVX_INSTR subps, 1, 0
893AVX_INSTR subsd, 1, 0
894AVX_INSTR subss, 1, 0
895AVX_INSTR unpckhpd, 1, 0
896AVX_INSTR unpckhps, 1, 0
897AVX_INSTR unpcklpd, 1, 0
898AVX_INSTR unpcklps, 1, 0
899AVX_INSTR xorpd, 1, 0
900AVX_INSTR xorps, 1, 0
901
902; 3DNow instructions, for sharing code between AVX, SSE and 3DN
903AVX_INSTR pfadd, 1, 0
904AVX_INSTR pfsub, 1, 0
905AVX_INSTR pfmul, 1, 0