sync yasm macros to x264
[libav.git] / libavcodec / x86 / x86inc.asm
1 ;*****************************************************************************
2 ;* x86inc.asm
3 ;*****************************************************************************
4 ;* Copyright (C) 2005-2008 Loren Merritt <lorenm@u.washington.edu>
5 ;*
6 ;* This file is part of FFmpeg.
7 ;*
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
12 ;*
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
17 ;*
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;*****************************************************************************
22
; Pick the 64-bit calling convention from the assembler output format.
; Defines WIN64 for the Windows x64 ABI and UNIX64 otherwise; neither is
; defined when ARCH_X86_64 is not set.
; "win32" is kept for builds that request a win32 object together with a
; 64-bit machine option; "win64" and "x64" are the native 64-bit format names
; and must select the same ABI.
%ifdef ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64
    %elifidn __OUTPUT_FORMAT__,x64
        %define WIN64
    %else
        %define UNIX64
    %endif
%endif
30
31 ; FIXME: All of the 64bit asm functions that take a stride as an argument
32 ; via register, assume that the high dword of that register is filled with 0.
33 ; This is true in practice (since we never do any 64bit arithmetic on strides,
34 ; and x264's strides are all positive), but is not guaranteed by the ABI.
35
36 ; Name of the .rodata section.
37 ; Kludge: Something on OS X fails to align .rodata even given an align attribute,
38 ; so use a different read-only section.
; SECTION_RODATA [alignment]
; Switches to the section used for read-only data. The optional argument is
; the section alignment and defaults to 16.
; Mach-O targets are placed in .text instead of .rodata; 32-bit Mach-O
; additionally gets a "fakegot" label at the start of the data.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %else
        SECTION .rodata align=%1
    %endif
%endmacro
49
50 ; PIC support macros.
51 ; x86_64 can't fit 64bit address literals in most instruction types,
52 ; so shared objects (under the assumption that they might be anywhere
53 ; in memory) must use an address mode that does fit.
54 ; So all accesses to global variables must use this macro, e.g.
55 ; mov eax, [foo GLOBAL]
56 ; instead of
57 ; mov eax, [foo]
58 ;
59 ; x86_32 doesn't require PIC.
60 ; Some distros prefer shared objects to be PIC, but nothing breaks if
61 ; the code contains a few textrels, so we'll skip that complexity.
62
; PIC is forced on for WIN64 and forced off for every non-x86_64 build; on
; other x86_64 targets whatever the build passed in is kept.
%ifdef WIN64
    %define PIC
%elifndef ARCH_X86_64
    %undef PIC
%endif

; GLOBAL is appended inside a memory operand that names a global symbol.
; It expands to nothing unless PIC is in effect, in which case the access
; is made rip-relative.
%ifndef PIC
    %define GLOBAL
%else
    %define GLOBAL wrt rip
%endif
73
74 ; Macros to eliminate most code duplication between x86_32 and x86_64:
75 ; Currently this works only for leaf functions which load all their arguments
76 ; into registers at the start, and make no other use of the stack. Luckily that
77 ; covers most of x264's asm.
78
79 ; PROLOGUE:
80 ; %1 = number of arguments. loads them from stack if needed.
81 ; %2 = number of registers used. pushes callee-saved regs if needed.
82 ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
83 ; %4 = list of names to define to registers
84 ; PROLOGUE can also be invoked by adding the same options to cglobal
85
86 ; e.g.
87 ; cglobal foo, 2,3,0, dst, src, tmp
88 ; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
89
90 ; TODO Some functions can use some args directly from the stack. If they're the
91 ; last args then you can just not declare them, but if they're in the middle
92 ; we need a more flexible macro.
93
94 ; RET:
95 ; Pops anything that was pushed by PROLOGUE
96
97 ; REP_RET:
98 ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
99 ; which are slow when a normal ret follows a branch.
100
101 ; registers:
102 ; rN and rNq are the native-size register holding function argument N
103 ; rNd, rNw, rNb are dword, word, and byte size
104 ; rNm is the original location of arg N (a register or on the stack), dword
105 ; rNmp is native size
106
; DECLARE_REG n, qword_reg, dword_reg, word_reg, byte_reg, home
; Defines the aliases for argument/register slot n:
;   rN and rNq -> qword_reg      rNd -> dword_reg
;   rNw        -> word_reg       rNb -> byte_reg
;   rNm        -> home (either a register or a stack memory operand)
;   rNmp       -> native-size view of home
%macro DECLARE_REG 6
    %define r%1  %2
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %define r%1m %6
    %ifid %6
        ; home is a bare identifier, i.e. a register: use its full-size name
        %define r%1mp %2
    %elifndef ARCH_X86_64
        ; memory operand on a 32-bit target
        %define r%1mp dword %6
    %else
        ; memory operand on a 64-bit target
        %define r%1mp qword %6
    %endif
%endmacro
122
; DECLARE_REG_SIZE word_name, byte_name
; For a legacy register given by its 16-bit name (e.g. ax) and 8-bit name
; (e.g. al), defines size-suffixed aliases on both the r- and e-prefixed
; names: <r|e>NAMEq, <r|e>NAMEd, <r|e>NAMEw, <r|e>NAMEb.
; When not building for x86_64 the r-prefixed name itself is mapped onto the
; e-prefixed register.
%macro DECLARE_REG_SIZE 2
    %ifndef ARCH_X86_64
        %define r%1 e%1
    %endif
    ; aliases hanging off the r-prefixed name
    %define r%1q r%1
    %define r%1d e%1
    %define r%1w %1
    %define r%1b %2
    ; the same aliases hanging off the e-prefixed name
    %define e%1q r%1
    %define e%1d e%1
    %define e%1w %1
    %define e%1b %2
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl
144
145 ; t# defines for when per-arch register allocation is more complex than just function arguments
146
; DECLARE_REG_TMP n0, n1, ...
; Binds the temporaries t0, t1, ... in order to r<n0>, r<n1>, ...
%macro DECLARE_REG_TMP 1-*
    %assign %%slot 0
    %rep %0
        CAT_XDEFINE t, %%slot, r%1
        %rotate 1
        %assign %%slot %%slot+1
    %endrep
%endmacro
155
; DECLARE_REG_TMP_SIZE n0, n1, ...
; For every listed index N defines tNq, tNd, tNw and tNb as tN with the
; matching size suffix pasted on, so they follow whatever tN is bound to.
%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1b t%1 %+ b
        %define t%1w t%1 %+ w
        %define t%1d t%1 %+ d
        %define t%1q t%1 %+ q
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7
167
; gprsize: size in bytes of one general-purpose register / stack slot.
%ifndef ARCH_X86_64
    %define gprsize 4
%else
    %define gprsize 8
%endif
173
; PUSH operand
; Emits a push and then records it in stack_offset (one gprsize slot).
; The instruction is emitted before the bookkeeping so that an operand which
; itself refers to stack_offset is expanded with the pre-push value.
%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro
178
; POP operand
; Emits a pop and then removes one gprsize slot from stack_offset.
%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro
183
; SUB dst, amount
; Emits a sub; when dst is rsp the amount is also added to stack_offset so
; that stack-relative argument references stay correct.
%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro
190
; ADD dst, amount
; Emits an add; when dst is rsp the amount is also subtracted from
; stack_offset.
%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro
197
; movifnidn dst, src
; Emits "mov dst, src" unless both operands expand to identical text, in
; which case nothing is emitted.
%macro movifnidn 2
    %ifidn %1, %2
        ; same operand on both sides: nothing to emit
    %else
        mov %1, %2
    %endif
%endmacro
203
; movsxdifnidn dst, src
; Emits "movsxd dst, src" unless both operands expand to identical text, in
; which case nothing is emitted.
%macro movsxdifnidn 2
    %ifidn %1, %2
        ; same operand on both sides: nothing to emit
    %else
        movsxd %1, %2
    %endif
%endmacro
209
; ASSERT expression
; Assembly-time check: raises a preprocessor error when the expression
; evaluates to zero.
%macro ASSERT 1
    %if (%1)
        ; condition holds
    %else
        %error assert failed
    %endif
%endmacro
215
; DEFINE_ARGS name0, name1, ...
; Gives symbolic names to the argument registers: for the i-th name,
; <name>q/d/w/b/m become aliases of r<i>q/d/w/b/m.
; Names installed by a previous DEFINE_ARGS are undefined first; they are
; remembered in arg_name<i>, and their count in n_arg_names.
%macro DEFINE_ARGS 0-*
    ; Pass 1: drop the aliases left behind by the previous invocation.
    %ifdef n_arg_names
        %assign %%old 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%old, q
            CAT_UNDEF arg_name %+ %%old, d
            CAT_UNDEF arg_name %+ %%old, w
            CAT_UNDEF arg_name %+ %%old, b
            CAT_UNDEF arg_name %+ %%old, m
            CAT_UNDEF arg_name, %%old
            %assign %%old %%old+1
        %endrep
    %endif

    ; Pass 2: install the new aliases and remember each name.
    %assign %%new 0
    %rep %0
        %xdefine %1q r %+ %%new %+ q
        %xdefine %1d r %+ %%new %+ d
        %xdefine %1w r %+ %%new %+ w
        %xdefine %1b r %+ %%new %+ b
        %xdefine %1m r %+ %%new %+ m
        CAT_XDEFINE arg_name, %%new, %1
        %assign %%new %%new+1
        %rotate 1
    %endrep
    %assign n_arg_names %%new
%endmacro
243
%ifdef WIN64 ; Windows x64 ;=================================================

; Slots 0-3 live in registers; slots 4-6 are loaded from the stack into
; rdi/rsi/rax. Slots 7 and 8 only have a stack home (r7m, r8m).
DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 1, rdx, edx, dx,  dl,  edx
DECLARE_REG 2, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 3, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di,  dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si,  sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]
%define r8m [rsp + stack_offset + 72]
255
; LOAD_IF_USED reg_id, number_of_args  (WIN64)
; If slot reg_id is one of the declared arguments, loads it from its stack
; home (8 bytes per slot, above the return address) into r<reg_id>.
%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %2 > %1
        mov r%1, [rsp + stack_offset + 8 + %1*8]
    %endif
%endmacro
261
; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]  (WIN64)
; Records regs_used / xmm_regs_used, saves r4 and r5 when more than four
; registers are used, spills xmm6 and up when more than six xmm registers
; are used, loads stack arguments 4-6 and installs the argument names.
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7

    ; the xmm count is optional and defaults to zero
    %assign xmm_regs_used 0
    %if %0 > 2
        %assign xmm_regs_used %3
    %endif
    ASSERT xmm_regs_used <= 16

    %if regs_used > 4
        push r4
        push r5
        %assign stack_offset stack_offset+16
    %endif

    %if xmm_regs_used > 6
        sub rsp, (xmm_regs_used-6)*16+16
        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
        ; store from the highest used register down to xmm6
        %assign %%reg xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%reg %%reg-1
            movdqa [rsp + (%%reg-6)*16+8], xmm %+ %%reg
        %endrep
    %endif

    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro
291
; RESTORE_XMM_INTERNAL base  (WIN64)
; When more than six xmm registers are in use, reloads xmm6 and up from the
; save area addressed through base (same layout as written by PROLOGUE) and
; then adds the size of that area to base. Does not touch stack_offset or
; xmm_regs_used.
%macro RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        ; reload from the highest used register down to xmm6
        %assign %%reg xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%reg %%reg-1
            movdqa xmm %+ %%reg, [%1 + (%%reg-6)*16+8]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro
302
; RESTORE_XMM base  (WIN64)
; Restores the callee-saved xmm registers spilled by PROLOGUE, releases their
; save area (via RESTORE_XMM_INTERNAL) and updates the bookkeeping so that a
; later RET does not restore them a second time.
;
; stack_offset is reduced by exactly the amount PROLOGUE added for the save
; area, (xmm_regs_used-6)*16+16, and only when a save area exists
; (xmm_regs_used > 6). The whole amount is parenthesised: without the
; parentheses the trailing 16 would be added back instead of subtracted.
%macro RESTORE_XMM 1
    RESTORE_XMM_INTERNAL %1
    %if xmm_regs_used > 6
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro
308
; RET  (WIN64)
; Function epilogue: undoes PROLOGUE in reverse order (xmm save area first,
; then r5 and r4 if they were pushed) and returns.
%macro RET 0
    RESTORE_XMM_INTERNAL rsp
    %if regs_used > 4
        pop r5
        pop r4
    %endif
    ret
%endmacro
317
; REP_RET  (WIN64)
; Emits the two-byte "rep ret" when the epilogue has nothing to restore,
; otherwise falls back to the full RET epilogue.
%macro REP_RET 0
    %if regs_used <= 4 && xmm_regs_used <= 6
        rep ret
    %else
        RET
    %endif
%endmacro
325
%elifdef ARCH_X86_64 ; *nix x64 ;=============================================

; Slots 0-5 live in registers; slot 6 is loaded from the stack into rax.
; Slots 7 and 8 only have a stack home (r7m, r8m).
DECLARE_REG 0, rdi, edi, di,  dil, edi
DECLARE_REG 1, rsi, esi, si,  sil, esi
DECLARE_REG 2, rdx, edx, dx,  dl,  edx
DECLARE_REG 3, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 4, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 5, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]
337
; LOAD_IF_USED reg_id, number_of_args  (UNIX64)
; If slot reg_id is one of the declared arguments, loads it from the stack
; into r<reg_id>. The address is rsp - 40 + reg_id*8, i.e. rsp + 8 for
; slot 6, the only slot PROLOGUE requests here.
%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %2 > %1
        mov r%1, [rsp - 40 + %1*8]
    %endif
%endmacro
343
; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]  (UNIX64)
; Nothing is pushed on this ABI: the macro validates the counts, loads the
; seventh argument from the stack when it is declared and installs the
; argument names. The #xmm_regs value is accepted but not used here.
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    ASSERT %2 <= 7
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro
350
; RET  (UNIX64)
; PROLOGUE pushes nothing on this ABI, so the epilogue is a plain ret.
%macro RET 0
    ret
%endmacro
354
; REP_RET  (UNIX64)
; There is never anything to pop here, so this is always the two-byte
; "rep ret" form.
%macro REP_RET 0
    rep ret
%endmacro
358
%else ; X86_32 ;==============================================================

; Every argument has a stack home, 4 bytes per slot above the return
; address. Slots 4-6 are declared with "null" as their byte-register name.
DECLARE_REG 0, eax, eax, ax, al,   [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl,   [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl,   [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl,   [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]
; lets code shared with the 64-bit paths keep writing rsp
%define rsp esp
371
; PUSH_IF_USED reg_id  (X86_32)
; Pushes r<reg_id> and accounts for it in stack_offset, but only when that
; register is among the regs_used registers of the current function.
%macro PUSH_IF_USED 1 ; reg_id
    %if regs_used > %1
        push r%1
        %assign stack_offset stack_offset+4
    %endif
%endmacro
378
; POP_IF_USED reg_id  (X86_32)
; Pops r<reg_id> when that register is among the regs_used registers of the
; current function. stack_offset is left unchanged.
%macro POP_IF_USED 1 ; reg_id
    %if regs_used > %1
        pop r%1
    %endif
%endmacro
384
; LOAD_IF_USED reg_id, number_of_args  (X86_32)
; If slot reg_id is one of the declared arguments, loads it from its stack
; home (4 bytes per slot, above the return address) into r<reg_id>.
%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %2 > %1
        mov r%1, [esp + stack_offset + 4 + %1*4]
    %endif
%endmacro
390
; PROLOGUE #args, #regs [, #xmm_regs [, arg_names...]]  (X86_32)
; Records regs_used, pushes whichever of r3-r6 are in use, loads every
; declared argument from the stack into its register and installs the
; argument names. The third parameter (#xmm_regs) is accepted so that the
; argument names stay in the fourth position, but it is not used here.
%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs (unused), arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7

    ; save the registers handled by PUSH_IF_USED / POP_IF_USED
    PUSH_IF_USED 3
    PUSH_IF_USED 4
    PUSH_IF_USED 5
    PUSH_IF_USED 6

    ; fetch the declared arguments
    LOAD_IF_USED 0, %1
    LOAD_IF_USED 1, %1
    LOAD_IF_USED 2, %1
    LOAD_IF_USED 3, %1
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1

    DEFINE_ARGS %4
%endmacro
408
; RET  (X86_32)
; Function epilogue: pops the registers pushed by PROLOGUE in reverse order
; and returns.
%macro RET 0
    POP_IF_USED 6
    POP_IF_USED 5
    POP_IF_USED 4
    POP_IF_USED 3
    ret
%endmacro
416
; REP_RET  (X86_32)
; Emits the two-byte "rep ret" when no register was pushed by PROLOGUE
; (three or fewer registers in use), otherwise the full RET epilogue.
%macro REP_RET 0
    %if regs_used <= 3
        rep ret
    %else
        RET
    %endif
%endmacro
424
%endif ; end of the per-ABI section ;========================================



;=============================================================================
; arch-independent part
;=============================================================================

; Alignment used by cglobal for the entry point of every function.
%assign function_align 16
434
435 ; Symbol prefix for C linkage
; cglobal name [, PROLOGUE arguments...]
; Declares and opens a global function:
;   - remaps name to ff_name, with a further leading underscore when PREFIX
;     is defined, and remaps name.skip_prologue accordingly;
;   - exports the symbol (as a hidden function for the elf output format);
;   - aligns to function_align and emits the label;
;   - resets the mm register permutation and stack_offset;
;   - runs PROLOGUE with the remaining arguments when any are given.
%macro cglobal 1-2+
    %xdefine %1 ff_%1
    %ifdef PREFIX
        %xdefine %1 _ %+ %1
    %endif
    %xdefine %1.skip_prologue %1 %+ .skip_prologue
    %ifnidn __OUTPUT_FORMAT__,elf
        global %1
    %else
        global %1:function hidden
    %endif
    align function_align
    %1:
    ; not really needed, but makes disassembly somewhat nicer
    RESET_MM_PERMUTATION
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro
455
; cextern name
; Declares an external symbol; when PREFIX is defined the name is first
; remapped to carry a leading underscore.
%macro cextern 1
    %ifdef PREFIX
        %xdefine %1 _%1
    %endif
    extern %1
%endmacro
462
463 ; This is needed for ELF, otherwise the GNU linker assumes the stack is
464 ; executable by default.
; Emit the non-executable-stack note for every ELF flavour. The format may be
; reported as "elf", "elf32" or "elf64" depending on how the assembler was
; invoked, and all of them need the note.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%elifidn __OUTPUT_FORMAT__,elf32
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%elifidn __OUTPUT_FORMAT__,elf64
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif
468
; Fixed stride constants made available to code that includes this file.
%assign FDEC_STRIDE 32
%assign FENC_STRIDE 16
471
472 ; merge mmx and sse*
473
; CAT_XDEFINE prefix, suffix, value
; Pastes prefix and suffix into a single identifier and %xdefine-s it to
; value. Going through a macro lets both halves be expanded before pasting.
%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro
477
; CAT_UNDEF prefix, suffix
; Pastes prefix and suffix into a single identifier and undefines it.
%macro CAT_UNDEF 2
    %undef %1%2
%endmacro
481
; INIT_MMX
; Selects the 8-byte MMX register set for the size-agnostic names:
;   mmsize = 8, num_mmregs = 8, mova/movu -> movq, movh -> movd,
;   movnt -> movntq, m0..m7 -> mm0..mm7, and nmm0..nmm7 -> 0..7.
; Entries 8..15 of both tables are undefined, and RESET_MM_PERMUTATION is
; pointed back at this macro.
%macro INIT_MMX 0
    %define RESET_MM_PERMUTATION INIT_MMX
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnt movntq

    ; identity mapping for registers 0..7
    %assign %%reg 0
    %rep 8
        CAT_XDEFINE m, %%reg, mm %+ %%reg
        CAT_XDEFINE nmm, %%reg, %%reg
        %assign %%reg %%reg+1
    %endrep

    ; the counter is 8 here, so this pass removes entries 8..15
    %rep 8
        CAT_UNDEF m, %%reg
        CAT_UNDEF nmm, %%reg
        %assign %%reg %%reg+1
    %endrep
%endmacro
502
; INIT_XMM
; Selects the 16-byte XMM register set for the size-agnostic names:
;   mmsize = 16, num_mmregs = 16 on x86_64 and 8 otherwise,
;   mova -> movdqa, movu -> movdqu, movh -> movq, movnt -> movntdq,
;   m<i> -> xmm<i> and nxmm<i> -> i for every available register.
; RESET_MM_PERMUTATION is pointed back at this macro.
%macro INIT_XMM 0
    %define RESET_MM_PERMUTATION INIT_XMM
    %define mmsize 16
    %ifdef ARCH_X86_64
        %define num_mmregs 16
    %else
        %define num_mmregs 8
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnt movntdq

    ; identity mapping for every available register
    %assign %%reg 0
    %rep num_mmregs
        CAT_XDEFINE m, %%reg, xmm %+ %%reg
        CAT_XDEFINE nxmm, %%reg, %%reg
        %assign %%reg %%reg+1
    %endrep
%endmacro

; MMX is the register set in effect until a file selects another one.
INIT_MMX
523
524 ; I often want to use macros that permute their arguments. e.g. there's no
525 ; efficient way to implement butterfly or transpose or dct without swapping some
526 ; arguments.
527 ;
528 ; I would like to not have to manually keep track of the permutations:
529 ; If I insert a permutation in the middle of a function, it should automatically
530 ; change everything that follows. For more complex macros I may also have multiple
531 ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
532 ;
533 ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
534 ; permutes its arguments. It's equivalent to exchanging the contents of the
535 ; registers, except that this way you exchange the register names instead, so it
536 ; doesn't cost any cycles.
537
; PERMUTE a0, b0, a1, b1, ...
; Takes a list of pairs and renames registers so that m<a> takes the current
; meaning of m<b>, for all pairs simultaneously.
; Works in two passes so that later pairs still see the old mapping: first
; every source is copied to a temporary, then every destination is assigned
; from its temporary.
%macro PERMUTE 2-* ; takes a list of pairs to swap
    ; pass 1: snapshot the source of each pair
    %rep %0/2
        %xdefine tmp%2 m%2
        %xdefine ntmp%2 nm%2
        %rotate 2
    %endrep
    ; pass 2: assign the snapshots and discard them
    %rep %0/2
        %xdefine m%1 tmp%2
        %xdefine nm%1 ntmp%2
        %undef tmp%2
        %undef ntmp%2
        %rotate 2
    %endrep
%endmacro
552
; SWAP a, b [, c ...]
; Swaps register names along a chain: a with b, then b with c, and so on
; (sometimes more concise than a list of pairs). Accepts either register
; numbers ("SWAP 0,1") or the register aliases themselves ("SWAP m0,m1").
%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
    %rep %0-1
        %ifdef m%1
            ; numeric form: exchange the two aliases, then refresh the
            ; reverse lookup entries for the registers they now name
            %xdefine tmp m%1
            %xdefine m%1 m%2
            %xdefine m%2 tmp
            CAT_XDEFINE n, m%1, %1
            CAT_XDEFINE n, m%2, %2
        %else
            ; Called as "SWAP m0,m1" rather than "SWAP 0,1": infer the original numbers here.
            ; Be careful using this mode in nested macros though, as in some cases there may be
            ; other copies of m# that have already been dereferenced and do not get updated correctly.
            %xdefine %%n1 n %+ %1
            %xdefine %%n2 n %+ %2
            %xdefine tmp m %+ %%n1
            CAT_XDEFINE m, %%n1, m %+ %%n2
            CAT_XDEFINE m, %%n2, tmp
            CAT_XDEFINE n, m %+ %%n1, %%n1
            CAT_XDEFINE n, m %+ %%n2, %%n2
        %endif
        %undef tmp
        %rotate 1
    %endrep
%endmacro
577
; SAVE_MM_PERMUTATION name
; Snapshots the current register mapping: <name>_m<i> is defined to the
; present value of m<i> for every register in the active set.
%macro SAVE_MM_PERMUTATION 1
    %assign %%reg 0
    %rep num_mmregs
        CAT_XDEFINE %1_m, %%reg, m %+ %%reg
        %assign %%reg %%reg+1
    %endrep
%endmacro
585
; LOAD_MM_PERMUTATION name
; Reinstates a mapping stored by SAVE_MM_PERMUTATION under the same name:
; m<i> is set from <name>_m<i> and the reverse lookup entry is rebuilt for
; every register in the active set.
%macro LOAD_MM_PERMUTATION 1
    %assign %%reg 0
    %rep num_mmregs
        CAT_XDEFINE m, %%reg, %1_m %+ %%reg
        CAT_XDEFINE n, m %+ %%reg, %%reg
        %assign %%reg %%reg+1
    %endrep
%endmacro
594
; call target
; Wraps the call instruction: after emitting the call, if a register
; permutation was saved under the target name (target_m0 is defined), that
; permutation is loaded so the code after the call uses the same mapping.
%macro call 1
    call %1
    %ifdef %1_m0
        LOAD_MM_PERMUTATION %1
    %endif
%endmacro
601
602 ;Substitutions that reduce instruction size but are functionally equivalent
; add dst, src
; Wraps the add instruction. Adding the literal 128 is emitted as
; "sub dst, -128", which is functionally equivalent but encodes smaller.
; Every other operand is passed through unchanged.
%macro add 2
    %ifnum %2
        %if %2 != 128
            add %1, %2
        %else
            sub %1, -128
        %endif
    %else
        add %1, %2
    %endif
%endmacro
614
; sub dst, src
; Wraps the sub instruction. Subtracting the literal 128 is emitted as
; "add dst, -128", which is functionally equivalent but encodes smaller.
; Every other operand is passed through unchanged.
%macro sub 2
    %ifnum %2
        %if %2 != 128
            sub %1, %2
        %else
            add %1, -128
        %endif
    %else
        sub %1, %2
    %endif
%endmacro