;*****************************************************************************
;* x86inc.asm
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .

%define program_name ff

%ifdef ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64
    %else
        %define UNIX64
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; FIXME: All of the 64-bit asm functions that take a stride as an argument
; via register assume that the high dword of that register is filled with 0.
; This is true in practice (since we never do any 64-bit arithmetic on strides,
; and x264's strides are all positive), but is not guaranteed by the ABI.

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

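; e.g. (illustrative) a typical use is a 16-byte-aligned constant block;
; the constant name below is made up for this sketch:
;     SECTION_RODATA
;     pw_64: times 8 dw 64
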
%ifdef WIN64
    %define PIC
%elifndef ARCH_X86_64
; x86_32 doesn't require PIC.
; Some distros prefer shared objects to be PIC, but nothing breaks if
; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily that
; covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're the
; last args then you can just not declare them, but if they're in the middle
; we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

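; e.g. (illustrative) a complete leaf function written with these macros; the
; function and argument names are made up for this sketch:
;     cglobal zero_bytes, 2,2,0, dst, len
;     .loop:
;         mov   byte [dstq], 0
;         add   dstq, 1
;         sub   lend, 1
;         jg    .loop
;         REP_RET
; On x86_64 dst/len arrive in registers; on x86_32 PROLOGUE loads them from the
; stack. REP_RET is used because the ret directly follows a branch.
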
%macro DECLARE_REG 6
    %define r%1q %2
    %define r%1d %3
    %define r%1w %4
    %define r%1b %5
    %define r%1m %6
    %ifid %6 ; i.e. it's a register
        %define r%1mp %2
    %elifdef ARCH_X86_64 ; memory
        %define r%1mp qword %6
    %else
        %define r%1mp dword %6
    %endif
    %define r%1 %2
%endmacro

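; e.g. DECLARE_REG 1, rdx, edx, dx, dl, edx (as used in the win64 section below)
; makes r1/r1q = rdx, r1d = edx, r1w = dx, r1b = dl, and, since arg 1 lives in a
; register there, r1m = edx and r1mp = rdx.
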
%macro DECLARE_REG_SIZE 2
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1b %2
    %define e%1b %2
    %ifndef ARCH_X86_64
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al
DECLARE_REG_SIZE bx, bl
DECLARE_REG_SIZE cx, cl
DECLARE_REG_SIZE dx, dl
DECLARE_REG_SIZE si, sil
DECLARE_REG_SIZE di, dil
DECLARE_REG_SIZE bp, bpl

; t# defines for when per-arch register allocation is more complex than just function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9
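
; e.g. (illustrative) pick different scratch registers per arch and then write
; the body once in terms of t0/t1:
;     %ifdef ARCH_X86_64
;         DECLARE_REG_TMP 4,5
;     %else
;         DECLARE_REG_TMP 1,2
;     %endif
;     mov t0d, 16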

%ifdef ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro

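; e.g. (illustrative) because PUSH/POP/SUB/ADD keep stack_offset up to date,
; stack-based argument references remain valid after the stack pointer moves:
;     SUB  rsp, 32
;     mov  r0d, r4m   ; r4m still resolves to the right stack slot (or register)
;     ADD  rsp, 32
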
%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

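; e.g. movifnidn r0, r0mp expands to nothing on ABIs where arg 0 already
; arrives in a register, and to a stack load on ABIs where it doesn't.
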
%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %assign n_arg_names %%i
%endmacro

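; e.g. (illustrative) argument names can be redefined partway through a function
; once the originals are no longer needed:
;     DEFINE_ARGS dst, stride, tmp
; after which dstq/dstd/dstm alias r0*, strideq/strided/stridem alias r1*, and
; tmpq/tmpd/tmpm alias r2*.
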
%ifdef WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 1, rdx, edx, dx,  dl,  edx
DECLARE_REG 2, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 3, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 4, rdi, edi, di,  dil, [rsp + stack_offset + 40]
DECLARE_REG 5, rsi, esi, si,  sil, [rsp + stack_offset + 48]
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 56]
%define r7m [rsp + stack_offset + 64]
%define r8m [rsp + stack_offset + 72]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp + stack_offset + 8 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    %if regs_used > 4
        push r4
        push r5
        %assign stack_offset stack_offset+16
    %endif
    WIN64_SPILL_XMM %3
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        sub rsp, (xmm_regs_used-6)*16+16
        %assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
        %endrep
    %endif
%endmacro

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+8]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    %if xmm_regs_used > 6
        ; undo the stack_offset adjustment made in WIN64_SPILL_XMM
        %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %endif
    %assign xmm_regs_used 0
%endmacro

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    %if regs_used > 4
        pop r5
        pop r4
    %endif
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 4 || xmm_regs_used > 6
        RET
    %else
        rep ret
    %endif
%endmacro

%elifdef ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0, rdi, edi, di,  dil, edi
DECLARE_REG 1, rsi, esi, si,  sil, esi
DECLARE_REG 2, rdx, edx, dx,  dl,  edx
DECLARE_REG 3, rcx, ecx, cx,  cl,  ecx
DECLARE_REG 4, r8,  r8d, r8w, r8b, r8d
DECLARE_REG 5, r9,  r9d, r9w, r9b, r9d
DECLARE_REG 6, rax, eax, ax,  al,  [rsp + stack_offset + 8]
%define r7m [rsp + stack_offset + 16]
%define r8m [rsp + stack_offset + 24]

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [rsp - 40 + %1*8]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    ASSERT %2 <= 7
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    ret
%endmacro

%macro REP_RET 0
    rep ret
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, eax, ax, al,   [esp + stack_offset + 4]
DECLARE_REG 1, ecx, ecx, cx, cl,   [esp + stack_offset + 8]
DECLARE_REG 2, edx, edx, dx, dl,   [esp + stack_offset + 12]
DECLARE_REG 3, ebx, ebx, bx, bl,   [esp + stack_offset + 16]
DECLARE_REG 4, esi, esi, si, null, [esp + stack_offset + 20]
DECLARE_REG 5, edi, edi, di, null, [esp + stack_offset + 24]
DECLARE_REG 6, ebp, ebp, bp, null, [esp + stack_offset + 28]
%define r7m [esp + stack_offset + 32]
%define r8m [esp + stack_offset + 36]
%define rsp esp

%macro PUSH_IF_USED 1 ; reg_id
    %if %1 < regs_used
        push r%1
        %assign stack_offset stack_offset+4
    %endif
%endmacro

%macro POP_IF_USED 1 ; reg_id
    %if %1 < regs_used
        pop r%1
    %endif
%endmacro

%macro LOAD_IF_USED 2 ; reg_id, number_of_args
    %if %1 < %2
        mov r%1, [esp + stack_offset + 4 + %1*4]
    %endif
%endmacro

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    ASSERT %2 >= %1
    %assign regs_used %2
    ASSERT regs_used <= 7
    PUSH_IF_USED 3
    PUSH_IF_USED 4
    PUSH_IF_USED 5
    PUSH_IF_USED 6
    LOAD_IF_USED 0, %1
    LOAD_IF_USED 1, %1
    LOAD_IF_USED 2, %1
    LOAD_IF_USED 3, %1
    LOAD_IF_USED 4, %1
    LOAD_IF_USED 5, %1
    LOAD_IF_USED 6, %1
    DEFINE_ARGS %4
%endmacro

%macro RET 0
    POP_IF_USED 6
    POP_IF_USED 5
    POP_IF_USED 4
    POP_IF_USED 3
    ret
%endmacro

%macro REP_RET 0
    %if regs_used > 3
        RET
    %else
        rep ret
    %endif
%endmacro

%endif ;======================================================================

%ifndef WIN64
    %macro WIN64_SPILL_XMM 1
    %endmacro
    %macro WIN64_RESTORE_XMM 1
    %endmacro
%endif



;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Symbol prefix for C linkage
%macro cglobal 1-2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    %xdefine %1.skip_prologue %1 %+ .skip_prologue
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro

%macro cextern 1
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    extern %1
%endmacro

; like cextern, but without the prefix
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(program_name %+ _ %+ %1)
    global %1
    %1: %2
%endmacro

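; e.g. (illustrative; the names below are only examples)
;     const   pw_1, times 8 dw 1   ; defines and exports ff_pw_1
;     cextern sin_table            ; references the prefixed symbol ff_sin_table
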
; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif

; merge mmx and sse*

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0
    %define RESET_MM_PERMUTATION INIT_MMX
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nmm, %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro INIT_XMM 0
    %define RESET_MM_PERMUTATION INIT_XMM
    %define mmsize 16
    %define num_mmregs 8
    %ifdef ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
%endmacro

INIT_MMX

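; e.g. (illustrative) the same template assembles against either register file,
; depending on which INIT_* was last invoked:
;     INIT_XMM
;     mova m0, [r0q]    ; expands to: movdqa xmm0, [...]
;     INIT_MMX
;     mova m0, [r0q]    ; expands to: movq   mm0,  [...]
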
; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping some
; arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should automatically
; change everything that follows. For more complex macros I may also have multiple
; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
; permutes its arguments. It's equivalent to exchanging the contents of the
; registers, except that this way you exchange the register names instead, so it
; doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
    %rep %0/2
        %xdefine tmp%2 m%2
        %xdefine ntmp%2 nm%2
        %rotate 2
    %endrep
    %rep %0/2
        %xdefine m%1 tmp%2
        %xdefine nm%1 ntmp%2
        %undef tmp%2
        %undef ntmp%2
        %rotate 2
    %endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
    %rep %0-1
        %ifdef m%1
            %xdefine tmp m%1
            %xdefine m%1 m%2
            %xdefine m%2 tmp
            CAT_XDEFINE n, m%1, %1
            CAT_XDEFINE n, m%2, %2
        %else
            ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
            ; Be careful using this mode in nested macros though, as in some cases there may be
            ; other copies of m# that have already been dereferenced and don't get updated correctly.
            %xdefine %%n1 n %+ %1
            %xdefine %%n2 n %+ %2
            %xdefine tmp m %+ %%n1
            CAT_XDEFINE m, %%n1, m %+ %%n2
            CAT_XDEFINE m, %%n2, tmp
            CAT_XDEFINE n, m %+ %%n1, %%n1
            CAT_XDEFINE n, m %+ %%n2, %%n2
        %endif
        %undef tmp
        %rotate 1
    %endrep
%endmacro

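; e.g. (illustrative) if a macro computes its result into m2 but callers expect
; it in m0, ending the macro with
;     SWAP 0, 2
; renames the registers (m0 <-> m2) at assembly time instead of emitting a move.
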
; If SAVE_MM_PERMUTATION is placed at the end of a function and given the
; function name, then any later calls to that function will automatically
; load the permutation, so values can be returned in mmregs.
%macro SAVE_MM_PERMUTATION 1 ; name to save as
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %1_m, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, %1_m %+ %%i
        CAT_XDEFINE n, m %+ %%i, %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro call 1
    call %1
    %ifdef %1_m0
        LOAD_MM_PERMUTATION %1
    %endif
%endmacro

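; e.g. (illustrative; "helper" is a made-up name) a subroutine that leaves its
; result in m0 under whatever permutation it ended with:
;     helper:
;         ...
;         SWAP 0, 1
;         SAVE_MM_PERMUTATION helper
;         ret
; a later "call helper" (via the call macro above) reloads that permutation, so
; the caller's m0 refers to the same physical register the helper wrote.
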
; Substitutions that reduce instruction size but are functionally equivalent.
; x86 sign-extends 8-bit immediates, so +128 requires a 32-bit immediate while
; -128 fits in 8 bits; swapping add/sub with the negated constant encodes shorter.
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro