This is the mail archive of the
libffi-discuss@sourceware.org
mailing list for the libffi project.
[PATCH 7/8] x86_64: Decouple return types from FFI_TYPE constants
- From: Richard Henderson <rth at twiddle dot net>
- To: libffi-discuss at sourceware dot org
- Date: Tue, 28 Oct 2014 11:31:33 -0700
- Subject: [PATCH 7/8] x86_64: Decouple return types from FFI_TYPE constants
- Authentication-results: sourceware.org; auth=none
- References: <1414521094-18403-1-git-send-email-rth at twiddle dot net>
This lets us better support structure returns, and serves as
preparation for complex types.
---
src/x86/ffi64.c | 142 ++++++++++++++++++-------------
src/x86/internal64.h | 20 +++++
src/x86/unix64.S | 236 +++++++++++++++++++++------------------------------
3 files changed, 202 insertions(+), 196 deletions(-)
create mode 100644 src/x86/internal64.h
diff --git a/src/x86/ffi64.c b/src/x86/ffi64.c
index 65fb595..a03061b 100644
--- a/src/x86/ffi64.c
+++ b/src/x86/ffi64.c
@@ -33,6 +33,7 @@
#include <stdlib.h>
#include <stdarg.h>
#include <stdint.h>
+#include "internal64.h"
#ifdef __x86_64__
@@ -191,7 +192,7 @@ classify_argument (ffi_type *type, enum x86_64_reg_class classes[],
}
else if (size <= 16)
{
- classes[0] = classes[1] = X86_64_INTEGERSI_CLASS;
+ classes[0] = classes[1] = X86_64_INTEGER_CLASS;
return 2;
}
else
@@ -360,15 +361,55 @@ ffi_prep_cif_machdep (ffi_cif *cif)
int gprcount, ssecount, i, avn, ngpr, nsse, flags;
enum x86_64_reg_class classes[MAX_CLASSES];
size_t bytes, n;
+ ffi_type *rtype;
if (cif->abi != FFI_UNIX64)
return FFI_BAD_ABI;
gprcount = ssecount = 0;
- flags = cif->rtype->type;
- if (flags != FFI_TYPE_VOID)
+ rtype = cif->rtype;
+ switch (rtype->type)
{
+ case FFI_TYPE_VOID:
+ flags = UNIX64_RET_VOID;
+ break;
+ case FFI_TYPE_UINT8:
+ flags = UNIX64_RET_UINT8;
+ break;
+ case FFI_TYPE_SINT8:
+ flags = UNIX64_RET_SINT8;
+ break;
+ case FFI_TYPE_UINT16:
+ flags = UNIX64_RET_UINT16;
+ break;
+ case FFI_TYPE_SINT16:
+ flags = UNIX64_RET_SINT16;
+ break;
+ case FFI_TYPE_UINT32:
+ flags = UNIX64_RET_UINT32;
+ break;
+ case FFI_TYPE_INT:
+ case FFI_TYPE_SINT32:
+ flags = UNIX64_RET_SINT32;
+ break;
+ case FFI_TYPE_UINT64:
+ case FFI_TYPE_SINT64:
+ flags = UNIX64_RET_INT64;
+ break;
+ case FFI_TYPE_POINTER:
+ flags = (sizeof(void *) == 4 ? UNIX64_RET_UINT32 : UNIX64_RET_INT64);
+ break;
+ case FFI_TYPE_FLOAT:
+ flags = UNIX64_RET_XMM32;
+ break;
+ case FFI_TYPE_DOUBLE:
+ flags = UNIX64_RET_XMM64;
+ break;
+ case FFI_TYPE_LONGDOUBLE:
+ flags = UNIX64_RET_X87;
+ break;
+ case FFI_TYPE_STRUCT:
n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
if (n == 0)
{
@@ -376,22 +417,24 @@ ffi_prep_cif_machdep (ffi_cif *cif)
memory is the first argument. Allocate a register for it. */
gprcount++;
/* We don't have to do anything in asm for the return. */
- flags = FFI_TYPE_VOID;
+ flags = UNIX64_RET_VOID | UNIX64_FLAG_RET_IN_MEM;
}
- else if (flags == FFI_TYPE_STRUCT)
+ else
{
/* Mark which registers the result appears in. */
_Bool sse0 = SSE_CLASS_P (classes[0]);
_Bool sse1 = n == 2 && SSE_CLASS_P (classes[1]);
- if (sse0 && !sse1)
- flags |= 1 << 8;
- else if (!sse0 && sse1)
- flags |= 1 << 9;
- else if (sse0 && sse1)
- flags |= 1 << 10;
+ if (sse0)
+ flags = (sse1 ? UNIX64_RET_ST_XMM0_XMM1 : UNIX64_RET_ST_XMM0_RAX);
+ else
+ flags = (sse1 ? UNIX64_RET_ST_RAX_XMM0 : UNIX64_RET_ST_RAX_RDX);
+
/* Mark the true size of the structure. */
- flags |= cif->rtype->size << 12;
+ flags |= rtype->size << UNIX64_SIZE_SHIFT;
}
+ break;
+ default:
+ return FFI_BAD_TYPEDEF;
}
/* Go over all arguments and determine the way they should be passed.
@@ -418,9 +461,10 @@ ffi_prep_cif_machdep (ffi_cif *cif)
}
}
if (ssecount)
- flags |= 1 << 11;
+ flags |= UNIX64_FLAG_XMM_ARGS;
+
cif->flags = flags;
- cif->bytes = (unsigned)ALIGN (bytes, 8);
+ cif->bytes = ALIGN (bytes, 8);
return FFI_OK;
}
@@ -432,20 +476,22 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
enum x86_64_reg_class classes[MAX_CLASSES];
char *stack, *argp;
ffi_type **arg_types;
- int gprcount, ssecount, ngpr, nsse, i, avn;
- _Bool ret_in_memory;
+ int gprcount, ssecount, ngpr, nsse, i, avn, flags;
struct register_args *reg_args;
/* Can't call 32-bit mode from 64-bit mode. */
FFI_ASSERT (cif->abi == FFI_UNIX64);
/* If the return value is a struct and we don't have a return value
- address then we need to make one. Note the setting of flags to
- VOID above in ffi_prep_cif_machdep. */
- ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT
- && (cif->flags & 0xff) == FFI_TYPE_VOID);
- if (rvalue == NULL && ret_in_memory)
- rvalue = alloca (cif->rtype->size);
+ address then we need to make one. Otherwise we can ignore it. */
+ flags = cif->flags;
+ if (rvalue == NULL)
+ {
+ if (flags & UNIX64_FLAG_RET_IN_MEM)
+ rvalue = alloca (cif->rtype->size);
+ else
+ flags = UNIX64_RET_VOID;
+ }
/* Allocate the space for the arguments, plus 4 words of temp space. */
stack = alloca (sizeof (struct register_args) + cif->bytes + 4*8);
@@ -458,7 +504,7 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
/* If the return value is passed in memory, add the pointer as the
first integer argument. */
- if (ret_in_memory)
+ if (flags & UNIX64_FLAG_RET_IN_MEM)
reg_args->gpr[gprcount++] = (unsigned long) rvalue;
avn = cif->nargs;
@@ -503,17 +549,17 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
switch (arg_types[i]->type)
{
case FFI_TYPE_SINT8:
- *(SINT64 *)&reg_args->gpr[gprcount] = (SINT64) *((SINT8 *) a);
+ reg_args->gpr[gprcount] = (SINT64) *((SINT8 *) a);
break;
case FFI_TYPE_SINT16:
- *(SINT64 *)&reg_args->gpr[gprcount] = (SINT64) *((SINT16 *) a);
+ reg_args->gpr[gprcount] = (SINT64) *((SINT16 *) a);
break;
case FFI_TYPE_SINT32:
- *(SINT64 *)&reg_args->gpr[gprcount] = (SINT64) *((SINT32 *) a);
+ reg_args->gpr[gprcount] = (SINT64) *((SINT32 *) a);
break;
default:
reg_args->gpr[gprcount] = 0;
- memcpy (&reg_args->gpr[gprcount], a, size < 8 ? size : 8);
+ memcpy (&reg_args->gpr[gprcount], a, size);
}
gprcount++;
break;
@@ -533,7 +579,7 @@ ffi_call_int (ffi_cif *cif, void (*fn)(void), void *rvalue,
reg_args->rax = ssecount;
ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
- cif->flags, rvalue, fn);
+ flags, rvalue, fn);
}
void
@@ -573,7 +619,7 @@ ffi_prep_closure_loc (ffi_closure* closure,
if (cif->abi != FFI_UNIX64)
return FFI_BAD_ABI;
- if (cif->flags & (1 << 11))
+ if (cif->flags & UNIX64_FLAG_XMM_ARGS)
dest = ffi_closure_unix64_sse;
else
dest = ffi_closure_unix64;
@@ -600,39 +646,17 @@ ffi_closure_unix64_inner(ffi_cif *cif,
ffi_type **arg_types;
long i, avn;
int gprcount, ssecount, ngpr, nsse;
- int ret;
+ int flags;
- avalue = alloca(cif->nargs * sizeof(void *));
+ avn = cif->nargs;
+ flags = cif->flags;
+ avalue = alloca(avn * sizeof(void *));
gprcount = ssecount = 0;
- ret = cif->rtype->type;
- if (ret != FFI_TYPE_VOID)
- {
- enum x86_64_reg_class classes[MAX_CLASSES];
- size_t n = examine_argument (cif->rtype, classes, 1, &ngpr, &nsse);
- if (n == 0)
- {
- /* The return value goes in memory. Arrange for the closure
- return value to go directly back to the original caller. */
- rvalue = (void *) (unsigned long) reg_args->gpr[gprcount++];
- /* We don't have to do anything in asm for the return. */
- ret = FFI_TYPE_VOID;
- }
- else if (ret == FFI_TYPE_STRUCT && n == 2)
- {
- /* Mark which register the second word of the structure goes in. */
- _Bool sse0 = SSE_CLASS_P (classes[0]);
- _Bool sse1 = SSE_CLASS_P (classes[1]);
- if (!sse0 && sse1)
- ret |= 1 << 8;
- else if (sse0 && !sse1)
- ret |= 1 << 9;
- }
- }
+ if (flags & UNIX64_FLAG_RET_IN_MEM)
+ rvalue = (void *)(uintptr_t)reg_args->gpr[gprcount++];
- avn = cif->nargs;
arg_types = cif->arg_types;
-
for (i = 0; i < avn; ++i)
{
enum x86_64_reg_class classes[MAX_CLASSES];
@@ -693,7 +717,7 @@ ffi_closure_unix64_inner(ffi_cif *cif,
fun (cif, rvalue, avalue, user_data);
/* Tell assembly how to perform return type promotions. */
- return ret;
+ return flags;
}
extern void ffi_go_closure_unix64(void) FFI_HIDDEN;
@@ -706,7 +730,7 @@ ffi_prep_go_closure (ffi_go_closure* closure, ffi_cif* cif,
if (cif->abi != FFI_UNIX64)
return FFI_BAD_ABI;
- closure->tramp = (cif->flags & (1 << 11)
+ closure->tramp = (cif->flags & UNIX64_FLAG_XMM_ARGS
? ffi_go_closure_unix64_sse
: ffi_go_closure_unix64);
closure->cif = cif;
diff --git a/src/x86/internal64.h b/src/x86/internal64.h
new file mode 100644
index 0000000..07b1b10
--- /dev/null
+++ b/src/x86/internal64.h
@@ -0,0 +1,20 @@
+#define UNIX64_RET_VOID 0
+#define UNIX64_RET_UINT8 1
+#define UNIX64_RET_UINT16 2
+#define UNIX64_RET_UINT32 3
+#define UNIX64_RET_SINT8 4
+#define UNIX64_RET_SINT16 5
+#define UNIX64_RET_SINT32 6
+#define UNIX64_RET_INT64 7
+#define UNIX64_RET_XMM32 8
+#define UNIX64_RET_XMM64 9
+#define UNIX64_RET_X87 10
+#define UNIX64_RET_ST_RAX_RDX 11
+#define UNIX64_RET_ST_XMM0_RAX 12
+#define UNIX64_RET_ST_RAX_XMM0 13
+#define UNIX64_RET_ST_XMM0_XMM1 14
+#define UNIX64_RET_LAST 14
+
+#define UNIX64_FLAG_RET_IN_MEM (1 << 10)
+#define UNIX64_FLAG_XMM_ARGS (1 << 11)
+#define UNIX64_SIZE_SHIFT 12
diff --git a/src/x86/unix64.S b/src/x86/unix64.S
index 797b9d9..0151229 100644
--- a/src/x86/unix64.S
+++ b/src/x86/unix64.S
@@ -31,9 +31,15 @@
#include <fficonfig.h>
#include <ffi.h>
#include <ffi_cfi.h>
+#include "internal64.h"
.text
+.macro E index
+ .align 8
+ .org 0b + \index * 8, 0x90
+.endm
+
/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
void *raddr, void (*fnaddr)(void));
@@ -41,7 +47,7 @@
for this function. This has been allocated by ffi_call. We also
deallocate some of the stack that has been alloca'd. */
- .align 2
+ .align 8
.globl ffi_call_unix64
.type ffi_call_unix64,@function
FFI_HIDDEN(ffi_call_unix64)
@@ -100,109 +106,81 @@ ffi_call_unix64:
cfi_restore(%rbp)
/* The first byte of the flags contains the FFI_TYPE. */
+ cmpb $UNIX64_RET_LAST, %cl
movzbl %cl, %r10d
- leaq .Lstore_table(%rip), %r11
- movslq (%r11, %r10, 4), %r10
- addq %r11, %r10
- jmp *%r10
+ leaq 0f(%rip), %r11
+ ja 9f
+ leaq (%r11, %r10, 8), %r10
- .section .rodata
- .align 2
-.Lstore_table:
- .long .Lst_void-.Lstore_table /* FFI_TYPE_VOID */
- .long .Lst_sint32-.Lstore_table /* FFI_TYPE_INT */
- .long .Lst_float-.Lstore_table /* FFI_TYPE_FLOAT */
- .long .Lst_double-.Lstore_table /* FFI_TYPE_DOUBLE */
- .long .Lst_ldouble-.Lstore_table /* FFI_TYPE_LONGDOUBLE */
- .long .Lst_uint8-.Lstore_table /* FFI_TYPE_UINT8 */
- .long .Lst_sint8-.Lstore_table /* FFI_TYPE_SINT8 */
- .long .Lst_uint16-.Lstore_table /* FFI_TYPE_UINT16 */
- .long .Lst_sint16-.Lstore_table /* FFI_TYPE_SINT16 */
- .long .Lst_uint32-.Lstore_table /* FFI_TYPE_UINT32 */
- .long .Lst_sint32-.Lstore_table /* FFI_TYPE_SINT32 */
- .long .Lst_int64-.Lstore_table /* FFI_TYPE_UINT64 */
- .long .Lst_int64-.Lstore_table /* FFI_TYPE_SINT64 */
- .long .Lst_struct-.Lstore_table /* FFI_TYPE_STRUCT */
- .long .Lst_int64-.Lstore_table /* FFI_TYPE_POINTER */
- .previous
+ /* Prep for the structure cases: scratch area in redzone. */
+ leaq -20(%rsp), %rsi
+ jmp *%r10
- .align 2
-.Lst_void:
+ .align 8
+0:
+E UNIX64_RET_VOID
ret
- .align 2
-
-.Lst_uint8:
- movzbq %al, %rax
+E UNIX64_RET_UINT8
+ movzbl %al, %eax
movq %rax, (%rdi)
ret
- .align 2
-.Lst_sint8:
- movsbq %al, %rax
+E UNIX64_RET_UINT16
+ movzwl %ax, %eax
movq %rax, (%rdi)
ret
- .align 2
-.Lst_uint16:
- movzwq %ax, %rax
+E UNIX64_RET_UINT32
+ movl %eax, %eax
movq %rax, (%rdi)
- .align 2
-.Lst_sint16:
- movswq %ax, %rax
+ ret
+E UNIX64_RET_SINT8
+ movsbq %al, %rax
movq %rax, (%rdi)
ret
- .align 2
-.Lst_uint32:
- movl %eax, %eax
+E UNIX64_RET_SINT16
+ movswq %ax, %rax
movq %rax, (%rdi)
- .align 2
-.Lst_sint32:
+ ret
+E UNIX64_RET_SINT32
cltq
movq %rax, (%rdi)
ret
- .align 2
-.Lst_int64:
+E UNIX64_RET_INT64
movq %rax, (%rdi)
ret
-
- .align 2
-.Lst_float:
- movss %xmm0, (%rdi)
+E UNIX64_RET_XMM32
+ movd %xmm0, (%rdi)
ret
- .align 2
-.Lst_double:
- movsd %xmm0, (%rdi)
+E UNIX64_RET_XMM64
+ movq %xmm0, (%rdi)
ret
-.Lst_ldouble:
+E UNIX64_RET_X87
fstpt (%rdi)
ret
-
- .align 2
-.Lst_struct:
- leaq -20(%rsp), %rsi /* Scratch area in redzone. */
-
- /* We have to locate the values now, and since we don't want to
- write too much data into the user's return value, we spill the
- value to a 16 byte scratch area first. Bits 8, 9, and 10
- control where the values are located. Only one of the three
- bits will be set; see ffi_prep_cif_machdep for the pattern. */
- movd %xmm0, %r10
- movd %xmm1, %r11
- testl $0x100, %ecx
- cmovnz %rax, %rdx
- cmovnz %r10, %rax
- testl $0x200, %ecx
- cmovnz %r10, %rdx
- testl $0x400, %ecx
- cmovnz %r10, %rax
- cmovnz %r11, %rdx
- movq %rax, (%rsi)
+E UNIX64_RET_ST_RAX_RDX
movq %rdx, 8(%rsi)
-
- /* Bits 12-31 contain the true size of the structure. Copy from
- the scratch area to the true destination. */
- shrl $12, %ecx
+ jmp 2f
+E UNIX64_RET_ST_XMM0_RAX
+ movq %rax, 8(%rsi)
+ jmp 3f
+E UNIX64_RET_ST_RAX_XMM0
+ movq %xmm0, 8(%rsi)
+ jmp 2f
+E UNIX64_RET_ST_XMM0_XMM1
+ movq %xmm1, 8(%rsi)
+
+ .align 8
+3: movq %xmm0, (%rsi)
+ shrl $UNIX64_SIZE_SHIFT, %ecx
+ rep movsb
+ ret
+ .align 8
+2: movq %rax, (%rsi)
+ shrl $UNIX64_SIZE_SHIFT, %ecx
rep movsb
ret
+9: call abort@PLT
+
/* Many times we can avoid loading any SSE registers at all.
It's not worth an indirect jump to load the exact set of
SSE registers needed; zero or all is a good compromise. */
@@ -292,84 +270,68 @@ ffi_closure_unix64:
cfi_adjust_cfa_offset(-ffi_closure_FS)
/* The first byte of the return value contains the FFI_TYPE. */
+ cmpb $UNIX64_RET_LAST, %al
movzbl %al, %r10d
- leaq .Lload_table(%rip), %r11
- movslq (%r11, %r10, 4), %r10
- addq %r11, %r10
+ leaq 0f(%rip), %r11
+ ja 9f
+ leaq (%r11, %r10, 8), %r10
jmp *%r10
- .section .rodata
- .align 2
-.Lload_table:
- .long .Lld_void-.Lload_table /* FFI_TYPE_VOID */
- .long .Lld_int32-.Lload_table /* FFI_TYPE_INT */
- .long .Lld_float-.Lload_table /* FFI_TYPE_FLOAT */
- .long .Lld_double-.Lload_table /* FFI_TYPE_DOUBLE */
- .long .Lld_ldouble-.Lload_table /* FFI_TYPE_LONGDOUBLE */
- .long .Lld_int8-.Lload_table /* FFI_TYPE_UINT8 */
- .long .Lld_int8-.Lload_table /* FFI_TYPE_SINT8 */
- .long .Lld_int16-.Lload_table /* FFI_TYPE_UINT16 */
- .long .Lld_int16-.Lload_table /* FFI_TYPE_SINT16 */
- .long .Lld_int32-.Lload_table /* FFI_TYPE_UINT32 */
- .long .Lld_int32-.Lload_table /* FFI_TYPE_SINT32 */
- .long .Lld_int64-.Lload_table /* FFI_TYPE_UINT64 */
- .long .Lld_int64-.Lload_table /* FFI_TYPE_SINT64 */
- .long .Lld_struct-.Lload_table /* FFI_TYPE_STRUCT */
- .long .Lld_int64-.Lload_table /* FFI_TYPE_POINTER */
- .previous
-
- .align 2
-.Lld_void:
+ .align 8
+0:
+E UNIX64_RET_VOID
ret
-
- .align 2
-.Lld_int8:
+E UNIX64_RET_UINT8
movzbl ffi_closure_RED_RVALUE(%rsp), %eax
ret
- .align 2
-.Lld_int16:
+E UNIX64_RET_UINT16
movzwl ffi_closure_RED_RVALUE(%rsp), %eax
ret
- .align 2
-.Lld_int32:
+E UNIX64_RET_UINT32
movl ffi_closure_RED_RVALUE(%rsp), %eax
ret
- .align 2
-.Lld_int64:
+E UNIX64_RET_SINT8
+ movsbl ffi_closure_RED_RVALUE(%rsp), %eax
+ ret
+E UNIX64_RET_SINT16
+ movswl ffi_closure_RED_RVALUE(%rsp), %eax
+ ret
+E UNIX64_RET_SINT32
+ movl ffi_closure_RED_RVALUE(%rsp), %eax
+ ret
+E UNIX64_RET_INT64
movq ffi_closure_RED_RVALUE(%rsp), %rax
ret
-
- .align 2
-.Lld_float:
- movss ffi_closure_RED_RVALUE(%rsp), %xmm0
+E UNIX64_RET_XMM32
+ movd ffi_closure_RED_RVALUE(%rsp), %xmm0
ret
- .align 2
-.Lld_double:
- movsd ffi_closure_RED_RVALUE(%rsp), %xmm0
+E UNIX64_RET_XMM64
+ movq ffi_closure_RED_RVALUE(%rsp), %xmm0
ret
- .align 2
-.Lld_ldouble:
+E UNIX64_RET_X87
fldt ffi_closure_RED_RVALUE(%rsp)
ret
-
- .align 2
-.Lld_struct:
- /* There are four possibilities here, %rax/%rdx, %xmm0/%rax,
- %rax/%xmm0, %xmm0/%xmm1. We collapse two by always loading
- both rdx and xmm1 with the second word. For the remaining,
- bit 8 set means xmm0 gets the second word, and bit 9 means
- that rax gets the second word. */
- movq ffi_closure_RED_RVALUE(%rsp), %rcx
+E UNIX64_RET_ST_RAX_RDX
movq ffi_closure_RED_RVALUE+8(%rsp), %rdx
+ jmp 2f
+E UNIX64_RET_ST_XMM0_RAX
+ movq ffi_closure_RED_RVALUE+8(%rsp), %rax
+ jmp 3f
+E UNIX64_RET_ST_RAX_XMM0
+ movq ffi_closure_RED_RVALUE+8(%rsp), %xmm0
+ jmp 2f
+E UNIX64_RET_ST_XMM0_XMM1
movq ffi_closure_RED_RVALUE+8(%rsp), %xmm1
- testl $0x100, %eax
- cmovnz %rdx, %rcx
- movd %rcx, %xmm0
- testl $0x200, %eax
- movq ffi_closure_RED_RVALUE(%rsp), %rax
- cmovnz %rdx, %rax
+
+ .align 8
+3: movq ffi_closure_RED_RVALUE(%rsp), %xmm0
+ ret
+ .align 8
+2: movq ffi_closure_RED_RVALUE(%rsp), %rax
ret
+9: call abort@PLT
+
cfi_endproc
.size ffi_closure_unix64,.-ffi_closure_unix64
--
1.9.3