This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] mips/o32: fix internal_syscall5/6/7
On Wed, 16 Aug 2017, Joseph Myers wrote:
> > If the answer to any of these questions is "yes", then would factoring
> > out the syscall `asm' along with the associated VLA declaration to a
> > helper `always_inline' function help or would it not?
>
> I don't think that would help. An asm can never make assumptions about
> which parts of the stack are used for what, only use its operands.
There may be ABI restrictions however, which could provide guarantees
beyond those resulting from the lone `asm' operands. And it would be
enough if we could prove that a certain arrangement has to be done in
order not to break the ABI. I can't think of anything right now though
and if neither you nor anyone else can, then we'll have to live with what
we have right now.
> > I mean it is a tiny optimisation, but some syscalls are frequently
> > called, so if we can avoid a waste of resources, then why not?
>
> Are any 5/6/7-argument syscalls frequently called?
Good question, however I have no data available.
Anyway, here's my counter-proposal implementing the approach previously
outlined. I have passed it through regular MIPS o32 testing with these
changes in test outputs resulting:
@@ -2575,7 +2575,7 @@
PASS: nptl/tst-cond22
PASS: nptl/tst-cond23
PASS: nptl/tst-cond24
-FAIL: nptl/tst-cond25
+PASS: nptl/tst-cond25
PASS: nptl/tst-cond3
PASS: nptl/tst-cond4
PASS: nptl/tst-cond5
@@ -2704,7 +2704,7 @@
PASS: nptl/tst-rwlock12
PASS: nptl/tst-rwlock13
PASS: nptl/tst-rwlock14
-FAIL: nptl/tst-rwlock15
+PASS: nptl/tst-rwlock15
PASS: nptl/tst-rwlock16
PASS: nptl/tst-rwlock17
PASS: nptl/tst-rwlock18
The drawback is it adds a bit to code generated, e.g. `__libc_pwrite'
(from nptl/pwrite.o and nptl/pwrite.os) grows by 4 and 6 instructions
respectively for non-PIC and PIC code respectively, and the whole
libraries:
text data bss dec hex filename
1483315 21129 11560 1516004 1721e4 libc.so
105482 960 8448 114890 1c0ca nptl/libpthread.so
vs:
text data bss dec hex filename
1484295 21133 11560 1516988 1725bc libc.so
105974 960 8448 115382 1c2b6 nptl/libpthread.so
due to the insertion of the VLA size calculation (although GCC is smart
enough to reuse a value of 0 already available, e.g.:
38: 7c03e83b rdhwr v1,$29
3c: 8c638b70 lw v1,-29840(v1)
40: 14600018 bnez v1,a4 <__libc_pwrite+0xa4>
44: 000787c3 sra s0,a3,0x1f
48: 000318c0 sll v1,v1,0x3
4c: 03a08825 move s1,sp
50: 03a3e823 subu sp,sp,v1
and save an isntruction) and the use of an extra register to preserve the
value of $sp across the block containing the VLA (as also seen with $s1 in
the disassembly above) even though it could use $fp that holds the same
value instead (e.g. continuing from the above:
74: 0220e825 move sp,s1
78: 03c0e825 move sp,s8
). It would be good to know how this compares to Adhemerval's proposal.
Maciej
* sysdeps/unix/sysv/linux/mips/mips32/sysdep.h
(FORCE_FRAME_POINTER): Remove macro.
(internal_syscall5): Use a variable-length array to force the
use of a frame pointer.
(internal_syscall6): Likewise.
(internal_syscall7): Likewise.
---
sysdeps/unix/sysv/linux/mips/mips32/sysdep.h | 24 +++++++++++++++++-------
1 file changed, 17 insertions(+), 7 deletions(-)
glibc-mips-o32-syscall-stack.diff
Index: glibc/sysdeps/unix/sysv/linux/mips/mips32/sysdep.h
===================================================================
--- glibc.orig/sysdeps/unix/sysv/linux/mips/mips32/sysdep.h 2017-04-11 21:27:25.000000000 +0100
+++ glibc/sysdeps/unix/sysv/linux/mips/mips32/sysdep.h 2017-08-16 20:49:15.758029215 +0100
@@ -264,18 +264,20 @@
/* We need to use a frame pointer for the functions in which we
adjust $sp around the syscall, or debug information and unwind
- information will be $sp relative and thus wrong during the syscall. As
- of GCC 4.7, this is sufficient. */
-#define FORCE_FRAME_POINTER \
- void *volatile __fp_force __attribute__ ((unused)) = alloca (4)
+ information will be $sp relative and thus wrong during the syscall.
+ We use a variable-length array to persuade GCC to use $fp. */
#define internal_syscall5(v0_init, input, number, err, \
arg1, arg2, arg3, arg4, arg5) \
({ \
long _sys_result; \
\
- FORCE_FRAME_POINTER; \
+ size_t s = 0; \
+ asm ("" : "+r" (s)); \
{ \
+ char vla[s << 3]; \
+ asm ("" : : "p" (vla)); \
+ \
register long __s0 asm ("$16") __attribute__ ((unused)) \
= (number); \
register long __v0 asm ("$2"); \
@@ -306,8 +308,12 @@
({ \
long _sys_result; \
\
- FORCE_FRAME_POINTER; \
+ size_t s = 0; \
+ asm ("" : "+r" (s)); \
{ \
+ char vla[s << 3]; \
+ asm ("" : : "p" (vla)); \
+ \
register long __s0 asm ("$16") __attribute__ ((unused)) \
= (number); \
register long __v0 asm ("$2"); \
@@ -339,8 +345,12 @@
({ \
long _sys_result; \
\
- FORCE_FRAME_POINTER; \
+ size_t s = 0; \
+ asm ("" : "+r" (s)); \
{ \
+ char vla[s << 3]; \
+ asm ("" : : "p" (vla)); \
+ \
register long __s0 asm ("$16") __attribute__ ((unused)) \
= (number); \
register long __v0 asm ("$2"); \