[PATCH v6] x86-64: Allocate state buffer space for RDI, RSI and RBX

Sunil Pandey skpgkp2@gmail.com
Mon Mar 18 23:20:41 GMT 2024


On Mon, Mar 18, 2024 at 6:40 AM H.J. Lu <hjl.tools@gmail.com> wrote:

> _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning stack.
> After realigning stack, it saves RCX, RDX, R8, R9, R10 and R11.  Define
> TLSDESC_CALL_REGISTER_SAVE_AREA to allocate space for RDI, RSI and RBX
> to avoid clobbering saved RDI, RSI and RBX values on stack by xsave to
> STATE_SAVE_OFFSET(%rsp).
>
>    +==================+<- stack frame start aligned at 8 or 16 bytes
>    |                  |<- RDI saved in the red zone
>    |                  |<- RSI saved in the red zone
>    |                  |<- RBX saved in the red zone
>    |                  |<- paddings for stack realignment of 64 bytes
>    |------------------|<- xsave buffer end aligned at 64 bytes
>    |                  |<-
>    |                  |<-
>    |                  |<-
>    |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)
>    |                  |<- 8-byte padding for 64-byte alignment
>    |                  |<- 8-byte padding for 64-byte alignment
>    |                  |<- R11
>    |                  |<- R10
>    |                  |<- R9
>    |                  |<- R8
>    |                  |<- RDX
>    |                  |<- RCX
>    +==================+<- RSP aligned at 64 bytes
>
> Define TLSDESC_CALL_REGISTER_SAVE_AREA, the total register save area size
> for all integer registers by adding 24 to STATE_SAVE_OFFSET since RDI, RSI
> and RBX are saved onto stack without adjusting stack pointer first, using
> the red-zone.  This fixes BZ #31501.
> ---
>  sysdeps/x86/cpu-features.c         | 11 ++--
>  sysdeps/x86/sysdep.h               | 60 ++++++++++++++++++---
>  sysdeps/x86_64/tst-gnu2-tls2mod1.S | 87 ++++++++++++++++++++++++++++++
>  3 files changed, 147 insertions(+), 11 deletions(-)
>  create mode 100644 sysdeps/x86_64/tst-gnu2-tls2mod1.S
>
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 4ea373dffa..3d7c2819d7 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -311,7 +311,7 @@ update_active (struct cpu_features *cpu_features)
>               /* NB: On AMX capable processors, ebx always includes AMX
>                  states.  */
>               unsigned int xsave_state_full_size
> -               = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);
> +               = ALIGN_UP (ebx + TLSDESC_CALL_REGISTER_SAVE_AREA, 64);
>
>               cpu_features->xsave_state_size
>                 = xsave_state_full_size;
> @@ -401,8 +401,10 @@ update_active (struct cpu_features *cpu_features)
>                       unsigned int amx_size
>                         = (xstate_amx_comp_offsets[31]
>                            + xstate_amx_comp_sizes[31]);
> -                     amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET,
> -                                          64);
> +                     amx_size
> +                       = ALIGN_UP ((amx_size
> +                                    + TLSDESC_CALL_REGISTER_SAVE_AREA),
> +                                   64);
>                       /* Set xsave_state_full_size to the compact AMX
>                          state size for XSAVEC.  NB: xsave_state_full_size
>                          is only used in _dl_tlsdesc_dynamic_xsave and
> @@ -410,7 +412,8 @@ update_active (struct cpu_features *cpu_features)
>                       cpu_features->xsave_state_full_size = amx_size;
>  #endif
>                       cpu_features->xsave_state_size
> -                       = ALIGN_UP (size + STATE_SAVE_OFFSET, 64);
> +                       = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
> +                                   64);
>                       CPU_FEATURE_SET (cpu_features, XSAVEC);
>                     }
>                 }
> diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> index db8e576e91..7359149e17 100644
> --- a/sysdeps/x86/sysdep.h
> +++ b/sysdeps/x86/sysdep.h
> @@ -38,14 +38,59 @@
>  #ifdef __x86_64__
>  /* Offset for fxsave/xsave area used by _dl_runtime_resolve.  Also need
>     space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX.  It must be
> -   aligned to 16 bytes for fxsave and 64 bytes for xsave.
> -
> -   NB: Is is non-zero because of the 128-byte red-zone.  Some registers
> -   are saved on stack without adjusting stack pointer first.  When we
> -   update stack pointer to allocate more space, we need to take the
> -   red-zone into account.  */
> +   aligned to 16 bytes for fxsave and 64 bytes for xsave.  It is non-zero
> +   because MOV, instead of PUSH, is used to save registers onto stack.
> +
> +   +==================+<- stack frame start aligned at 8 or 16 bytes
> +   |                  |<- paddings for stack realignment of 64 bytes
> +   |------------------|<- xsave buffer end aligned at 64 bytes
> +   |                  |<-
> +   |                  |<-
> +   |                  |<-
> +   |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)
> +   |                  |<- 8-byte padding for 64-byte alignment
> +   |                  |<- R9
> +   |                  |<- R8
> +   |                  |<- RDI
> +   |                  |<- RSI
> +   |                  |<- RDX
> +   |                  |<- RCX
> +   |                  |<- RAX
> +   +==================+<- RSP aligned at 64 bytes
> +
> + */
>  # define STATE_SAVE_OFFSET (8 * 7 + 8)
>
> +/* _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning
> +   stack.  After realigning stack, it saves RCX, RDX, R8, R9, R10 and
> +   R11.  Allocate space for RDI, RSI and RBX to avoid clobbering saved
> +   RDI, RSI and RBX values on stack by xsave.
> +
> +   +==================+<- stack frame start aligned at 8 or 16 bytes
> +   |                  |<- RDI saved in the red zone
> +   |                  |<- RSI saved in the red zone
> +   |                  |<- RBX saved in the red zone
> +   |                  |<- paddings for stack realignment of 64 bytes
> +   |------------------|<- xsave buffer end aligned at 64 bytes
> +   |                  |<-
> +   |                  |<-
> +   |                  |<-
> +   |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)
> +   |                  |<- 8-byte padding for 64-byte alignment
> +   |                  |<- 8-byte padding for 64-byte alignment
> +   |                  |<- R11
> +   |                  |<- R10
> +   |                  |<- R9
> +   |                  |<- R8
> +   |                  |<- RDX
> +   |                  |<- RCX
> +   +==================+<- RSP aligned at 64 bytes
> +
> +   Define the total register save area size for all integer registers by
> +   adding 24 to STATE_SAVE_OFFSET since RDI, RSI and RBX are saved onto
> +   stack without adjusting stack pointer first, using the red-zone.  */
> +# define TLSDESC_CALL_REGISTER_SAVE_AREA (STATE_SAVE_OFFSET + 24)
> +
>  /* Save SSE, AVX, AVX512, mask, bound and APX registers.  Bound and APX
>     registers are mutually exclusive.  */
>  # define STATE_SAVE_MASK               \
> @@ -66,8 +111,9 @@
>    (STATE_SAVE_MASK | AMX_STATE_SAVE_MASK)
>  #else
>  /* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic.  Since i386
> -   doesn't have red-zone, use 0 here.  */
> +   uses PUSH to save registers onto stack, use 0 here.  */
>  # define STATE_SAVE_OFFSET 0
> +# define TLSDESC_CALL_REGISTER_SAVE_AREA 0
>
>  /* Save SSE, AVX, AXV512, mask and bound registers.   */
>  # define STATE_SAVE_MASK               \
> diff --git a/sysdeps/x86_64/tst-gnu2-tls2mod1.S
> b/sysdeps/x86_64/tst-gnu2-tls2mod1.S
> new file mode 100644
> index 0000000000..1d636669ba
> --- /dev/null
> +++ b/sysdeps/x86_64/tst-gnu2-tls2mod1.S
> @@ -0,0 +1,87 @@
> +/* Check if TLSDESC relocation preserves %rdi, %rsi and %rbx.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +/* On AVX512 machines, OFFSET == 40 caused _dl_tlsdesc_dynamic_xsavec
> +   to clobber %rdi, %rsi and %rbx.  On Intel AVX CPUs, the state size
> +   is 960 bytes and this test didn't fail.  It may be due to the unused
> +   last 128 bytes.  On AMD AVX CPUs, the state size is 832 bytes and
> +   this test might fail without the fix.  */
> +#ifndef OFFSET
> +# define OFFSET 40
> +#endif
> +
> +       .text
> +       .p2align 4
> +       .globl  apply_tls
> +       .type   apply_tls, @function
> +apply_tls:
> +       cfi_startproc
> +       _CET_ENDBR
> +       pushq   %rbp
> +       cfi_def_cfa_offset (16)
> +       cfi_offset (6, -16)
> +       movdqu  (%RDI_LP), %xmm0
> +       lea     tls_var1@TLSDESC(%rip), %RAX_LP
> +       mov     %RSP_LP, %RBP_LP
> +       cfi_def_cfa_register (6)
> +       /* Align stack to 64 bytes.  */
> +       and     $-64, %RSP_LP
> +       sub     $OFFSET, %RSP_LP
> +       pushq   %rbx
> +       /* Set %ebx to 0xbadbeef.  */
> +       movl    $0xbadbeef, %ebx
> +       movl    $0xbadbeef, %esi
> +       movq    %rdi, saved_rdi(%rip)
> +       movq    %rsi, saved_rsi(%rip)
> +       call    *tls_var1@TLSCALL(%RAX_LP)
> +       /* Check if _dl_tlsdesc_dynamic preserves %rdi, %rsi and %rbx.  */
> +       cmpq    saved_rdi(%rip), %rdi
> +       jne     L(hlt)
> +       cmpq    saved_rsi(%rip), %rsi
> +       jne     L(hlt)
> +       cmpl    $0xbadbeef, %ebx
> +       jne     L(hlt)
> +       add     %fs:0, %RAX_LP
> +       movups  %xmm0, 32(%RAX_LP)
> +       movdqu  16(%RDI_LP), %xmm1
> +       mov     %RAX_LP, %RBX_LP
> +       movups  %xmm1, 48(%RAX_LP)
> +       lea     32(%RBX_LP), %RAX_LP
> +       pop     %rbx
> +       leave
> +       cfi_def_cfa (7, 8)
> +       ret
> +L(hlt):
> +       hlt
> +       cfi_endproc
> +       .size   apply_tls, .-apply_tls
> +       .hidden tls_var1
> +       .globl  tls_var1
> +       .section        .tbss,"awT",@nobits
> +       .align 16
> +       .type   tls_var1, @object
> +       .size   tls_var1, 3200
> +tls_var1:
> +       .zero   3200
> +       .local  saved_rdi
> +       .comm   saved_rdi,8,8
> +       .local  saved_rsi
> +       .comm   saved_rsi,8,8
> +       .section        .note.GNU-stack,"",@progbits
> --
> 2.44.0
>
>
LGTM
Reviewed-by: Sunil K Pandey <skpgkp2@gmail.com>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://sourceware.org/pipermail/libc-alpha/attachments/20240318/c50780c2/attachment-0001.htm>


More information about the Libc-alpha mailing list