<div dir="ltr"><div dir="ltr"><br></div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Mon, Mar 18, 2024 at 6:40 AM H.J. Lu <<a href="mailto:hjl.tools@gmail.com">hjl.tools@gmail.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">_dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning stack.<br>
After realigning stack, it saves RCX, RDX, R8, R9, R10 and R11. Define<br>
TLSDESC_CALL_REGISTER_SAVE_AREA to allocate space for RDI, RSI and RBX<br>
to avoid clobbering the saved RDI, RSI and RBX values on the stack when<br>
xsave writes to STATE_SAVE_OFFSET(%rsp).<br>
<br>
+==================+<- stack frame start aligned at 8 or 16 bytes<br>
| |<- RDI saved in the red zone<br>
| |<- RSI saved in the red zone<br>
| |<- RBX saved in the red zone<br>
| |<- paddings for stack realignment of 64 bytes<br>
|------------------|<- xsave buffer end aligned at 64 bytes<br>
| |<-<br>
| |<-<br>
| |<-<br>
|------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)<br>
| |<- 8-byte padding for 64-byte alignment<br>
| |<- 8-byte padding for 64-byte alignment<br>
| |<- R11<br>
| |<- R10<br>
| |<- R9<br>
| |<- R8<br>
| |<- RDX<br>
| |<- RCX<br>
+==================+<- RSP aligned at 64 bytes<br>
<br>
Define TLSDESC_CALL_REGISTER_SAVE_AREA, the total register save area size<br>
for all integer registers, by adding 24 to STATE_SAVE_OFFSET, since RDI,<br>
RSI and RBX are saved onto the stack without adjusting the stack pointer<br>
first, using the red-zone. This fixes BZ #31501.<br>
---<br>
sysdeps/x86/cpu-features.c | 11 ++--<br>
sysdeps/x86/sysdep.h | 60 ++++++++++++++++++---<br>
sysdeps/x86_64/tst-gnu2-tls2mod1.S | 87 ++++++++++++++++++++++++++++++<br>
3 files changed, 147 insertions(+), 11 deletions(-)<br>
create mode 100644 sysdeps/x86_64/tst-gnu2-tls2mod1.S<br>
<br>
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c<br>
index 4ea373dffa..3d7c2819d7 100644<br>
--- a/sysdeps/x86/cpu-features.c<br>
+++ b/sysdeps/x86/cpu-features.c<br>
@@ -311,7 +311,7 @@ update_active (struct cpu_features *cpu_features)<br>
/* NB: On AMX capable processors, ebx always includes AMX<br>
states. */<br>
unsigned int xsave_state_full_size<br>
- = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);<br>
+ = ALIGN_UP (ebx + TLSDESC_CALL_REGISTER_SAVE_AREA, 64);<br>
<br>
cpu_features->xsave_state_size<br>
= xsave_state_full_size;<br>
@@ -401,8 +401,10 @@ update_active (struct cpu_features *cpu_features)<br>
unsigned int amx_size<br>
= (xstate_amx_comp_offsets[31]<br>
+ xstate_amx_comp_sizes[31]);<br>
- amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET,<br>
- 64);<br>
+ amx_size<br>
+ = ALIGN_UP ((amx_size<br>
+ + TLSDESC_CALL_REGISTER_SAVE_AREA),<br>
+ 64);<br>
/* Set xsave_state_full_size to the compact AMX<br>
state size for XSAVEC. NB: xsave_state_full_size<br>
is only used in _dl_tlsdesc_dynamic_xsave and<br>
@@ -410,7 +412,8 @@ update_active (struct cpu_features *cpu_features)<br>
cpu_features->xsave_state_full_size = amx_size;<br>
#endif<br>
cpu_features->xsave_state_size<br>
- = ALIGN_UP (size + STATE_SAVE_OFFSET, 64);<br>
+ = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,<br>
+ 64);<br>
CPU_FEATURE_SET (cpu_features, XSAVEC);<br>
}<br>
}<br>
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h<br>
index db8e576e91..7359149e17 100644<br>
--- a/sysdeps/x86/sysdep.h<br>
+++ b/sysdeps/x86/sysdep.h<br>
@@ -38,14 +38,59 @@<br>
#ifdef __x86_64__<br>
/* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need<br>
space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be<br>
- aligned to 16 bytes for fxsave and 64 bytes for xsave.<br>
-<br>
- NB: Is is non-zero because of the 128-byte red-zone. Some registers<br>
- are saved on stack without adjusting stack pointer first. When we<br>
- update stack pointer to allocate more space, we need to take the<br>
- red-zone into account. */<br>
+ aligned to 16 bytes for fxsave and 64 bytes for xsave. It is non-zero<br>
+ because MOV, instead of PUSH, is used to save registers onto stack.<br>
+<br>
+ +==================+<- stack frame start aligned at 8 or 16 bytes<br>
+ | |<- paddings for stack realignment of 64 bytes<br>
+ |------------------|<- xsave buffer end aligned at 64 bytes<br>
+ | |<-<br>
+ | |<-<br>
+ | |<-<br>
+ |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)<br>
+ | |<- 8-byte padding for 64-byte alignment<br>
+ | |<- R9<br>
+ | |<- R8<br>
+ | |<- RDI<br>
+ | |<- RSI<br>
+ | |<- RDX<br>
+ | |<- RCX<br>
+ | |<- RAX<br>
+ +==================+<- RSP aligned at 64 bytes<br>
+<br>
+ */<br>
# define STATE_SAVE_OFFSET (8 * 7 + 8)<br>
<br>
+/* _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning<br>
+ stack. After realigning stack, it saves RCX, RDX, R8, R9, R10 and<br>
+ R11. Allocate space for RDI, RSI and RBX to avoid clobbering saved<br>
+ RDI, RSI and RBX values on stack by xsave.<br>
+<br>
+ +==================+<- stack frame start aligned at 8 or 16 bytes<br>
+ | |<- RDI saved in the red zone<br>
+ | |<- RSI saved in the red zone<br>
+ | |<- RBX saved in the red zone<br>
+ | |<- paddings for stack realignment of 64 bytes<br>
+ |------------------|<- xsave buffer end aligned at 64 bytes<br>
+ | |<-<br>
+ | |<-<br>
+ | |<-<br>
+ |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)<br>
+ | |<- 8-byte padding for 64-byte alignment<br>
+ | |<- 8-byte padding for 64-byte alignment<br>
+ | |<- R11<br>
+ | |<- R10<br>
+ | |<- R9<br>
+ | |<- R8<br>
+ | |<- RDX<br>
+ | |<- RCX<br>
+ +==================+<- RSP aligned at 64 bytes<br>
+<br>
+ Define the total register save area size for all integer registers by<br>
+ adding 24 to STATE_SAVE_OFFSET since RDI, RSI and RBX are saved onto<br>
+ stack without adjusting stack pointer first, using the red-zone. */<br>
+# define TLSDESC_CALL_REGISTER_SAVE_AREA (STATE_SAVE_OFFSET + 24)<br>
+<br>
/* Save SSE, AVX, AVX512, mask, bound and APX registers. Bound and APX<br>
registers are mutually exclusive. */<br>
# define STATE_SAVE_MASK \<br>
@@ -66,8 +111,9 @@<br>
(STATE_SAVE_MASK | AMX_STATE_SAVE_MASK)<br>
#else<br>
/* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic. Since i386<br>
- doesn't have red-zone, use 0 here. */<br>
+ uses PUSH to save registers onto stack, use 0 here. */<br>
# define STATE_SAVE_OFFSET 0<br>
+# define TLSDESC_CALL_REGISTER_SAVE_AREA 0<br>
<br>
/* Save SSE, AVX, AXV512, mask and bound registers. */<br>
# define STATE_SAVE_MASK \<br>
diff --git a/sysdeps/x86_64/tst-gnu2-tls2mod1.S b/sysdeps/x86_64/tst-gnu2-tls2mod1.S<br>
new file mode 100644<br>
index 0000000000..1d636669ba<br>
--- /dev/null<br>
+++ b/sysdeps/x86_64/tst-gnu2-tls2mod1.S<br>
@@ -0,0 +1,87 @@<br>
+/* Check if TLSDESC relocation preserves %rdi, %rsi and %rbx.<br>
+ Copyright (C) 2024 Free Software Foundation, Inc.<br>
+ This file is part of the GNU C Library.<br>
+<br>
+ The GNU C Library is free software; you can redistribute it and/or<br>
+ modify it under the terms of the GNU Lesser General Public<br>
+ License as published by the Free Software Foundation; either<br>
+ version 2.1 of the License, or (at your option) any later version.<br>
+<br>
+ The GNU C Library is distributed in the hope that it will be useful,<br>
+ but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU<br>
+ Lesser General Public License for more details.<br>
+<br>
+ You should have received a copy of the GNU Lesser General Public<br>
+ License along with the GNU C Library; if not, see<br>
+ <<a href="http://www.gnu.org/licenses/" rel="noreferrer" target="_blank">http://www.gnu.org/licenses/</a>>. */<br>
+<br>
+#include <sysdep.h><br>
+<br>
+/* On AVX512 machines, OFFSET == 40 caused _dl_tlsdesc_dynamic_xsavec<br>
+ to clobber %rdi, %rsi and %rbx. On Intel AVX CPUs, the state size<br>
+ is 960 bytes and this test didn't fail. It may be due to the unused<br>
+ last 128 bytes. On AMD AVX CPUs, the state size is 832 bytes and<br>
+ this test might fail without the fix. */<br>
+#ifndef OFFSET<br>
+# define OFFSET 40<br>
+#endif<br>
+<br>
+ .text<br>
+ .p2align 4<br>
+ .globl apply_tls<br>
+ .type apply_tls, @function<br>
+apply_tls:<br>
+ cfi_startproc<br>
+ _CET_ENDBR<br>
+ pushq %rbp<br>
+ cfi_def_cfa_offset (16)<br>
+ cfi_offset (6, -16)<br>
+ movdqu (%RDI_LP), %xmm0<br>
+ lea tls_var1@TLSDESC(%rip), %RAX_LP<br>
+ mov %RSP_LP, %RBP_LP<br>
+ cfi_def_cfa_register (6)<br>
+ /* Align stack to 64 bytes. */<br>
+ and $-64, %RSP_LP<br>
+ sub $OFFSET, %RSP_LP<br>
+ pushq %rbx<br>
+ /* Set %ebx to 0xbadbeef. */<br>
+ movl $0xbadbeef, %ebx<br>
+ movl $0xbadbeef, %esi<br>
+ movq %rdi, saved_rdi(%rip)<br>
+ movq %rsi, saved_rsi(%rip)<br>
+ call *tls_var1@TLSCALL(%RAX_LP)<br>
+ /* Check if _dl_tlsdesc_dynamic preserves %rdi, %rsi and %rbx. */<br>
+ cmpq saved_rdi(%rip), %rdi<br>
+ jne L(hlt)<br>
+ cmpq saved_rsi(%rip), %rsi<br>
+ jne L(hlt)<br>
+ cmpl $0xbadbeef, %ebx<br>
+ jne L(hlt)<br>
+ add %fs:0, %RAX_LP<br>
+ movups %xmm0, 32(%RAX_LP)<br>
+ movdqu 16(%RDI_LP), %xmm1<br>
+ mov %RAX_LP, %RBX_LP<br>
+ movups %xmm1, 48(%RAX_LP)<br>
+ lea 32(%RBX_LP), %RAX_LP<br>
+ pop %rbx<br>
+ leave<br>
+ cfi_def_cfa (7, 8)<br>
+ ret<br>
+L(hlt):<br>
+ hlt<br>
+ cfi_endproc<br>
+ .size apply_tls, .-apply_tls<br>
+ .hidden tls_var1<br>
+ .globl tls_var1<br>
+ .section .tbss,"awT",@nobits<br>
+ .align 16<br>
+ .type tls_var1, @object<br>
+ .size tls_var1, 3200<br>
+tls_var1:<br>
+ .zero 3200<br>
+ .local saved_rdi<br>
+ .comm saved_rdi,8,8<br>
+ .local saved_rsi<br>
+ .comm saved_rsi,8,8<br>
+ .section .note.GNU-stack,"",@progbits<br>
-- <br>
2.44.0<br>
<br></blockquote><div><br></div><div>LGTM</div><div>Reviewed-by: Sunil K Pandey <<a href="mailto:skpgkp2@gmail.com">skpgkp2@gmail.com</a>> </div></div></div>