Re: [PATCH] X86-64: Add _dl_runtime_resolve_avx[512]_opt [BZ #20508]
- From: Florian Weimer <fweimer at redhat dot com>
- To: "H.J. Lu" <hjl dot tools at gmail dot com>
- Cc: Richard Henderson <rth at twiddle dot net>, GNU C Library <libc-alpha at sourceware dot org>
- Date: Tue, 4 Oct 2016 21:20:25 +0200
- Subject: Re: [PATCH] X86-64: Add _dl_runtime_resolve_avx[512]_opt [BZ #20508]
On 10/04/2016 06:08 PM, H.J. Lu wrote:
>>>>> Good question.  This is also needed:
>>>>>
>>>>> commit f43cb35c9b3c35addc6dc0f1427caf51786ca1d2
>>>>> Author: H.J. Lu <hjl.tools@gmail.com>
>>>>> Date:   Fri Jul 1 05:54:43 2016 -0700
>>>>>
>>>>>     Require binutils 2.24 to build x86-64 glibc [BZ #20139]
>>>>
>>>> That's not really backportable, I'm afraid.  Our users don't expect
>>>> us to break builds in this way.
>>>
>>> Who are those users?
>>
>> We don't know, really.  But moving the baseline binutils requirement
>> forward in a stable release really contradicts what a stable release
>> is about.
>
> Do our users expect a broken glibc binary of a stable release on an
> AVX512 machine?

Either switch to the XSAVE approach, or encode the relevant opcodes
using .byte.

We used the latter in Red Hat Enterprise Linux 6, and I'm attaching our
patch in case it is helpful.
Florian
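
To make the .byte trick concrete: the patch hand-assembles each
EVEX-encoded instruction and emits the raw bytes, keeping the intended
mnemonic in a comment.  A minimal stand-alone sketch (the function name
is mine, not from the patch; the bytes can be sanity-checked by
assembling the commented mnemonic with a newer binutils and comparing
the objdump output):

	.text
	.globl	save_zmm0_example
	.type	save_zmm0_example, @function
	/* void save_zmm0_example (void *buf): store zmm0 to *buf.  */
save_zmm0_example:
	/* vmovdqu64 %zmm0, (%rdi) -- emitted as raw bytes because an
	   old assembler rejects the mnemonic.  62 f1 fe 48 is the EVEX
	   prefix (W1, F3, 0F map, 512-bit vector length), 7f the
	   opcode, 07 the modrm byte selecting (%rdi).  */
	.byte	0x62,0xf1,0xfe,0x48,0x7f,0x07
	ret
	.size	save_zmm0_example, .-save_zmm0_example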
#
# Based on the AVX-512 support for glibc, but heavily modified for rhel-6.7.
# Without assembler support we drop all of the configure checks and simply
# output, using .byte directives, the minimal AVX512 instructions required
# by the loader.  Likewise, testing is impossible, so instead we use
# the Intel emulator running in `-skx` (Skylake Xeon) emulation mode and
# verify that a pre-built set of tests passes.
#
# commit 6986b98a18490e76b16911d1c6b1ba013598d40d
# Author: Ulrich Drepper <drepper@gmail.com>
# Date: Wed Jul 20 14:20:00 2011 -0400
#
# Force La_x86_64_ymm to be 16-byte aligned
#
# commit aa4de9cea5c07d43caeaca9722c2d417e9a2919c
# Author: H.J. Lu <hjl.tools@gmail.com>
# Date: Fri Mar 14 08:51:25 2014 -0700
#
# Check AVX-512 assembler support first
#
# It checks AVX-512 assembler support first and sets libc_cv_cc_avx512 to
# $libc_cv_asm_avx512 instead of yes.  GCC won't support AVX-512 if the
# assembler doesn't support it.
#
# * sysdeps/x86_64/configure.ac: Check AVX-512 assembler support
# first. Disable AVX-512 GCC support if assembler doesn't support
# it.
# * sysdeps/x86_64/configure: Regenerated.
#
# commit 2d63a517e4084ec80403cd9f278690fa8b676cc4
# Author: Igor Zamyatin <igor.zamyatin@intel.com>
# Date: Thu Mar 13 11:10:22 2014 -0700
#
# Save and restore AVX-512 zmm registers to x86-64 ld.so
#
# AVX-512 ISA adds 512-bit zmm registers.  This patch updates
# _dl_runtime_profile to pass zmm registers to the run-time audit.  It also
# changes _dl_x86_64_save_sse and _dl_x86_64_restore_sse to support zmm
# registers; they are called only when RTLD_PREPARE_FOREIGN_CALL
# is used.  Its performance impact is minimal.
#
# * config.h.in (HAVE_AVX512_SUPPORT): New #undef.
# (HAVE_AVX512_ASM_SUPPORT): Likewise.
# * sysdeps/x86_64/bits/link.h (La_x86_64_zmm): New.
# (La_x86_64_vector): Add zmm.
# * sysdeps/x86_64/Makefile (tests): Add tst-audit10.
# (modules-names): Add tst-auditmod10a and tst-auditmod10b.
# ($(objpfx)tst-audit10): New target.
# ($(objpfx)tst-audit10.out): Likewise.
# (tst-audit10-ENV): New.
# (AVX512-CFLAGS): Likewise.
# (CFLAGS-tst-audit10.c): Likewise.
# (CFLAGS-tst-auditmod10a.c): Likewise.
# (CFLAGS-tst-auditmod10b.c): Likewise.
# * sysdeps/x86_64/configure.ac: Set config-cflags-avx512,
# HAVE_AVX512_SUPPORT and HAVE_AVX512_ASM_SUPPORT.
# * sysdeps/x86_64/configure: Regenerated.
# * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Add
# AVX-512 zmm register support.
# (_dl_x86_64_save_sse): Likewise.
# (_dl_x86_64_restore_sse): Likewise.
# * sysdeps/x86_64/dl-trampoline.h: Updated to support different
# size vector registers.
# * sysdeps/x86_64/link-defines.sym (YMM_SIZE): New.
# (ZMM_SIZE): Likewise.
# * sysdeps/x86_64/tst-audit10.c: New file.
# * sysdeps/x86_64/tst-auditmod10a.c: Likewise.
# * sysdeps/x86_64/tst-auditmod10b.c: Likewise.
#
# In addition, this applies:
# https://sourceware.org/ml/libc-alpha/2014-09/msg00228.html
# to extend zmm register checking.
#
diff -urN glibc-2.12-2-gc4ccff1/sysdeps/x86_64/bits/link.h glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/bits/link.h
--- glibc-2.12-2-gc4ccff1/sysdeps/x86_64/bits/link.h 2010-05-04 07:27:23.000000000 -0400
+++ glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/bits/link.h 2015-03-03 23:03:25.041829238 -0500
@@ -65,7 +65,10 @@
/* Registers for entry into PLT on x86-64. */
# if __GNUC_PREREQ (4,0)
typedef float La_x86_64_xmm __attribute__ ((__vector_size__ (16)));
-typedef float La_x86_64_ymm __attribute__ ((__vector_size__ (32)));
+typedef float La_x86_64_ymm __attribute__ ((__vector_size__ (32),
+ __aligned__ (16)));
+typedef double La_x86_64_zmm __attribute__ ((__vector_size__ (64),
+ __aligned__ (16)));
# else
typedef float La_x86_64_xmm __attribute__ ((__mode__ (__V4SF__)));
# endif
@@ -74,9 +77,10 @@
{
# if __GNUC_PREREQ (4,0)
La_x86_64_ymm ymm[2];
+ La_x86_64_zmm zmm[1];
# endif
La_x86_64_xmm xmm[4];
-} La_x86_64_vector __attribute__ ((aligned(16)));
+} La_x86_64_vector __attribute__ ((__aligned__(16)));
typedef struct La_x86_64_regs
{
diff -urN glibc-2.12-2-gc4ccff1/sysdeps/x86_64/dl-trampoline.h glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/dl-trampoline.h
--- glibc-2.12-2-gc4ccff1/sysdeps/x86_64/dl-trampoline.h 2015-03-03 23:03:05.109457627 -0500
+++ glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/dl-trampoline.h 2015-03-03 23:06:58.434101818 -0500
@@ -20,14 +20,26 @@
#ifdef RESTORE_AVX
/* This is to support AVX audit modules. */
- vmovdqu %ymm0, (LR_VECTOR_OFFSET)(%rsp)
- vmovdqu %ymm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
- vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
- vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
- vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
- vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
- vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
- vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+# if HAVE_NO_AVX512_ASM_SUPPORT
+ /* Save the AVX-512 registers.  Use .byte because we lack assembler support. */
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x44,0x24,0x03 # vmovdqu64 %zmm0,0xc0(%rsp)
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x4c,0x24,0x04 # vmovdqu64 %zmm1,0x100(%rsp)
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x54,0x24,0x05 # vmovdqu64 %zmm2,0x140(%rsp)
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x5c,0x24,0x06 # vmovdqu64 %zmm3,0x180(%rsp)
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x64,0x24,0x07 # vmovdqu64 %zmm4,0x1c0(%rsp)
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x6c,0x24,0x08 # vmovdqu64 %zmm5,0x200(%rsp)
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x74,0x24,0x09 # vmovdqu64 %zmm6,0x240(%rsp)
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x7c,0x24,0x0a # vmovdqu64 %zmm7,0x280(%rsp)
+# else
+ VMOV %VEC(0), (LR_VECTOR_OFFSET)(%rsp)
+ VMOV %VEC(1), (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
+ VMOV %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
+ VMOV %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
+ VMOV %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
+ VMOV %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
+ VMOV %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
+ VMOV %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+# endif
/* Save xmm0-xmm7 registers to detect if any of them are
changed by audit module. */
@@ -73,7 +85,11 @@
je 2f
vmovdqa %xmm0, (LR_VECTOR_OFFSET)(%rsp)
jmp 1f
-2: vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0
+# if HAVE_NO_AVX512_ASM_SUPPORT
+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x44,0x24,0x03 # vmovdqu64 0xc0(%rsp),%zmm0
+# else
+2: VMOV (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
+# endif
vmovdqa %xmm0, (LR_XMM_OFFSET)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
@@ -82,7 +98,11 @@
je 2f
vmovdqa %xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
jmp 1f
-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1
+# if HAVE_NO_AVX512_ASM_SUPPORT
+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x4c,0x24,0x04 # vmovdqu64 0x100(%rsp),%zmm1
+# else
+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
+# endif
vmovdqa %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
@@ -91,7 +111,11 @@
je 2f
vmovdqa %xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
jmp 1f
-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2
+# if HAVE_NO_AVX512_ASM_SUPPORT
+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x54,0x24,0x05 # vmovdqu64 0x140(%rsp),%zmm2
+# else
+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
+# endif
vmovdqa %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
@@ -100,7 +124,11 @@
je 2f
vmovdqa %xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
jmp 1f
-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3
+# if HAVE_NO_AVX512_ASM_SUPPORT
+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x5c,0x24,0x06 # vmovdqu64 0x180(%rsp),%zmm3
+# else
+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
+# endif
vmovdqa %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
@@ -109,7 +137,11 @@
je 2f
vmovdqa %xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
jmp 1f
-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4
+# if HAVE_NO_AVX512_ASM_SUPPORT
+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x64,0x24,0x07 # vmovdqu64 0x1c0(%rsp),%zmm4
+# else
+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
+# endif
vmovdqa %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
@@ -118,7 +150,11 @@
je 2f
vmovdqa %xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
jmp 1f
-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5
+# if HAVE_NO_AVX512_ASM_SUPPORT
+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x6c,0x24,0x08 # vmovdqu64 0x200(%rsp),%zmm5
+# else
+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
+# endif
vmovdqa %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
@@ -127,7 +163,11 @@
je 2f
vmovdqa %xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
jmp 1f
-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6
+# if HAVE_NO_AVX512_ASM_SUPPORT
+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x74,0x24,0x09 # vmovdqu64 0x240(%rsp),%zmm6
+# else
+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
+# endif
vmovdqa %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
@@ -136,7 +176,11 @@
je 2f
vmovdqa %xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
jmp 1f
-2: vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7
+# if HAVE_NO_AVX512_ASM_SUPPORT
+2: .byte 0x62,0xf1,0xfe,0x48,0x6f,0x7c,0x24,0x0a # vmovdqu64 0x280(%rsp),%zmm7
+# else
+2: VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
+# endif
vmovdqa %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
1:
@@ -214,8 +258,13 @@
#ifdef RESTORE_AVX
/* This is to support AVX audit modules. */
- vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx)
- vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx)
+# if HAVE_NO_AVX512_ASM_SUPPORT
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x81,0x50,0x00,0x00,0x00 # vmovdqu64 %zmm0,0x50(%rcx)
+ .byte 0x62,0xf1,0xfe,0x48,0x7f,0x89,0x90,0x00,0x00,0x00 # vmovdqu64 %zmm1,0x90(%rcx)
+# else
+ VMOV %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
+ VMOV %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
+# endif
/* Save xmm0/xmm1 registers to detect if they are changed
by audit module. */
@@ -244,13 +293,21 @@
vpmovmskb %xmm2, %esi
cmpl $0xffff, %esi
jne 1f
- vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0
+# if HAVE_NO_AVX512_ASM_SUPPORT
+ .byte 0x62,0xf1,0xfe,0x48,0x6f,0x84,0x24,0x50,0x00,0x00,0x00 # vmovdqu64 0x50(%rsp),%zmm0
+# else
+ VMOV LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
+# endif
1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
vpmovmskb %xmm2, %esi
cmpl $0xffff, %esi
jne 1f
- vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1
+# if HAVE_NO_AVX512_ASM_SUPPORT
+ .byte 0x62,0xf1,0xfe,0x48,0x6f,0x8c,0x24,0x90,0x00,0x00,0x00 # vmovdqu64 0x90(%rsp),%zmm1
+# else
+ VMOV LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
+# endif
1:
#endif
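
A note on reading the .byte lines above: AVX-512 EVEX encodings use a
compressed displacement (disp8*N), so the single displacement byte is
implicitly scaled by the memory operand size, N = 64 for a full 512-bit
move.  That is why byte 0x03 in the encoding pairs with 0xc0(%rsp) in
the commented mnemonic.  A small illustration (assembled fragments
only; the pairing can be verified with objdump from a newer binutils):

	.text
	/* disp8 0x03 scaled by 64 gives 0xc0:
	   vmovdqu64 %zmm0, 0xc0(%rsp)  */
	.byte	0x62,0xf1,0xfe,0x48,0x7f,0x44,0x24,0x03
	/* disp8 0x04 scaled by 64 gives 0x100:
	   vmovdqu64 %zmm0, 0x100(%rsp)  */
	.byte	0x62,0xf1,0xfe,0x48,0x7f,0x44,0x24,0x04

Displacements that are not a multiple of 64, such as 0x50(%rcx) in the
LRV hunk above, cannot use disp8*N and fall back to a full four-byte
disp32, which is why those .byte lines are longer.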
diff -urN glibc-2.12-2-gc4ccff1/sysdeps/x86_64/dl-trampoline.S glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/dl-trampoline.S
--- glibc-2.12-2-gc4ccff1/sysdeps/x86_64/dl-trampoline.S 2015-03-03 23:03:05.108457659 -0500
+++ glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/dl-trampoline.S 2015-03-03 23:07:31.799049953 -0500
@@ -134,7 +134,7 @@
.previous
cmpl $0, L(have_avx)(%rip)
- jne 1f
+ jne L(defined)
movq %rbx, %r11 # Save rbx
movl $1, %eax
cpuid
@@ -143,18 +143,51 @@
// AVX and XSAVE supported?
andl $((1 << 28) | (1 << 27)), %ecx
cmpl $((1 << 28) | (1 << 27)), %ecx
- jne 2f
+ jne 10f
+ // AVX512 supported in the processor?
+ movq %rbx, %r11 # Save rbx
+ xorl %ecx, %ecx
+ mov $0x7, %eax
+ cpuid
+ andl $(1 << 16), %ebx
xorl %ecx, %ecx
// Get XFEATURE_ENABLED_MASK
xgetbv
- andl $0x6, %eax
-2: subl $0x5, %eax
+ test %ebx, %ebx
+ movq %r11, %rbx # Restore rbx
+ je 20f
+ // Verify that XCR0[7:5] = '111b' and
+ // XCR0[2:1] = '11b' which means
+ // that zmm state is enabled
+ andl $0xe6, %eax
+ cmpl $0xe6, %eax
+ jne 20f
+ movl %eax, L(have_avx)(%rip)
+L(avx512):
+# define RESTORE_AVX
+# define HAVE_NO_AVX512_ASM_SUPPORT 1
+# define VMOV vmovdqu64
+# define VEC(i) zmm##i
+# define MORE_CODE
+# include "dl-trampoline.h"
+# undef VMOV
+# undef VEC
+# undef RESTORE_AVX
+# undef HAVE_NO_AVX512_ASM_SUPPORT
+20: andl $0x6, %eax
+10: subl $0x5, %eax
movl %eax, L(have_avx)(%rip)
cmpl $0, %eax
-1: js L(no_avx)
+L(defined):
+ js L(no_avx)
+ cmpl $0xe6, L(have_avx)(%rip)
+ je L(avx512)
+
# define RESTORE_AVX
+# define VMOV vmovdqu
+# define VEC(i) ymm##i
# define MORE_CODE
# include "dl-trampoline.h"
@@ -178,7 +211,7 @@
_dl_x86_64_save_sse:
# ifdef HAVE_AVX_SUPPORT
cmpl $0, L(have_avx)(%rip)
- jne 1f
+ jne L(defined_5)
movq %rbx, %r11 # Save rbx
movl $1, %eax
cpuid
@@ -187,21 +220,37 @@
// AVX and XSAVE supported?
andl $((1 << 28) | (1 << 27)), %ecx
cmpl $((1 << 28) | (1 << 27)), %ecx
- jne 2f
+ jne 1f
+ // AVX512 supported in the processor?
+ movq %rbx, %r11 # Save rbx
+ xorl %ecx,%ecx
+ mov $0x7,%eax
+ cpuid
+ andl $(1 << 16), %ebx
xorl %ecx, %ecx
// Get XFEATURE_ENABLED_MASK
xgetbv
- andl $0x6, %eax
- cmpl $0x6, %eax
- // Nonzero if SSE and AVX state saving is enabled.
- sete %al
-2: leal -1(%eax,%eax), %eax
+ test %ebx, %ebx
+ movq %r11, %rbx # Restore rbx
+ je 2f
+ // Verify that XCR0[7:5] = '111b' and
+ // XCR0[2:1] = '11b' which means
+ // that zmm state is enabled
+ andl $0xe6, %eax
movl %eax, L(have_avx)(%rip)
- cmpl $0, %eax
+ cmpl $0xe6, %eax
+ je L(avx512_5)
-1: js L(no_avx5)
+2: andl $0x6, %eax
+1: subl $0x5, %eax
+ movl %eax, L(have_avx)(%rip)
+ cmpl $0, %eax
-# define YMM_SIZE 32
+L(defined_5):
+ js L(no_avx5)
+ cmpl $0xe6, L(have_avx)(%rip)
+ je L(avx512_5)
+
vmovdqa %ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE
vmovdqa %ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE
vmovdqa %ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE
@@ -211,6 +260,26 @@
vmovdqa %ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE
vmovdqa %ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE
ret
+L(avx512_5):
+# Original instructions:
+# vmovdqu64 %zmm0, %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE
+# vmovdqu64 %zmm1, %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE
+# vmovdqu64 %zmm2, %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE
+# vmovdqu64 %zmm3, %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE
+# vmovdqu64 %zmm4, %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE
+# vmovdqu64 %zmm5, %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE
+# vmovdqu64 %zmm6, %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE
+# vmovdqu64 %zmm7, %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE
+# Assembled instructions:
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x04,0x25,0x80,0x00,0x00,0x00 # vmovdqu64 %zmm0,%fs:0x80
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x0c,0x25,0xc0,0x00,0x00,0x00 # vmovdqu64 %zmm1,%fs:0xc0
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x14,0x25,0x00,0x01,0x00,0x00 # vmovdqu64 %zmm2,%fs:0x100
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x1c,0x25,0x40,0x01,0x00,0x00 # vmovdqu64 %zmm3,%fs:0x140
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x24,0x25,0x80,0x01,0x00,0x00 # vmovdqu64 %zmm4,%fs:0x180
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x2c,0x25,0xc0,0x01,0x00,0x00 # vmovdqu64 %zmm5,%fs:0x1c0
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x34,0x25,0x00,0x02,0x00,0x00 # vmovdqu64 %zmm6,%fs:0x200
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x7f,0x3c,0x25,0x40,0x02,0x00,0x00 # vmovdqu64 %zmm7,%fs:0x240
+ ret
L(no_avx5):
# endif
movdqa %xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE
@@ -234,6 +303,8 @@
# ifdef HAVE_AVX_SUPPORT
cmpl $0, L(have_avx)(%rip)
js L(no_avx6)
+ cmpl $0xe6, L(have_avx)(%rip)
+ je L(avx512_6)
vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0
vmovdqa %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1
@@ -244,6 +315,26 @@
vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6
vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7
ret
+L(avx512_6):
+# Original instructions:
+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE, %zmm0
+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE, %zmm1
+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE, %zmm2
+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE, %zmm3
+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE, %zmm4
+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE, %zmm5
+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE, %zmm6
+# vmovdqu64 %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE, %zmm7
+# Assembled instructions:
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x04,0x25,0x80,0x00,0x00,0x00 # vmovdqu64 %fs:0x80,%zmm0
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x0c,0x25,0xc0,0x00,0x00,0x00 # vmovdqu64 %fs:0xc0,%zmm1
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x14,0x25,0x00,0x01,0x00,0x00 # vmovdqu64 %fs:0x100,%zmm2
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x1c,0x25,0x40,0x01,0x00,0x00 # vmovdqu64 %fs:0x140,%zmm3
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x24,0x25,0x80,0x01,0x00,0x00 # vmovdqu64 %fs:0x180,%zmm4
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x2c,0x25,0xc0,0x01,0x00,0x00 # vmovdqu64 %fs:0x1c0,%zmm5
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x34,0x25,0x00,0x02,0x00,0x00 # vmovdqu64 %fs:0x200,%zmm6
+ .byte 0x64,0x62,0xf1,0xfe,0x48,0x6f,0x3c,0x25,0x40,0x02,0x00,0x00 # vmovdqu64 %fs:0x240,%zmm7
+ ret
L(no_avx6):
# endif
movdqa %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0
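
The CPUID/XGETBV sequence that the dl-trampoline.S hunks thread through
_dl_runtime_profile and _dl_x86_64_save_sse is easier to follow in
isolation.  A sketch of the same test as a stand-alone function (the
name is mine; like the patch, it assumes OSXSAVE was already verified
via CPUID.1:ECX bit 27 before XGETBV is executed):

	.text
	.globl	have_zmm_state
	.type	have_zmm_state, @function
	/* int have_zmm_state (void): 1 if AVX-512 zmm state is usable.  */
have_zmm_state:
	movq	%rbx, %r11		/* cpuid clobbers rbx */
	movl	$7, %eax
	xorl	%ecx, %ecx
	cpuid
	movq	%r11, %rbx
	testl	$(1 << 16), %ebx	/* CPUID.7.0:EBX bit 16 = AVX512F */
	je	1f
	xorl	%ecx, %ecx
	xgetbv				/* XCR0 -> edx:eax */
	andl	$0xe6, %eax		/* XCR0 bits 7:5 (zmm/opmask)
					   and 2:1 (ymm/xmm) */
	cmpl	$0xe6, %eax
	jne	1f
	movl	$1, %eax
	ret
1:	xorl	%eax, %eax
	ret
	.size	have_zmm_state, .-have_zmm_state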
diff -urN glibc-2.12-2-gc4ccff1/sysdeps/x86_64/link-defines.sym glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/link-defines.sym
--- glibc-2.12-2-gc4ccff1/sysdeps/x86_64/link-defines.sym 2010-05-04 07:27:23.000000000 -0400
+++ glibc-2.12-2-gc4ccff1.mod/sysdeps/x86_64/link-defines.sym 2015-03-03 23:03:25.042829206 -0500
@@ -4,6 +4,8 @@
--
VECTOR_SIZE sizeof (La_x86_64_vector)
XMM_SIZE sizeof (La_x86_64_xmm)
+YMM_SIZE sizeof (La_x86_64_ymm)
+ZMM_SIZE sizeof (La_x86_64_zmm)
LR_SIZE sizeof (struct La_x86_64_regs)
LR_RDX_OFFSET offsetof (struct La_x86_64_regs, lr_rdx)
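
For contrast, the XSAVE approach mentioned at the top of the mail would
sidestep hand-encoding entirely: XSAVE stores whatever vector state the
OS has enabled, and binutils has known the mnemonic since long before
AVX-512.  A minimal sketch under stated assumptions (the function name
is mine, and the caller is assumed to pass a 64-byte-aligned buffer
sized via CPUID leaf 0xd):

	.text
	.globl	save_vector_state
	.type	save_vector_state, @function
	/* void save_vector_state (void *buf): buf must be 64-byte
	   aligned and large enough for the full XSAVE area.  */
save_vector_state:
	movl	$0xe7, %eax		/* request x87/SSE/AVX/AVX-512;
					   the CPU masks this with XCR0 */
	xorl	%edx, %edx
	xsave	(%rdi)
	ret
	.size	save_vector_state, .-save_vector_state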