[PATCH] [x86_64] Fixed libmvec AVX512 implementations


Hi,

this patch fixes several libmvec bugs found during testing on new KNL hardware.
It fixes the AVX512 IFUNC implementations (forward branches to the local labels
were written without the 'f' suffix, so the resolvers did not branch to the
intended code), the AVX512 wrappers around the AVX2 versions (the stack frame
was too small, one hand-encoded store of %zmm1 lacked its displacement byte,
and the results of the two AVX2 calls were never merged back into the full
512-bit result), and the KNL expf implementation (a missing pushq of %rbp).
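
The problems can be reproduced by calling the 512-bit vector entry points
directly, along the lines of the minimal sketch below (the build flags and the
use of _ZGVeN8v_cos here are only an illustration, not part of the patch, and
it needs AVX512-capable hardware to run):

/* Minimal sketch, assuming something like:
   gcc -O2 -mavx512f test.c -lmvec -lm  */
#include <immintrin.h>
#include <math.h>
#include <stdio.h>

/* 512-bit libmvec variant of cos, resolved through the IFUNC this patch fixes.  */
extern __m512d _ZGVeN8v_cos (__m512d);

int
main (void)
{
  double in[8] = { 0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5 };
  double out[8];
  __m512d vx = _mm512_loadu_pd (in);
  __m512d vy = _ZGVeN8v_cos (vx);
  _mm512_storeu_pd (out, vy);
  for (int i = 0; i < 8; i++)
    printf ("%f %f\n", out[i], cos (in[i]));
  return 0;
}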

2015-07-24  Andrew Senkevich  <andrew.senkevich@intel.com>

        * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
        * sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
        * sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
        * sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
        * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL
        version.
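
For clarity, the corrected AVX512-to-AVX2 wrapper (WRAPPER_IMPL_AVX512 for a
unary double routine) behaves roughly like the intrinsics sketch below;
callee_avx2 is a placeholder name, not a real symbol.  The key change is that
the results of both 256-bit calls are now merged back into the full 512-bit
return value:

#include <immintrin.h>

extern __m256d callee_avx2 (__m256d);   /* hypothetical AVX2 implementation */

static __m512d
wrapper_avx512 (__m512d x)
{
  __m256d lo = _mm512_extractf64x4_pd (x, 0);   /* low four doubles   */
  __m256d hi = _mm512_extractf64x4_pd (x, 1);   /* high four doubles  */
  lo = callee_avx2 (lo);                        /* first AVX2 call    */
  hi = callee_avx2 (hi);                        /* second AVX2 call   */
  /* Reassemble both halves into one 512-bit result.  */
  return _mm512_insertf64x4 (_mm512_castpd256_pd512 (lo), hi, 1);
}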

diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
index ba3b66f..d0f4f27 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_cos)
         .type   _ZGVeN8v_cos, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_cos_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_cos_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_cos_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_cos_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_cos)

 #define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
index 8f837fb..7b7c07d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_exp)
         .type   _ZGVeN8v_exp, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_exp_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_exp_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_exp_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_exp_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_exp)

 #define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
index 2f9e9d8..76375fd 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_log)
         .type   _ZGVeN8v_log, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_log_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_log_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_log_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_log_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_log)

 #define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
index 3b11511..c1e5e76 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8vv_pow)
         .type   _ZGVeN8vv_pow, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8vv_pow_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8vv_pow_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8vv_pow_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8vv_pow)

 #define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
index ba63102..131f2f4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8v_sin)
         .type   _ZGVeN8v_sin, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8v_sin_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8v_sin_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8v_sin_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8v_sin_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8v_sin)

 #define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
index 7228ba5..e331090 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN8vvv_sincos)
         .type   _ZGVeN8vvv_sincos, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN8vvv_sincos_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN8vvv_sincos)

 #define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
index 91564de..0654d3c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_cosf)
         .type   _ZGVeN16v_cosf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_cosf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_cosf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_cosf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_cosf)

 #define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
index 3b3489d..62858eb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_expf)
         .type   _ZGVeN16v_expf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_expf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_expf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_expf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_expf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_expf)

 #define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
index 8756750..68c57e4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_logf)
         .type   _ZGVeN16v_logf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_logf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_logf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_logf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_logf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_logf)

 #define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
index a4ba4fb..3aa9f95 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16vv_powf)
         .type   _ZGVeN16vv_powf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16vv_powf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16vv_powf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16vv_powf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16vv_powf)

 #define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
index 0a1753e..bdcabab 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16vvv_sincosf)
         .type   _ZGVeN16vvv_sincosf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16vvv_sincosf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16vvv_sincosf)

 #define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
index 7ed637b..3ec78a0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
@@ -23,16 +23,16 @@
 ENTRY (_ZGVeN16v_sinf)
         .type   _ZGVeN16v_sinf, @gnu_indirect_function
         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1
+        jne     1f
         call    __init_cpu_features
 1:      leaq    _ZGVeN16v_sinf_skx(%rip), %rax
         testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
-        jnz     3
-2:      leaq    _ZGVeN16v_sinf_knl(%rip), %rax
+        jnz     2f
+        leaq    _ZGVeN16v_sinf_knl(%rip), %rax
         testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
-        jnz     3
+        jnz     2f
         leaq    _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax
-3:      ret
+2:      ret
 END (_ZGVeN16v_sinf)

 #define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
index bd93b8e..5c0ff89 100644
--- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
@@ -194,39 +194,39 @@

 /* AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512 callee
-        pushq  %rbp
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
-        movq   %rsp, %rbp
+        movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
-        andq   $-64, %rsp
-        subq   $64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte  0x62
-        .byte  0xf1
-        .byte  0x7c
-        .byte  0x48
-        .byte  0x29
-        .byte  0x04
-        .byte  0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x04
-        .byte  0x24
-        call   HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x44
-        .byte  0x24
-        .byte  0x20
-        call   HIDDEN_JUMPTARGET(\callee)
-        movq   %rbp, %rsp
+        andq      $-64, %rsp
+        subq      $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x04
+        .byte   0x24
+        vmovupd   (%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 64(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x10
+        .byte   0x44
+        .byte   0x24
+        .byte   0x01
+        movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
-        popq   %rbp
+        popq      %rbp
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbp)
         ret
@@ -234,61 +234,50 @@

 /* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512_ff callee
-        pushq  %rbp
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
-        movq   %rsp, %rbp
+        movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
-        andq   $-64, %rsp
-        subq   $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte  0x62
-        .byte  0xf1
-        .byte  0x7c
-        .byte  0x48
-        .byte  0x29
-        .byte  0x04
-        .byte  0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp).  */
-        .byte  0x62
-        .byte  0xf1
-        .byte  0x7c
-        .byte  0x48
-        .byte  0x29
-        .byte  0x4c
-        .byte  0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x04
-        .byte  0x24
-/* Below is encoding for vmovapd 64(%rsp), %ymm1.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x4c
-        .byte  0x24
-        .byte  0x40
-        call   HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x44
-        .byte  0x24
-        .byte  0x20
-/* Below is encoding for vmovapd 96(%rsp), %ymm1.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x4c
-        .byte  0x24
-        .byte  0x60
-        call   HIDDEN_JUMPTARGET(\callee)
-        movq   %rbp, %rsp
+        andq      $-64, %rsp
+        subq      $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x04
+        .byte   0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x4c
+        .byte   0x24
+        .byte   0x01
+        vmovupd   (%rsp), %ymm0
+        vmovupd   64(%rsp), %ymm1
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 128(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        vmovupd   96(%rsp), %ymm1
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x10
+        .byte   0x44
+        .byte   0x24
+        .byte   0x02
+        movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
-        popq   %rbp
+        popq      %rbp
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbp)
         ret
@@ -310,61 +299,26 @@
         cfi_rel_offset (%r13, 0)
         subq      $176, %rsp
         movq      %rsi, %r13
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
         .byte  0x62
         .byte  0xf1
         .byte  0x7c
         .byte  0x48
-        .byte  0x29
+        .byte  0x11
         .byte  0x04
         .byte  0x24
         movq    %rdi, %r12
-/* Below is encoding for vmovapd (%rsp), %ymm0.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x04
-        .byte  0x24
+        vmovupd (%rsp), %ymm0
         call      HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x44
-        .byte  0x24
-        .byte  0x20
+        vmovupd   32(%rsp), %ymm0
         lea       64(%rsp), %rdi
         lea       96(%rsp), %rsi
         call      HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 64(%rsp), %ymm0.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x44
-        .byte  0x24
-        .byte  0x40
-/* Below is encoding for vmovapd   96(%rsp), %ymm1.  */
-        .byte  0xc5
-        .byte  0xfd
-        .byte  0x28
-        .byte  0x4c
-        .byte  0x24
-        .byte  0x60
-/* Below is encoding for vmovapd   %ymm0, 32(%r12).  */
-        .byte  0xc4
-        .byte  0xc1
-        .byte  0x7d
-        .byte  0x29
-        .byte  0x44
-        .byte  0x24
-        .byte  0x20
-/* Below is encoding for vmovapd   %ymm1, 32(%r13).  */
-        .byte  0xc4
-        .byte  0xc1
-        .byte  0x7d
-        .byte  0x29
-        .byte  0x4d
-        .byte  0x20
+        vmovupd   64(%rsp), %ymm0
+        vmovupd   96(%rsp), %ymm1
+        vmovupd   %ymm0, 32(%r12)
+        vmovupd   %ymm1, 32(%r13)
+        vzeroupper
         addq      $176, %rsp
         popq      %r13
         cfi_adjust_cfa_offset (-8)
diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
index 66bb081..d255d19 100644
--- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
@@ -239,28 +239,39 @@

 /* AVX512 ISA version as wrapper to AVX2 ISA version.  */
 .macro WRAPPER_IMPL_AVX512 callee
-        pushq  %rbp
+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
-        movq   %rsp, %rbp
+        movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
-        andq   $-64, %rsp
-        subq   $64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte  0x62
-        .byte  0xf1
-        .byte  0x7c
-        .byte  0x48
-        .byte  0x29
-        .byte  0x04
-        .byte  0x24
-        vmovaps (%rsp), %ymm0
-        call   HIDDEN_JUMPTARGET(\callee)
-        vmovaps 32(%rsp), %ymm0
-        call   HIDDEN_JUMPTARGET(\callee)
-        movq   %rbp, %rsp
+        andq      $-64, %rsp
+        subq      $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x04
+        .byte   0x24
+        vmovupd   (%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 64(%rsp)
+        vmovupd   32(%rsp), %ymm0
+        call      HIDDEN_JUMPTARGET(\callee)
+        vmovupd   %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0.  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x10
+        .byte   0x44
+        .byte   0x24
+        .byte   0x01
+        movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
-        popq   %rbp
+        popq      %rbp
         cfi_adjust_cfa_offset (-8)
         cfi_restore (%rbp)
         ret
@@ -274,29 +285,41 @@
         movq      %rsp, %rbp
         cfi_def_cfa_register (%rbp)
         andq      $-64, %rsp
-        subq      $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp).  */
-        .byte  0x62
-        .byte  0xf1
-        .byte  0x7c
-        .byte  0x48
-        .byte  0x29
-        .byte  0x04
-        .byte  0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp).  */
-        .byte  0x62
-        .byte  0xf1
-        .byte  0x7c
-        .byte  0x48
-        .byte  0x29
-        .byte  0x4c
-        .byte  0x24
-        vmovaps (%rsp), %ymm0
-        vmovaps 64(%rsp), %ymm1
+        subq      $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x04
+        .byte   0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp).  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x11
+        .byte   0x4c
+        .byte   0x24
+        .byte   0x01
+        vmovups   (%rsp), %ymm0
+        vmovups   64(%rsp), %ymm1
         call      HIDDEN_JUMPTARGET(\callee)
-        vmovaps 32(%rsp), %ymm0
-        vmovaps 96(%rsp), %ymm1
+        vmovups   %ymm0, 128(%rsp)
+        vmovups   32(%rsp), %ymm0
+        vmovups   96(%rsp), %ymm1
         call      HIDDEN_JUMPTARGET(\callee)
+        vmovups   %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0.  */
+        .byte   0x62
+        .byte   0xf1
+        .byte   0x7c
+        .byte   0x48
+        .byte   0x10
+        .byte   0x44
+        .byte   0x24
+        .byte   0x02
         movq      %rbp, %rsp
         cfi_def_cfa_register (%rsp)
         popq      %rbp
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
index cb807e0..ec69055 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
@@ -46,6 +46,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
      The table lookup is skipped if k = 0.
      For low accuracy approximation, exp(r) ~ 1 or 1+r.  */

+        pushq     %rbp
         cfi_adjust_cfa_offset (8)
         cfi_rel_offset (%rbp, 0)
         movq      %rsp, %rbp

Ok for trunk?


--
WBR,
Andrew

