This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.
GNU C Library master sources branch master updated. glibc-2.21-642-g9901716
- From: andros@sourceware.org
- To: glibc-cvs@sourceware.org
- Date: 24 Jul 2015 11:48:28 -0000
- Subject: GNU C Library master sources branch master updated. glibc-2.21-642-g9901716
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch 'master' has been updated
via 99017161354321845d11dce4fcd3abfebc5dd0d5 (commit)
from 3bcea719ddd6ce399d7bccb492c40af77d216e42 (commit)
The revisions listed above that are new to this repository have not
appeared in any other notification email, so we list them in full below.
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=99017161354321845d11dce4fcd3abfebc5dd0d5
commit 99017161354321845d11dce4fcd3abfebc5dd0d5
Author: Andrew Senkevich <andrew.senkevich@intel.com>
Date: Fri Jul 24 14:47:23 2015 +0300
Fixed several libmvec bugs found during testing on KNL hardware: the
AVX512 IFUNC implementations, the wrappers to the AVX2 versions, and
the KNL expf implementation (the assembler idiom involved is sketched
after the file list below).
* sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
* sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL
implementation.
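
For reference, the IFUNC fixes in this patch all hinge on the GNU
assembler's numeric local labels: a bare "jne 1" is taken as a jump to
the absolute address 1, while "jne 1f" branches forward to the nearest
"1:" label. A minimal sketch (the symbols here are illustrative, not
from glibc):

	cmpl	$0, initialized(%rip)	/* hypothetical "already set up?" flag */
	jne	1f			/* forward reference to "1:" below;
					   a bare "1" would mean address 1 */
	call	do_init			/* hypothetical one-time initializer */
1:	ret				/* both paths rejoin here */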
diff --git a/ChangeLog b/ChangeLog
index 6f6016d..3e22413 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2015-07-24 Andrew Senkevich <andrew.senkevich@intel.com>
+
+ * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Fixed AVX512 IFUNC.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
+ * sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Fixed wrappers to AVX2.
+ * sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S: Fixed KNL
+ implementation.
+
2015-07-24 Szabolcs Nagy <szabolcs.nagy@arm.com>
[BZ #17711]
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
index ba3b66f..d0f4f27 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN8v_cos)
.type _ZGVeN8v_cos, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN8v_cos_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN8v_cos_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN8v_cos_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN8v_cos_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN8v_cos)
#define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper
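
Each of the _core.S changes in this patch applies the same fix to the
same dispatch pattern. A stripped-down sketch of that pattern (all
symbol names here are hypothetical): the symbol is typed
@gnu_indirect_function, so its body acts as a resolver that runs at
relocation time and returns the chosen implementation's address in %rax.

	.text
	.globl	my_func
	.type	my_func, @gnu_indirect_function
my_func:
	testl	$1, feature_flag(%rip)		/* hypothetical CPU-feature bit */
	jnz	1f
	leaq	my_func_generic(%rip), %rax	/* fallback implementation */
	ret
1:	leaq	my_func_fast(%rip), %rax	/* optimized implementation */
	ret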
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
index 8f837fb..7b7c07d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN8v_exp)
.type _ZGVeN8v_exp, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN8v_exp_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN8v_exp_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN8v_exp_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN8v_exp_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN8v_exp)
#define _ZGVeN8v_exp _ZGVeN8v_exp_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
index 2f9e9d8..76375fd 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN8v_log)
.type _ZGVeN8v_log, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN8v_log_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN8v_log_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN8v_log_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN8v_log_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN8v_log)
#define _ZGVeN8v_log _ZGVeN8v_log_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
index 3b11511..c1e5e76 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN8vv_pow)
.type _ZGVeN8vv_pow, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN8vv_pow_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN8vv_pow_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN8vv_pow_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN8vv_pow)
#define _ZGVeN8vv_pow _ZGVeN8vv_pow_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
index ba63102..131f2f4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN8v_sin)
.type _ZGVeN8v_sin, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN8v_sin_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN8v_sin_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN8v_sin_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN8v_sin_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN8v_sin)
#define _ZGVeN8v_sin _ZGVeN8v_sin_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
index 7228ba5..e331090 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN8vvv_sincos)
.type _ZGVeN8vvv_sincos, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN8vvv_sincos_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN8vvv_sincos_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN8vvv_sincos_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN8vvv_sincos)
#define _ZGVeN8vvv_sincos _ZGVeN8vvv_sincos_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
index 91564de..0654d3c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN16v_cosf)
.type _ZGVeN16v_cosf, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN16v_cosf_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN16v_cosf_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN16v_cosf_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN16v_cosf)
#define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
index 3b3489d..62858eb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN16v_expf)
.type _ZGVeN16v_expf, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN16v_expf_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN16v_expf_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN16v_expf_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN16v_expf_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN16v_expf)
#define _ZGVeN16v_expf _ZGVeN16v_expf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
index cb807e0..ec69055 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
@@ -46,6 +46,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
The table lookup is skipped if k = 0.
For low accuracy approximation, exp(r) ~ 1 or 1+r. */
+ pushq %rbp
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbp, 0)
movq %rsp, %rbp
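
The one-line expf16 fix above restores the push that the surrounding
CFI annotations already described; each cfi_* directive must pair with
a real instruction. The intended prologue/epilogue shape, mirroring the
wrapper macros later in this patch:

	pushq	%rbp			/* the instruction the fix adds back */
	cfi_adjust_cfa_offset (8)	/* CFA moves by 8 after the push */
	cfi_rel_offset (%rbp, 0)	/* %rbp is saved at the CFA */
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* ... aligned scratch frame and body ... */
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret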
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
index 8756750..68c57e4 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN16v_logf)
.type _ZGVeN16v_logf, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN16v_logf_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN16v_logf_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN16v_logf_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN16v_logf_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN16v_logf)
#define _ZGVeN16v_logf _ZGVeN16v_logf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
index a4ba4fb..3aa9f95 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN16vv_powf)
.type _ZGVeN16vv_powf, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN16vv_powf_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN16vv_powf_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN16vv_powf_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN16vv_powf)
#define _ZGVeN16vv_powf _ZGVeN16vv_powf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
index 0a1753e..bdcabab 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN16vvv_sincosf)
.type _ZGVeN16vvv_sincosf, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN16vvv_sincosf_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN16vvv_sincosf_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN16vvv_sincosf_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN16vvv_sincosf)
#define _ZGVeN16vvv_sincosf _ZGVeN16vvv_sincosf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
index 7ed637b..3ec78a0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
@@ -23,16 +23,16 @@
ENTRY (_ZGVeN16v_sinf)
.type _ZGVeN16v_sinf, @gnu_indirect_function
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
- jne 1
+ jne 1f
call __init_cpu_features
1: leaq _ZGVeN16v_sinf_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
- jnz 3
-2: leaq _ZGVeN16v_sinf_knl(%rip), %rax
+ jnz 2f
+ leaq _ZGVeN16v_sinf_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
- jnz 3
+ jnz 2f
leaq _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax
-3: ret
+2: ret
END (_ZGVeN16v_sinf)
#define _ZGVeN16v_sinf _ZGVeN16v_sinf_avx2_wrapper
diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
index bd93b8e..5c0ff89 100644
--- a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h
@@ -194,39 +194,39 @@
/* AVX512 ISA version as wrapper to AVX2 ISA version. */
.macro WRAPPER_IMPL_AVX512 callee
- pushq %rbp
+ pushq %rbp
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
+ movq %rsp, %rbp
cfi_def_cfa_register (%rbp)
- andq $-64, %rsp
- subq $64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x04
- .byte 0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x04
- .byte 0x24
- call HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x44
- .byte 0x24
- .byte 0x20
- call HIDDEN_JUMPTARGET(\callee)
- movq %rbp, %rsp
+ andq $-64, %rsp
+ subq $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x04
+ .byte 0x24
+ vmovupd (%rsp), %ymm0
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, 64(%rsp)
+ vmovupd 32(%rsp), %ymm0
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0. */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x10
+ .byte 0x44
+ .byte 0x24
+ .byte 0x01
+ movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
- popq %rbp
+ popq %rbp
cfi_adjust_cfa_offset (-8)
cfi_restore (%rbp)
ret
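
Decoded, the fixed WRAPPER_IMPL_AVX512 body above is equivalent to the
sequence below; the remaining .byte runs are the EVEX encodings of the
two zmm moves, presumably kept as raw bytes for assemblers that do not
understand the AVX-512 mnemonics (the mnemonic spelling here is for
illustration only):

	andq	$-64, %rsp		/* 64-byte-align the scratch area */
	subq	$128, %rsp		/* 0..63: input, 64..127: results */
	vmovups	%zmm0, (%rsp)		/* spill the 512-bit input */
	vmovupd	(%rsp), %ymm0		/* low 256 bits */
	call	HIDDEN_JUMPTARGET(\callee)
	vmovupd	%ymm0, 64(%rsp)		/* stash the low-half result */
	vmovupd	32(%rsp), %ymm0		/* high 256 bits */
	call	HIDDEN_JUMPTARGET(\callee)
	vmovupd	%ymm0, 96(%rsp)		/* stash the high-half result */
	vmovups	64(%rsp), %zmm0		/* reload the combined result */

Note also the switch from aligned (vmovaps) to unaligned
(vmovups/vmovupd) moves, and the frame growth from 64 to 128 bytes so
the two 256-bit results can be parked on the stack instead of being
clobbered by the second call.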
@@ -234,61 +234,50 @@
/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version. */
.macro WRAPPER_IMPL_AVX512_ff callee
- pushq %rbp
+ pushq %rbp
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
+ movq %rsp, %rbp
cfi_def_cfa_register (%rbp)
- andq $-64, %rsp
- subq $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x04
- .byte 0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x4c
- .byte 0x24
-/* Below is encoding for vmovapd (%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x04
- .byte 0x24
-/* Below is encoding for vmovapd 64(%rsp), %ymm1. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x4c
- .byte 0x24
- .byte 0x40
- call HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x44
- .byte 0x24
- .byte 0x20
-/* Below is encoding for vmovapd 96(%rsp), %ymm1. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x4c
- .byte 0x24
- .byte 0x60
- call HIDDEN_JUMPTARGET(\callee)
- movq %rbp, %rsp
+ andq $-64, %rsp
+ subq $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x04
+ .byte 0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x4c
+ .byte 0x24
+ .byte 0x01
+ vmovupd (%rsp), %ymm0
+ vmovupd 64(%rsp), %ymm1
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, 128(%rsp)
+ vmovupd 32(%rsp), %ymm0
+ vmovupd 96(%rsp), %ymm1
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0. */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x10
+ .byte 0x44
+ .byte 0x24
+ .byte 0x02
+ movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
- popq %rbp
+ popq %rbp
cfi_adjust_cfa_offset (-8)
cfi_restore (%rbp)
ret
@@ -310,61 +299,26 @@
cfi_rel_offset (%r13, 0)
subq $176, %rsp
movq %rsi, %r13
-/* Below is encoding for vmovaps %zmm0, (%rsp). */
+/* Below is encoding for vmovups %zmm0, (%rsp). */
.byte 0x62
.byte 0xf1
.byte 0x7c
.byte 0x48
- .byte 0x29
+ .byte 0x11
.byte 0x04
.byte 0x24
movq %rdi, %r12
-/* Below is encoding for vmovapd (%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x04
- .byte 0x24
+ vmovupd (%rsp), %ymm0
call HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 32(%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x44
- .byte 0x24
- .byte 0x20
+ vmovupd 32(%rsp), %ymm0
lea 64(%rsp), %rdi
lea 96(%rsp), %rsi
call HIDDEN_JUMPTARGET(\callee)
-/* Below is encoding for vmovapd 64(%rsp), %ymm0. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x44
- .byte 0x24
- .byte 0x40
-/* Below is encoding for vmovapd 96(%rsp), %ymm1. */
- .byte 0xc5
- .byte 0xfd
- .byte 0x28
- .byte 0x4c
- .byte 0x24
- .byte 0x60
-/* Below is encoding for vmovapd %ymm0, 32(%r12). */
- .byte 0xc4
- .byte 0xc1
- .byte 0x7d
- .byte 0x29
- .byte 0x44
- .byte 0x24
- .byte 0x20
-/* Below is encoding for vmovapd %ymm1, 32(%r13). */
- .byte 0xc4
- .byte 0xc1
- .byte 0x7d
- .byte 0x29
- .byte 0x4d
- .byte 0x20
+ vmovupd 64(%rsp), %ymm0
+ vmovupd 96(%rsp), %ymm1
+ vmovupd %ymm0, 32(%r12)
+ vmovupd %ymm1, 32(%r13)
+ vzeroupper
addq $176, %rsp
popq %r13
cfi_adjust_cfa_offset (-8)
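
The sincos wrapper above also gains a vzeroupper before the frame
teardown. Zeroing the upper ymm state before returning into code that
may still use legacy SSE avoids AVX-to-SSE transition penalties; a
general sketch of the idiom (function name hypothetical):

avx2_leaf:
	vmovupd	(%rdi), %ymm0		/* 256-bit AVX work */
	vaddpd	%ymm0, %ymm0, %ymm0
	vmovupd	%ymm0, (%rdi)
	vzeroupper			/* clear upper ymm state before
					   returning to possible SSE callers */
	ret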
diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
index 66bb081..d255d19 100644
--- a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
+++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
@@ -239,28 +239,39 @@
/* AVX512 ISA version as wrapper to AVX2 ISA version. */
.macro WRAPPER_IMPL_AVX512 callee
- pushq %rbp
+ pushq %rbp
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbp, 0)
- movq %rsp, %rbp
+ movq %rsp, %rbp
cfi_def_cfa_register (%rbp)
- andq $-64, %rsp
- subq $64, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x04
- .byte 0x24
- vmovaps (%rsp), %ymm0
- call HIDDEN_JUMPTARGET(\callee)
- vmovaps 32(%rsp), %ymm0
- call HIDDEN_JUMPTARGET(\callee)
- movq %rbp, %rsp
+ andq $-64, %rsp
+ subq $128, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x04
+ .byte 0x24
+ vmovupd (%rsp), %ymm0
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, 64(%rsp)
+ vmovupd 32(%rsp), %ymm0
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovupd %ymm0, 96(%rsp)
+/* Below is encoding for vmovups 64(%rsp), %zmm0. */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x10
+ .byte 0x44
+ .byte 0x24
+ .byte 0x01
+ movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
- popq %rbp
+ popq %rbp
cfi_adjust_cfa_offset (-8)
cfi_restore (%rbp)
ret
@@ -274,29 +285,41 @@
movq %rsp, %rbp
cfi_def_cfa_register (%rbp)
andq $-64, %rsp
- subq $128, %rsp
-/* Below is encoding for vmovaps %zmm0, (%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x04
- .byte 0x24
-/* Below is encoding for vmovaps %zmm1, 64(%rsp). */
- .byte 0x62
- .byte 0xf1
- .byte 0x7c
- .byte 0x48
- .byte 0x29
- .byte 0x4c
- .byte 0x24
- vmovaps (%rsp), %ymm0
- vmovaps 64(%rsp), %ymm1
+ subq $192, %rsp
+/* Below is encoding for vmovups %zmm0, (%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x04
+ .byte 0x24
+/* Below is encoding for vmovups %zmm1, 64(%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x11
+ .byte 0x4c
+ .byte 0x24
+ .byte 0x01
+ vmovups (%rsp), %ymm0
+ vmovups 64(%rsp), %ymm1
call HIDDEN_JUMPTARGET(\callee)
- vmovaps 32(%rsp), %ymm0
- vmovaps 96(%rsp), %ymm1
+ vmovups %ymm0, 128(%rsp)
+ vmovups 32(%rsp), %ymm0
+ vmovups 96(%rsp), %ymm1
call HIDDEN_JUMPTARGET(\callee)
+ vmovups %ymm0, 160(%rsp)
+/* Below is encoding for vmovups 128(%rsp), %zmm0. */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x10
+ .byte 0x44
+ .byte 0x24
+ .byte 0x02
movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
popq %rbp
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 19 ++
sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S | 10 +-
sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S | 10 +-
sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S | 10 +-
sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S | 10 +-
sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S | 10 +-
sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S | 10 +-
sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S | 10 +-
sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S | 10 +-
.../fpu/multiarch/svml_s_expf16_core_avx512.S | 1 +
sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S | 10 +-
sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S | 10 +-
.../x86_64/fpu/multiarch/svml_s_sincosf16_core.S | 10 +-
sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S | 10 +-
sysdeps/x86_64/fpu/svml_d_wrapper_impl.h | 202 ++++++++------------
sysdeps/x86_64/fpu/svml_s_wrapper_impl.h | 101 ++++++----
16 files changed, 220 insertions(+), 223 deletions(-)
hooks/post-receive
--
GNU C Library master sources