This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch master updated. glibc-2.21-453-g2a52321
- From: andros at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 9 Jun 2015 15:34:11 -0000
- Subject: GNU C Library master sources branch master updated. glibc-2.21-453-g2a52321
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master has been updated
via 2a523216d5dc973d8bf91a00f00b70b7df42b91d (commit)
via 04f496d6025753058bdd071fd711e9f56df149a7 (commit)
from 24a2718f595bc11dc6abb31303ceb8fdcb664f2f (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2a523216d5dc973d8bf91a00f00b70b7df42b91d
commit 2a523216d5dc973d8bf91a00f00b70b7df42b91d
Author: Andrew Senkevich <andrew.senkevich@intel.com>
Date: Tue Jun 9 18:32:42 2015 +0300
This patch adds vector cosf tests.
* math/Makefile: Added CFLAGS for new tests.
* math/test-float-vlen16.h: New file.
* math/test-float-vlen4.h: New file.
* math/test-float-vlen8.h: New file.
* math/test-double-vlen2.h: Fixed 2 argument macro and comment.
* sysdeps/x86_64/fpu/Makefile: Added new tests and variables.
* sysdeps/x86_64/fpu/libm-test-ulps: Regenerated.
* sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c: New file.
* sysdeps/x86_64/fpu/test-float-vlen16.c: New file.
* sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c: New file.
* sysdeps/x86_64/fpu/test-float-vlen4.c: New file.
* sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c: New file.
* sysdeps/x86_64/fpu/test-float-vlen8-avx2.c: New file.
* sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c: New file.
* sysdeps/x86_64/fpu/test-float-vlen8.c: New file.
diff --git a/ChangeLog b/ChangeLog
index c3e52b2..353b383 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -68,6 +68,22 @@
* sysdeps/x86/fpu/bits/math-vector.h: Added SIMD declaration for cosf.
* NEWS: Mention addition of x86_64 vector cosf.
+ * math/Makefile: Added CFLAGS for new tests.
+ * math/test-float-vlen16.h: New file.
+ * math/test-float-vlen4.h: New file.
+ * math/test-float-vlen8.h: New file.
+ * math/test-double-vlen2.h: Fixed 2 argument macro and comment.
+ * sysdeps/x86_64/fpu/Makefile: Added new tests and variables.
+ * sysdeps/x86_64/fpu/libm-test-ulps: Regenerated.
+ * sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c: New file.
+ * sysdeps/x86_64/fpu/test-float-vlen16.c: New file.
+ * sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c: New file.
+ * sysdeps/x86_64/fpu/test-float-vlen4.c: New file.
+ * sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c: New file.
+ * sysdeps/x86_64/fpu/test-float-vlen8-avx2.c: New file.
+ * sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c: New file.
+ * sysdeps/x86_64/fpu/test-float-vlen8.c: New file.
+
2015-06-09 Marko Myllynen <myllynen@redhat.com>
* locale/C-ctype.c (PREDEFINED_CLASSES): Remove.
diff --git a/math/Makefile b/math/Makefile
index 8f14f25..7f6b85e 100644
--- a/math/Makefile
+++ b/math/Makefile
@@ -160,6 +160,15 @@ CFLAGS-test-double-vlen4-wrappers.c = $(double-vlen4-arch-ext-cflags)
CFLAGS-test-double-vlen8.c = $(libm-test-vec-cflags)
CFLAGS-test-double-vlen8-wrappers.c = $(double-vlen8-arch-ext-cflags)
+CFLAGS-test-float-vlen4.c = $(libm-test-vec-cflags)
+CFLAGS-test-float-vlen4-wrappers.c = $(float-vlen4-arch-ext-cflags)
+
+CFLAGS-test-float-vlen8.c = $(libm-test-vec-cflags)
+CFLAGS-test-float-vlen8-wrappers.c = $(float-vlen8-arch-ext-cflags)
+
+CFLAGS-test-float-vlen16.c = $(libm-test-vec-cflags)
+CFLAGS-test-float-vlen16-wrappers.c = $(float-vlen16-arch-ext-cflags)
+
CFLAGS-test-float.c = -fno-inline -ffloat-store -fno-builtin
CFLAGS-test-double.c = -fno-inline -ffloat-store -fno-builtin
CFLAGS-test-ldouble.c = -fno-inline -ffloat-store -fno-builtin
diff --git a/math/test-double-vlen2.h b/math/test-double-vlen2.h
index 37d7060..2e8415b 100644
--- a/math/test-double-vlen2.h
+++ b/math/test-double-vlen2.h
@@ -45,7 +45,7 @@
#define WRAPPER_DECL(function) extern FLOAT function (FLOAT);
#define WRAPPER_DECL_ff(function) extern FLOAT function (FLOAT, FLOAT);
-// Wrapper from scalar to vector function with vector length 8.
+// Wrapper from scalar to vector function with vector length 2.
#define VECTOR_WRAPPER(scalar_func, vector_func) \
extern VEC_TYPE vector_func (VEC_TYPE); \
FLOAT scalar_func (FLOAT x) \
@@ -63,7 +63,7 @@ extern VEC_TYPE vector_func (VEC_TYPE, VEC_TYPE); \
FLOAT scalar_func (FLOAT x, FLOAT y) \
{ \
int i; \
- VEC_TYPE mx; \
+ VEC_TYPE mx, my; \
INIT_VEC_LOOP (mx, x, 2); \
INIT_VEC_LOOP (my, y, 2); \
VEC_TYPE mr = vector_func (mx, my); \
diff --git a/math/test-double-vlen2.h b/math/test-float-vlen16.h
similarity index 63%
copy from math/test-double-vlen2.h
copy to math/test-float-vlen16.h
index 37d7060..5c0a7a4 100644
--- a/math/test-double-vlen2.h
+++ b/math/test-float-vlen16.h
@@ -1,4 +1,4 @@
-/* Definitions for double vector tests with vector length 2.
+/* Definitions for float vector tests with vector length 16.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,56 +16,56 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FLOAT double
-#define FUNC(function) function
-#define TEST_MSG "testing double vector math (without inline functions)\n"
+#define FLOAT float
+#define FUNC(function) function ## f
+#define TEST_MSG "testing float vector math (without inline functions)\n"
#define MATHCONST(x) x
-#define CHOOSE(Clongdouble,Cdouble,Cfloat,Cinlinelongdouble,Cinlinedouble,Cinlinefloat) Cdouble
+#define CHOOSE(Clongdouble,Cdouble,Cfloat,Cinlinelongdouble,Cinlinedouble,Cinlinefloat) Cfloat
#define PRINTF_EXPR "e"
#define PRINTF_XEXPR "a"
#define PRINTF_NEXPR "f"
-#define TEST_DOUBLE 1
+#define TEST_FLOAT 1
#define TEST_MATHVEC 1
#ifndef __NO_MATH_INLINES
# define __NO_MATH_INLINES
#endif
-#define EXCEPTION_TESTS_double 0
-#define ROUNDING_TESTS_double(MODE) ((MODE) == FE_TONEAREST)
+#define EXCEPTION_TESTS_float 0
+#define ROUNDING_TESTS_float(MODE) ((MODE) == FE_TONEAREST)
#define CNCT(x, y) x ## y
#define CONCAT(a, b) CNCT (a, b)
-#define VEC_SUFF _vlen2
+#define VEC_SUFF _vlen16
#define WRAPPER_NAME(function) CONCAT (function, VEC_SUFF)
-#define FUNC_TEST(function) function ## _VEC_SUFF
+#define FUNC_TEST(function) function ## f ## _VEC_SUFF
-#define WRAPPER_DECL(function) extern FLOAT function (FLOAT);
-#define WRAPPER_DECL_ff(function) extern FLOAT function (FLOAT, FLOAT);
+#define WRAPPER_DECL(func) extern FLOAT func (FLOAT x);
+#define WRAPPER_DECL_ff(func) extern FLOAT func (FLOAT x, FLOAT y);
-// Wrapper from scalar to vector function with vector length 8.
+// Wrapper from scalar to vector function with vector length 16.
#define VECTOR_WRAPPER(scalar_func, vector_func) \
extern VEC_TYPE vector_func (VEC_TYPE); \
FLOAT scalar_func (FLOAT x) \
{ \
int i; \
VEC_TYPE mx; \
- INIT_VEC_LOOP (mx, x, 2); \
+ INIT_VEC_LOOP (mx, x, 16); \
VEC_TYPE mr = vector_func (mx); \
- TEST_VEC_LOOP (2); \
+ TEST_VEC_LOOP (16); \
}
// Wrapper from scalar 2 argument function to vector one.
-#define VECTOR_WRAPPER_ff(scalar_func, vector_func) \
-extern VEC_TYPE vector_func (VEC_TYPE, VEC_TYPE); \
+#define VECTOR_WRAPPER_ff(scalar_func, vector_func) \
+extern VEC_TYPE vector_func (VEC_TYPE, VEC_TYPE); \
FLOAT scalar_func (FLOAT x, FLOAT y) \
{ \
int i; \
- VEC_TYPE mx; \
- INIT_VEC_LOOP (mx, x, 2); \
- INIT_VEC_LOOP (my, y, 2); \
+ VEC_TYPE mx, my; \
+ INIT_VEC_LOOP (mx, x, 16); \
+ INIT_VEC_LOOP (my, y, 16); \
VEC_TYPE mr = vector_func (mx, my); \
- TEST_VEC_LOOP (2); \
+ TEST_VEC_LOOP (16); \
}
diff --git a/math/test-double-vlen2.h b/math/test-float-vlen4.h
similarity index 68%
copy from math/test-double-vlen2.h
copy to math/test-float-vlen4.h
index 37d7060..09485bc 100644
--- a/math/test-double-vlen2.h
+++ b/math/test-float-vlen4.h
@@ -1,4 +1,4 @@
-/* Definitions for double vector tests with vector length 2.
+/* Definitions for float vector tests with vector length 4.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,56 +16,56 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FLOAT double
-#define FUNC(function) function
-#define TEST_MSG "testing double vector math (without inline functions)\n"
+#define FLOAT float
+#define FUNC(function) function ## f
+#define TEST_MSG "testing float vector math (without inline functions)\n"
#define MATHCONST(x) x
-#define CHOOSE(Clongdouble,Cdouble,Cfloat,Cinlinelongdouble,Cinlinedouble,Cinlinefloat) Cdouble
+#define CHOOSE(Clongdouble,Cdouble,Cfloat,Cinlinelongdouble,Cinlinedouble,Cinlinefloat) Cfloat
#define PRINTF_EXPR "e"
#define PRINTF_XEXPR "a"
#define PRINTF_NEXPR "f"
-#define TEST_DOUBLE 1
+#define TEST_FLOAT 1
#define TEST_MATHVEC 1
#ifndef __NO_MATH_INLINES
# define __NO_MATH_INLINES
#endif
-#define EXCEPTION_TESTS_double 0
-#define ROUNDING_TESTS_double(MODE) ((MODE) == FE_TONEAREST)
+#define EXCEPTION_TESTS_float 0
+#define ROUNDING_TESTS_float(MODE) ((MODE) == FE_TONEAREST)
#define CNCT(x, y) x ## y
#define CONCAT(a, b) CNCT (a, b)
-#define VEC_SUFF _vlen2
+#define VEC_SUFF _vlen4
#define WRAPPER_NAME(function) CONCAT (function, VEC_SUFF)
-#define FUNC_TEST(function) function ## _VEC_SUFF
+#define FUNC_TEST(function) function ## f ## _VEC_SUFF
#define WRAPPER_DECL(function) extern FLOAT function (FLOAT);
#define WRAPPER_DECL_ff(function) extern FLOAT function (FLOAT, FLOAT);
-// Wrapper from scalar to vector function with vector length 8.
+// Wrapper from scalar to vector function with vector length 4.
#define VECTOR_WRAPPER(scalar_func, vector_func) \
extern VEC_TYPE vector_func (VEC_TYPE); \
FLOAT scalar_func (FLOAT x) \
{ \
int i; \
VEC_TYPE mx; \
- INIT_VEC_LOOP (mx, x, 2); \
+ INIT_VEC_LOOP (mx, x, 4); \
VEC_TYPE mr = vector_func (mx); \
- TEST_VEC_LOOP (2); \
+ TEST_VEC_LOOP (4); \
}
// Wrapper from scalar 2 argument function to vector one.
-#define VECTOR_WRAPPER_ff(scalar_func, vector_func) \
-extern VEC_TYPE vector_func (VEC_TYPE, VEC_TYPE); \
+#define VECTOR_WRAPPER_ff(scalar_func, vector_func) \
+extern VEC_TYPE vector_func (VEC_TYPE, VEC_TYPE); \
FLOAT scalar_func (FLOAT x, FLOAT y) \
{ \
int i; \
- VEC_TYPE mx; \
- INIT_VEC_LOOP (mx, x, 2); \
- INIT_VEC_LOOP (my, y, 2); \
+ VEC_TYPE mx, my; \
+ INIT_VEC_LOOP (mx, x, 4); \
+ INIT_VEC_LOOP (my, y, 4); \
VEC_TYPE mr = vector_func (mx, my); \
- TEST_VEC_LOOP (2); \
+ TEST_VEC_LOOP (4); \
}
diff --git a/math/test-double-vlen2.h b/math/test-float-vlen8.h
similarity index 71%
copy from math/test-double-vlen2.h
copy to math/test-float-vlen8.h
index 37d7060..d309931 100644
--- a/math/test-double-vlen2.h
+++ b/math/test-float-vlen8.h
@@ -1,4 +1,4 @@
-/* Definitions for double vector tests with vector length 2.
+/* Definitions for float vector tests with vector length 8.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,31 +16,31 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#define FLOAT double
-#define FUNC(function) function
-#define TEST_MSG "testing double vector math (without inline functions)\n"
+#define FLOAT float
+#define FUNC(function) function ## f
+#define TEST_MSG "testing float vector math (without inline functions)\n"
#define MATHCONST(x) x
-#define CHOOSE(Clongdouble,Cdouble,Cfloat,Cinlinelongdouble,Cinlinedouble,Cinlinefloat) Cdouble
+#define CHOOSE(Clongdouble,Cdouble,Cfloat,Cinlinelongdouble,Cinlinedouble,Cinlinefloat) Cfloat
#define PRINTF_EXPR "e"
#define PRINTF_XEXPR "a"
#define PRINTF_NEXPR "f"
-#define TEST_DOUBLE 1
+#define TEST_FLOAT 1
#define TEST_MATHVEC 1
#ifndef __NO_MATH_INLINES
# define __NO_MATH_INLINES
#endif
-#define EXCEPTION_TESTS_double 0
-#define ROUNDING_TESTS_double(MODE) ((MODE) == FE_TONEAREST)
+#define EXCEPTION_TESTS_float 0
+#define ROUNDING_TESTS_float(MODE) ((MODE) == FE_TONEAREST)
#define CNCT(x, y) x ## y
#define CONCAT(a, b) CNCT (a, b)
-#define VEC_SUFF _vlen2
+#define VEC_SUFF _vlen8
#define WRAPPER_NAME(function) CONCAT (function, VEC_SUFF)
-#define FUNC_TEST(function) function ## _VEC_SUFF
+#define FUNC_TEST(function) function ## f ## _VEC_SUFF
#define WRAPPER_DECL(function) extern FLOAT function (FLOAT);
#define WRAPPER_DECL_ff(function) extern FLOAT function (FLOAT, FLOAT);
@@ -52,20 +52,20 @@ FLOAT scalar_func (FLOAT x) \
{ \
int i; \
VEC_TYPE mx; \
- INIT_VEC_LOOP (mx, x, 2); \
+ INIT_VEC_LOOP (mx, x, 8); \
VEC_TYPE mr = vector_func (mx); \
- TEST_VEC_LOOP (2); \
+ TEST_VEC_LOOP (8); \
}
// Wrapper from scalar 2 argument function to vector one.
-#define VECTOR_WRAPPER_ff(scalar_func, vector_func) \
-extern VEC_TYPE vector_func (VEC_TYPE, VEC_TYPE); \
+#define VECTOR_WRAPPER_ff(scalar_func, vector_func) \
+extern VEC_TYPE vector_func (VEC_TYPE, VEC_TYPE); \
FLOAT scalar_func (FLOAT x, FLOAT y) \
{ \
int i; \
- VEC_TYPE mx; \
- INIT_VEC_LOOP (mx, x, 2); \
- INIT_VEC_LOOP (my, y, 2); \
+ VEC_TYPE mx, my; \
+ INIT_VEC_LOOP (mx, x, 8); \
+ INIT_VEC_LOOP (my, y, 8); \
VEC_TYPE mr = vector_func (mx, my); \
- TEST_VEC_LOOP (2); \
+ TEST_VEC_LOOP (8); \
}
diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
index af0d2af..454cfba 100644
--- a/sysdeps/x86_64/fpu/Makefile
+++ b/sysdeps/x86_64/fpu/Makefile
@@ -9,10 +9,11 @@ endif
# Variables for libmvec tests.
ifeq ($(subdir),math)
ifeq ($(build-mathvec),yes)
-libmvec-tests += double-vlen2 double-vlen4 double-vlen4-avx2
+libmvec-tests += double-vlen2 double-vlen4 double-vlen4-avx2 \
+ float-vlen4 float-vlen8 float-vlen8-avx2
ifeq (yes,$(config-cflags-avx512))
-libmvec-tests += double-vlen8
+libmvec-tests += double-vlen8 float-vlen16
endif
double-vlen2-arch-ext-cflags = -msse4
@@ -20,8 +21,16 @@ double-vlen4-arch-ext-cflags = -mavx
double-vlen4-arch-ext2-cflags = -mavx2
double-vlen8-arch-ext-cflags = -mavx512f
+float-vlen4-arch-ext-cflags = -msse4
+float-vlen8-arch-ext-cflags = -mavx
+float-vlen8-arch-ext2-cflags = -mavx2
+float-vlen16-arch-ext-cflags = -mavx512f
+
CFLAGS-test-double-vlen4-avx2.c = $(libm-test-vec-cflags)
CFLAGS-test-double-vlen4-avx2-wrappers.c = $(double-vlen4-arch-ext2-cflags)
+CFLAGS-test-float-vlen8-avx2.c = $(libm-test-vec-cflags)
+CFLAGS-test-float-vlen8-avx2-wrappers.c = $(float-vlen8-arch-ext2-cflags)
+
endif
endif
diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
index 0d1f6e8..ed152d8 100644
--- a/sysdeps/x86_64/fpu/libm-test-ulps
+++ b/sysdeps/x86_64/fpu/libm-test-ulps
@@ -965,17 +965,25 @@ idouble: 1
ildouble: 2
ldouble: 2
+Function: "cos_vlen16":
+float: 1
+
Function: "cos_vlen2":
double: 1
Function: "cos_vlen4":
double: 1
+float: 1
Function: "cos_vlen4_avx2":
double: 1
Function: "cos_vlen8":
double: 1
+float: 1
+
+Function: "cos_vlen8_avx2":
+float: 1
Function: "cosh":
double: 1
diff --git a/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
new file mode 100644
index 0000000..2bb155f
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
@@ -0,0 +1,25 @@
+/* Wrapper part of tests for AVX-512 ISA versions of vector math functions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "test-float-vlen16.h"
+#include "test-vec-loop.h"
+#include <immintrin.h>
+
+#define VEC_TYPE __m512
+
+VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVeN16v_cosf)
diff --git a/sysdeps/x86_64/fpu/test-float-vlen16.c b/sysdeps/x86_64/fpu/test-float-vlen16.c
new file mode 100644
index 0000000..a664ad9
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-vlen16.c
@@ -0,0 +1,25 @@
+/* Tests for AVX-512 ISA versions of vector math functions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "test-float-vlen16.h"
+
+#define TEST_VECTOR_cosf 1
+
+#define REQUIRE_AVX512F
+
+#include "libm-test.c"
diff --git a/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
new file mode 100644
index 0000000..05d6a40
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
@@ -0,0 +1,25 @@
+/* Wrapper part of tests for SSE ISA versions of vector math functions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "test-float-vlen4.h"
+#include "test-vec-loop.h"
+#include <immintrin.h>
+
+#define VEC_TYPE __m128
+
+VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVbN4v_cosf)
diff --git a/sysdeps/x86_64/fpu/test-float-vlen4.c b/sysdeps/x86_64/fpu/test-float-vlen4.c
new file mode 100644
index 0000000..8946520
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-vlen4.c
@@ -0,0 +1,23 @@
+/* Tests for SSE ISA versions of vector math functions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "test-float-vlen4.h"
+
+#define TEST_VECTOR_cosf 1
+
+#include "libm-test.c"
diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
new file mode 100644
index 0000000..cff9941
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
@@ -0,0 +1,28 @@
+/* Wrapper part of tests for AVX2 ISA versions of vector math functions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "test-float-vlen8.h"
+#include "test-vec-loop.h"
+#include <immintrin.h>
+
+#undef VEC_SUFF
+#define VEC_SUFF _vlen8_avx2
+
+#define VEC_TYPE __m256
+
+VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVdN8v_cosf)
diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-avx2.c b/sysdeps/x86_64/fpu/test-float-vlen8-avx2.c
new file mode 100644
index 0000000..f0ee6f2
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-avx2.c
@@ -0,0 +1,28 @@
+/* Tests for AVX2 ISA versions of vector math functions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "test-float-vlen8.h"
+
+#undef VEC_SUFF
+#define VEC_SUFF _vlen8_avx2
+
+#define TEST_VECTOR_cosf 1
+
+#define REQUIRE_AVX2
+
+#include "libm-test.c"
diff --git a/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
new file mode 100644
index 0000000..c2305a3
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
@@ -0,0 +1,25 @@
+/* Wrapper part of tests for AVX ISA versions of vector math functions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "test-float-vlen8.h"
+#include "test-vec-loop.h"
+#include <immintrin.h>
+
+#define VEC_TYPE __m256
+
+VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVcN8v_cosf)
diff --git a/sysdeps/x86_64/fpu/test-float-vlen8.c b/sysdeps/x86_64/fpu/test-float-vlen8.c
new file mode 100644
index 0000000..b96dec6
--- /dev/null
+++ b/sysdeps/x86_64/fpu/test-float-vlen8.c
@@ -0,0 +1,23 @@
+/* Tests for AVX ISA versions of vector math functions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "test-float-vlen8.h"
+
+#define TEST_VECTOR_cosf 1
+
+#include "libm-test.c"
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=04f496d6025753058bdd071fd711e9f56df149a7
commit 04f496d6025753058bdd071fd711e9f56df149a7
Author: Andrew Senkevich <andrew.senkevich@intel.com>
Date: Tue Jun 9 18:29:47 2015 +0300
Vector cosf for x86_64.
Here is implementation of vectorized cosf containing SSE, AVX,
AVX2 and AVX512 versions according to Vector ABI
<https://groups.google.com/forum/#!topic/x86-64-abi/LmppCfN1rZ4>.
* sysdeps/x86_64/fpu/Makefile (libmvec-support): Added new files.
* sysdeps/x86_64/fpu/Versions: New versions added.
* sysdeps/x86_64/fpu/svml_s_cosf4_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S: New file.
* sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S: New file.
* sysdeps/x86_64/fpu/svml_s_cosf8_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S: New file.
* sysdeps/x86_64/fpu/svml_s_cosf16_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S: New file.
* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: New file.
* sysdeps/x86_64/fpu/svml_s_cosf_data.S: New file.
* sysdeps/x86_64/fpu/svml_s_cosf_data.h: New file.
* sysdeps/x86_64/fpu/multiarch/Makefile (libmvec-sysdep_routines): Added
build of SSE, AVX2 and AVX512 IFUNC versions.
* sysdeps/unix/sysv/linux/x86_64/libmvec.abilist: New versions added.
* sysdeps/x86/fpu/bits/math-vector.h: Added SIMD declaration for cosf.
* NEWS: Mention addition of x86_64 vector cosf.
diff --git a/ChangeLog b/ChangeLog
index 47318a7..c3e52b2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -47,6 +47,27 @@
* sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c: New file.
* sysdeps/x86_64/fpu/test-double-vlen8.c: New file.
+ * sysdeps/x86_64/fpu/Makefile (libmvec-support): Added new files.
+ * sysdeps/x86_64/fpu/Versions: New versions added.
+ * sysdeps/x86_64/fpu/svml_s_cosf4_core.S: New file.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S: New file.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S: New file.
+ * sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S: New file.
+ * sysdeps/x86_64/fpu/svml_s_cosf8_core.S: New file.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S: New file.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S: New file.
+ * sysdeps/x86_64/fpu/svml_s_cosf16_core.S: New file.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: New file.
+ * sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S: New file.
+ * sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: New file.
+ * sysdeps/x86_64/fpu/svml_s_cosf_data.S: New file.
+ * sysdeps/x86_64/fpu/svml_s_cosf_data.h: New file.
+ * sysdeps/x86_64/fpu/multiarch/Makefile (libmvec-sysdep_routines): Added
+ build of SSE, AVX2 and AVX512 IFUNC versions.
+ * sysdeps/unix/sysv/linux/x86_64/libmvec.abilist: New versions added.
+ * sysdeps/x86/fpu/bits/math-vector.h: Added SIMD declaration for cosf.
+ * NEWS: Mention addition of x86_64 vector cosf.
+
2015-06-09 Marko Myllynen <myllynen@redhat.com>
* locale/C-ctype.c (PREDEFINED_CLASSES): Remove.
diff --git a/NEWS b/NEWS
index 5e223a1..53f244d 100644
--- a/NEWS
+++ b/NEWS
@@ -52,7 +52,7 @@ Version 2.22
condition in some applications.
* Added vector math library named libmvec with the following vectorized x86_64
- implementations: cos.
+ implementations: cos, cosf.
The library can be disabled with --disable-mathvec. Use of the functions is
enabled with -fopenmp -ffast-math starting from -O1 for GCC version >= 4.9.0.
The library is linked in as needed when using -lm (no need to specify -lmvec
diff --git a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
index be6eaed..acabb8a 100644
--- a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
+++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
@@ -1,6 +1,10 @@
GLIBC_2.22
GLIBC_2.22 A
_ZGVbN2v_cos F
+ _ZGVbN4v_cosf F
_ZGVcN4v_cos F
+ _ZGVcN8v_cosf F
_ZGVdN4v_cos F
+ _ZGVdN8v_cosf F
+ _ZGVeN16v_cosf F
_ZGVeN8v_cos F
diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86/fpu/bits/math-vector.h
index 27294ce..b3ef833 100644
--- a/sysdeps/x86/fpu/bits/math-vector.h
+++ b/sysdeps/x86/fpu/bits/math-vector.h
@@ -30,5 +30,7 @@
# define __DECL_SIMD_x86_64 _Pragma ("omp declare simd notinbranch")
# undef __DECL_SIMD_cos
# define __DECL_SIMD_cos __DECL_SIMD_x86_64
+# undef __DECL_SIMD_cosf
+# define __DECL_SIMD_cosf __DECL_SIMD_x86_64
# endif
#endif
diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
index 2d5fcf8..af0d2af 100644
--- a/sysdeps/x86_64/fpu/Makefile
+++ b/sysdeps/x86_64/fpu/Makefile
@@ -1,7 +1,9 @@
ifeq ($(subdir),mathvec)
libmvec-support += svml_d_cos2_core svml_d_cos4_core_avx \
svml_d_cos4_core svml_d_cos8_core \
- svml_d_cos_data init-arch
+ svml_d_cos_data svml_s_cosf4_core svml_s_cosf8_core_avx \
+ svml_s_cosf8_core svml_s_cosf16_core svml_s_cosf_data \
+ init-arch
endif
# Variables for libmvec tests.
diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions
index 4290e73..f85c28b 100644
--- a/sysdeps/x86_64/fpu/Versions
+++ b/sysdeps/x86_64/fpu/Versions
@@ -1,5 +1,6 @@
libmvec {
GLIBC_2.22 {
_ZGVbN2v_cos; _ZGVcN4v_cos; _ZGVdN4v_cos; _ZGVeN8v_cos;
+ _ZGVbN4v_cosf; _ZGVcN8v_cosf; _ZGVdN8v_cosf; _ZGVeN16v_cosf;
}
}
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index b2f3266..6b50475 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -54,5 +54,6 @@ endif
ifeq ($(subdir),mathvec)
libmvec-sysdep_routines += svml_d_cos2_core_sse4 svml_d_cos4_core_avx2 \
- svml_d_cos8_core_avx512
+ svml_d_cos8_core_avx512 svml_s_cosf4_core_sse4 \
+ svml_s_cosf8_core_avx2 svml_s_cosf16_core_avx512
endif
diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
similarity index 50%
copy from sysdeps/x86/fpu/bits/math-vector.h
copy to sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
index 27294ce..91564de 100644
--- a/sysdeps/x86/fpu/bits/math-vector.h
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
@@ -1,4 +1,4 @@
-/* Platform-specific SIMD declarations of math functions.
+/* Multiple versions of vectorized cosf.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,19 +16,24 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#ifndef _MATH_H
-# error "Never include <bits/math-vector.h> directly;\
- include <math.h> instead."
-#endif
+#include <sysdep.h>
+#include <init-arch.h>
-/* Get default empty definitions for simd declarations. */
-#include <bits/libm-simd-decl-stubs.h>
+ .text
+ENTRY (_ZGVeN16v_cosf)
+ .type _ZGVeN16v_cosf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVeN16v_cosf_skx(%rip), %rax
+ testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+ jnz 3f
+2: leaq _ZGVeN16v_cosf_knl(%rip), %rax
+ testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+ jnz 3f
+ leaq _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
+3: ret
+END (_ZGVeN16v_cosf)
-#if defined __x86_64__ && defined __FAST_MATH__
-# if defined _OPENMP && _OPENMP >= 201307
-/* OpenMP case. */
-# define __DECL_SIMD_x86_64 _Pragma ("omp declare simd notinbranch")
-# undef __DECL_SIMD_cos
-# define __DECL_SIMD_cos __DECL_SIMD_x86_64
-# endif
-#endif
+#define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper
+#include "../svml_s_cosf16_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
new file mode 100644
index 0000000..a78ae2e
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
@@ -0,0 +1,460 @@
+/* Function cosf vectorized with AVX-512. KNL and SKX versions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_cosf_data.h"
+#include "svml_s_wrapper_impl.h"
+
+ .text
+ENTRY (_ZGVeN16v_cosf_knl)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+/* Assembler has no AVX-512 support: implement the 16-lane entry as a
+ wrapper that calls the 8-lane AVX2 version twice. */
+WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ a) We remove sign using AND operation
+ b) Add Pi/2 value to argument X for Cos to Sin transformation
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position
+ f) Subtract "Right Shifter" value
+ g) Subtract 0.5 from result for octant correction
+ h) Subtract Y*PI from X argument, where PI divided to 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + .....
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+/* 1280-byte scratch frame: zmm31..zmm16 spills at 0..1023, k7..k4 at
+ 1024..1048, rdi/rsi at 1056/1064, r15..r12 at 1072..1096, input
+ vector at 1152, result vector at 1216. */
+ subq $1280, %rsp
+ movq __svml_scos_data@GOTPCREL(%rip), %rdx
+
+/*
+ h) Subtract Y*PI from X argument, where PI divided to 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3
+ */
+ vmovaps %zmm0, %zmm6
+ movl $-1, %eax
+
+/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
+ vaddps __sHalfPI(%rdx), %zmm0, %zmm2
+ vmovups __sRShifter(%rdx), %zmm3
+
+/*
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" (0x4B000000) value
+ */
+ vfmadd132ps __sInvPI(%rdx), %zmm3, %zmm2
+ vmovups __sPI1_FMA(%rdx), %zmm5
+
+/* f) Subtract "Right Shifter" (0x4B000000) value */
+ vsubps %zmm3, %zmm2, %zmm4
+ vmovups __sA9_FMA(%rdx), %zmm9
+
+/* Check for large and special arguments */
+ vpandd __sAbsMask(%rdx), %zmm0, %zmm1
+
+/*
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position (S << 31)
+ */
+ vpslld $31, %zmm2, %zmm8
+/* Predicate 22 is _CMP_NLE_UQ: k1 selects lanes whose |x| is not <=
+ the range-reduction threshold (or is NaN); those lanes need the
+ scalar fallback below. */
+ vcmpps $22, __sRangeReductionVal(%rdx), %zmm1, %k1
+ vpbroadcastd %eax, %zmm12{%k1}{z}
+
+/* g) Subtract 0.5 from result for octant correction */
+ vsubps __sOneHalf(%rdx), %zmm4, %zmm7
+ vptestmd %zmm12, %zmm12, %k0
+ vfnmadd231ps %zmm7, %zmm5, %zmm6
+/* ecx = 16-bit mask of special lanes. */
+ kmovw %k0, %ecx
+ vfnmadd231ps __sPI2_FMA(%rdx), %zmm7, %zmm6
+ vfnmadd132ps __sPI3_FMA(%rdx), %zmm6, %zmm7
+
+/* a) Calculate X^2 = X * X */
+ vmulps %zmm7, %zmm7, %zmm10
+
+/*
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ vpxord %zmm8, %zmm7, %zmm11
+
+/*
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
+ */
+ vfmadd213ps __sA7_FMA(%rdx), %zmm10, %zmm9
+ vfmadd213ps __sA5_FMA(%rdx), %zmm10, %zmm9
+ vfmadd213ps __sA3(%rdx), %zmm10, %zmm9
+ vmulps %zmm10, %zmm9, %zmm1
+ vfmadd213ps %zmm11, %zmm11, %zmm1
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+/* Slow path: spill input and vector result, save all call-clobbered
+ state, then recompute the flagged lanes with scalar cosf. */
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm1, 1216(%rsp)
+/* NOTE(review): this je is never taken — the block is reached only
+ via the jne above and vmovups does not modify flags; it appears to
+ be dead compiler-generated code. */
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ kmovw %k4, 1048(%rsp)
+ xorl %eax, %eax
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+/* Fixup loop: r12b counts iterations, r14d is the lane index; each
+ pass tests special-mask (r13d) bits r14 and r14+1. */
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ addb $1, %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ kmovw 1048(%rsp), %k4
+ movq 1064(%rsp), %rsi
+ kmovw 1040(%rsp), %k5
+ movq 1056(%rsp), %rdi
+ kmovw 1032(%rsp), %k6
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ vmovups 1216(%rsp), %zmm1
+ jmp .LBL_1_2
+
+/* Odd lane of the pair: element at byte offset 4 within the stride-8
+ slot (1152 + r15*8 + 4). */
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 1156(%rsp,%r15,8), %xmm0
+ call cosf@PLT
+ vmovss %xmm0, 1220(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+/* Even lane of the pair. */
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ vmovss 1152(%rsp,%r15,8), %xmm0
+ call cosf@PLT
+ vmovss %xmm0, 1216(%rsp,%r15,8)
+ jmp .LBL_1_7
+#endif
+END (_ZGVeN16v_cosf_knl)
+
+ENTRY (_ZGVeN16v_cosf_skx)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+/* Assembler has no AVX-512 support: implement the 16-lane entry as a
+ wrapper that calls the 8-lane AVX2 version twice. */
+WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
+#else
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ a) We remove sign using AND operation
+ b) Add Pi/2 value to argument X for Cos to Sin transformation
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position
+ f) Subtract "Right Shifter" value
+ g) Subtract 0.5 from result for octant correction
+ h) Subtract Y*PI from X argument, where PI divided to 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + .....
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+/* Same 1280-byte scratch-frame layout as the KNL variant: zmm spills
+ at 0..1023, k4-k7 and GPR saves above, input at 1152, result at
+ 1216. */
+ subq $1280, %rsp
+ movq __svml_scos_data@GOTPCREL(%rip), %rax
+
+/*
+ h) Subtract Y*PI from X argument, where PI divided to 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3
+ */
+ vmovaps %zmm0, %zmm6
+/* zmm12 starts as all-ones (see .L_2il0floatpacket.13 below). */
+ vmovups .L_2il0floatpacket.13(%rip), %zmm12
+ vmovups __sRShifter(%rax), %zmm3
+ vmovups __sPI1_FMA(%rax), %zmm5
+ vmovups __sA9_FMA(%rax), %zmm9
+
+/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
+ vaddps __sHalfPI(%rax), %zmm0, %zmm2
+
+/* Check for large and special arguments */
+ vandps __sAbsMask(%rax), %zmm0, %zmm1
+
+/*
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" (0x4B000000) value
+ */
+ vfmadd132ps __sInvPI(%rax), %zmm3, %zmm2
+/* Predicate 18 is _CMP_LE_OS: k1 selects the in-range lanes; the
+ vpandnd below clears those lanes in the all-ones zmm12, leaving
+ nonzero entries only for large/special lanes. */
+ vcmpps $18, __sRangeReductionVal(%rax), %zmm1, %k1
+
+/*
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position (S << 31)
+ */
+ vpslld $31, %zmm2, %zmm8
+
+/* f) Subtract "Right Shifter" (0x4B000000) value */
+ vsubps %zmm3, %zmm2, %zmm4
+
+/* g) Subtract 0.5 from result for octant correction */
+ vsubps __sOneHalf(%rax), %zmm4, %zmm7
+ vfnmadd231ps %zmm7, %zmm5, %zmm6
+ vfnmadd231ps __sPI2_FMA(%rax), %zmm7, %zmm6
+ vfnmadd132ps __sPI3_FMA(%rax), %zmm6, %zmm7
+
+/* a) Calculate X^2 = X * X */
+ vmulps %zmm7, %zmm7, %zmm10
+
+/*
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ vxorps %zmm8, %zmm7, %zmm11
+
+/*
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
+ */
+ vfmadd213ps __sA7_FMA(%rax), %zmm10, %zmm9
+ vfmadd213ps __sA5_FMA(%rax), %zmm10, %zmm9
+ vfmadd213ps __sA3(%rax), %zmm10, %zmm9
+ vpandnd %zmm1, %zmm1, %zmm12{%k1}
+ vmulps %zmm10, %zmm9, %zmm1
+ vptestmd %zmm12, %zmm12, %k0
+ vfmadd213ps %zmm11, %zmm11, %zmm1
+/* ecx = 16-bit mask of special lanes. */
+ kmovw %k0, %ecx
+ testl %ecx, %ecx
+ jne .LBL_2_3
+.LBL_2_2:
+ cfi_remember_state
+ vmovaps %zmm1, %zmm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_2_3:
+ cfi_restore_state
+/* Slow path: spill input and vector result, save all call-clobbered
+ state, then recompute the flagged lanes with scalar cosf. */
+ vmovups %zmm0, 1152(%rsp)
+ vmovups %zmm1, 1216(%rsp)
+/* NOTE(review): this je is never taken — the block is reached only
+ via the jne above and vmovups does not modify flags; it appears to
+ be dead compiler-generated code. */
+ je .LBL_2_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ kmovw %k4, 1048(%rsp)
+ kmovw %k5, 1040(%rsp)
+ kmovw %k6, 1032(%rsp)
+ kmovw %k7, 1024(%rsp)
+ vmovups %zmm16, 960(%rsp)
+ vmovups %zmm17, 896(%rsp)
+ vmovups %zmm18, 832(%rsp)
+ vmovups %zmm19, 768(%rsp)
+ vmovups %zmm20, 704(%rsp)
+ vmovups %zmm21, 640(%rsp)
+ vmovups %zmm22, 576(%rsp)
+ vmovups %zmm23, 512(%rsp)
+ vmovups %zmm24, 448(%rsp)
+ vmovups %zmm25, 384(%rsp)
+ vmovups %zmm26, 320(%rsp)
+ vmovups %zmm27, 256(%rsp)
+ vmovups %zmm28, 192(%rsp)
+ vmovups %zmm29, 128(%rsp)
+ vmovups %zmm30, 64(%rsp)
+ vmovups %zmm31, (%rsp)
+ movq %rsi, 1064(%rsp)
+ movq %rdi, 1056(%rsp)
+ movq %r12, 1096(%rsp)
+ cfi_offset_rel_rsp (12, 1096)
+ movb %dl, %r12b
+ movq %r13, 1088(%rsp)
+ cfi_offset_rel_rsp (13, 1088)
+ movl %ecx, %r13d
+ movq %r14, 1080(%rsp)
+ cfi_offset_rel_rsp (14, 1080)
+ movl %eax, %r14d
+ movq %r15, 1072(%rsp)
+ cfi_offset_rel_rsp (15, 1072)
+ cfi_remember_state
+
+/* Fixup loop: r12b counts iterations, r14d is the lane index; each
+ pass tests special-mask (r13d) bits r14 and r14+1. */
+.LBL_2_6:
+ btl %r14d, %r13d
+ jc .LBL_2_12
+.LBL_2_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_2_10
+.LBL_2_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_2_6
+ kmovw 1048(%rsp), %k4
+ kmovw 1040(%rsp), %k5
+ kmovw 1032(%rsp), %k6
+ kmovw 1024(%rsp), %k7
+ vmovups 960(%rsp), %zmm16
+ vmovups 896(%rsp), %zmm17
+ vmovups 832(%rsp), %zmm18
+ vmovups 768(%rsp), %zmm19
+ vmovups 704(%rsp), %zmm20
+ vmovups 640(%rsp), %zmm21
+ vmovups 576(%rsp), %zmm22
+ vmovups 512(%rsp), %zmm23
+ vmovups 448(%rsp), %zmm24
+ vmovups 384(%rsp), %zmm25
+ vmovups 320(%rsp), %zmm26
+ vmovups 256(%rsp), %zmm27
+ vmovups 192(%rsp), %zmm28
+ vmovups 128(%rsp), %zmm29
+ vmovups 64(%rsp), %zmm30
+ vmovups (%rsp), %zmm31
+ vmovups 1216(%rsp), %zmm1
+ movq 1064(%rsp), %rsi
+ movq 1056(%rsp), %rdi
+ movq 1096(%rsp), %r12
+ cfi_restore (%r12)
+ movq 1088(%rsp), %r13
+ cfi_restore (%r13)
+ movq 1080(%rsp), %r14
+ cfi_restore (%r14)
+ movq 1072(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_2_2
+
+/* Odd lane of the pair (offset 1156 = 1152 + 4). NOTE(review): the
+ vmovss load is repeated after vzeroupper; the second load is
+ redundant but harmless. */
+.LBL_2_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 1156(%rsp,%r15,8), %xmm0
+ vzeroupper
+ vmovss 1156(%rsp,%r15,8), %xmm0
+ call cosf@PLT
+ vmovss %xmm0, 1220(%rsp,%r15,8)
+ jmp .LBL_2_8
+/* Even lane of the pair (same redundant reload pattern). */
+.LBL_2_12:
+ movzbl %r12b, %r15d
+ vmovss 1152(%rsp,%r15,8), %xmm0
+ vzeroupper
+ vmovss 1152(%rsp,%r15,8), %xmm0
+ call cosf@PLT
+ vmovss %xmm0, 1216(%rsp,%r15,8)
+ jmp .LBL_2_7
+#endif
+END (_ZGVeN16v_cosf_skx)
+
+ .section .rodata, "a"
+.L_2il0floatpacket.13:
+ .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
+ .type .L_2il0floatpacket.13,@object
diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S
similarity index 55%
copy from sysdeps/x86/fpu/bits/math-vector.h
copy to sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S
index 27294ce..fa2363b 100644
--- a/sysdeps/x86/fpu/bits/math-vector.h
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S
@@ -1,4 +1,4 @@
-/* Platform-specific SIMD declarations of math functions.
+/* Multiple versions of vectorized cosf, vector length is 4.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,19 +16,23 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#ifndef _MATH_H
-# error "Never include <bits/math-vector.h> directly;\
- include <math.h> instead."
-#endif
+#include <sysdep.h>
+#include <init-arch.h>
-/* Get default empty definitions for simd declarations. */
-#include <bits/libm-simd-decl-stubs.h>
+ .text
+ENTRY (_ZGVbN4v_cosf)
+/* IFUNC resolver for the 4-lane cosf: return the SSE4.1
+ implementation when the CPU supports it, otherwise the SSE2
+ wrapper. __init_cpu_features is called lazily on first use. */
+ .type _ZGVbN4v_cosf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVbN4v_cosf_sse4(%rip), %rax
+ testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVbN4v_cosf_sse2(%rip), %rax
+ ret
+END (_ZGVbN4v_cosf)
+libmvec_hidden_def (_ZGVbN4v_cosf)
-#if defined __x86_64__ && defined __FAST_MATH__
-# if defined _OPENMP && _OPENMP >= 201307
-/* OpenMP case. */
-# define __DECL_SIMD_x86_64 _Pragma ("omp declare simd notinbranch")
-# undef __DECL_SIMD_cos
-# define __DECL_SIMD_cos __DECL_SIMD_x86_64
-# endif
-#endif
+#define _ZGVbN4v_cosf _ZGVbN4v_cosf_sse2
+#include "../svml_s_cosf4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S
new file mode 100644
index 0000000..f231ba2
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S
@@ -0,0 +1,227 @@
+/* Function cosf vectorized with SSE4.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "svml_s_cosf_data.h"
+
+ .text
+ENTRY (_ZGVbN4v_cosf_sse4)
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ a) We remove sign using AND operation
+ b) Add Pi/2 value to argument X for Cos to Sin transformation
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position
+ f) Subtract "Right Shifter" value
+ g) Subtract 0.5 from result for octant correction
+ h) Subtract Y*PI from X argument, where PI divided to 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + .....
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+/* 320-byte scratch frame: xmm15..xmm8 spills at 0..112, rdi/rsi at
+ 128/136, r15..r12 at 144..168, input vector at 192, result at 256. */
+ subq $320, %rsp
+ movaps %xmm0, %xmm4
+ movq __svml_scos_data@GOTPCREL(%rip), %rax
+ movups __sHalfPI(%rax), %xmm1
+ movups __sRShifter(%rax), %xmm5
+
+/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
+ addps %xmm4, %xmm1
+
+/*
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" (0x4B000000) value
+ */
+ mulps __sInvPI(%rax), %xmm1
+ movups __sPI1(%rax), %xmm6
+ addps %xmm5, %xmm1
+
+/*
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position (S << 31)
+ */
+ movaps %xmm1, %xmm2
+
+/* f) Subtract "Right Shifter" (0x4B000000) value */
+ subps %xmm5, %xmm1
+ movups __sPI2(%rax), %xmm7
+ pslld $31, %xmm2
+ movups __sPI3(%rax), %xmm5
+ movups __sAbsMask(%rax), %xmm3
+
+/* Check for large and special arguments */
+ andps %xmm4, %xmm3
+
+/* g) Subtract 0.5 from result for octant correction */
+ subps __sOneHalf(%rax), %xmm1
+/* xmm3 lanes become all-ones where |x| exceeds the range-reduction
+ threshold (or is NaN) — those lanes take the scalar fallback. */
+ cmpnleps __sRangeReductionVal(%rax), %xmm3
+
+/*
+ h) Subtract Y*PI from X argument, where PI divided to 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ */
+ mulps %xmm1, %xmm6
+ mulps %xmm1, %xmm7
+ mulps %xmm1, %xmm5
+ subps %xmm6, %xmm0
+/* ecx = 4-bit mask of special lanes. */
+ movmskps %xmm3, %ecx
+ movups __sPI4(%rax), %xmm6
+ subps %xmm7, %xmm0
+ mulps %xmm6, %xmm1
+ subps %xmm5, %xmm0
+ subps %xmm1, %xmm0
+
+/* a) Calculate X^2 = X * X */
+ movaps %xmm0, %xmm1
+ mulps %xmm0, %xmm1
+
+/*
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ xorps %xmm2, %xmm0
+ movups __sA9(%rax), %xmm2
+
+/*
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
+ */
+ mulps %xmm1, %xmm2
+ addps __sA7(%rax), %xmm2
+ mulps %xmm1, %xmm2
+ addps __sA5(%rax), %xmm2
+ mulps %xmm1, %xmm2
+ addps __sA3(%rax), %xmm2
+ mulps %xmm2, %xmm1
+ mulps %xmm0, %xmm1
+ addps %xmm1, %xmm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+/* Slow path: spill input and vector result, save call-clobbered
+ state, then recompute flagged lanes with scalar cosf. */
+ movups %xmm4, 192(%rsp)
+ movups %xmm0, 256(%rsp)
+/* NOTE(review): this je is never taken — the block is reached only
+ via the jne above and movups does not modify flags; it appears to
+ be dead compiler-generated code. */
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ movups %xmm8, 112(%rsp)
+ movups %xmm9, 96(%rsp)
+ movups %xmm10, 80(%rsp)
+ movups %xmm11, 64(%rsp)
+ movups %xmm12, 48(%rsp)
+ movups %xmm13, 32(%rsp)
+ movups %xmm14, 16(%rsp)
+ movups %xmm15, (%rsp)
+ movq %rsi, 136(%rsp)
+ movq %rdi, 128(%rsp)
+ movq %r12, 168(%rsp)
+ cfi_offset_rel_rsp (12, 168)
+ movb %dl, %r12b
+ movq %r13, 160(%rsp)
+ cfi_offset_rel_rsp (13, 160)
+ movl %ecx, %r13d
+ movq %r14, 152(%rsp)
+ cfi_offset_rel_rsp (14, 152)
+ movl %eax, %r14d
+ movq %r15, 144(%rsp)
+ cfi_offset_rel_rsp (15, 144)
+ cfi_remember_state
+
+/* Fixup loop: two lanes per pass; runs 16 iterations although only
+ mask bits 0..3 can be set (movmskps of an xmm yields 4 bits), so
+ the later iterations are no-ops. */
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ movups 112(%rsp), %xmm8
+ movups 96(%rsp), %xmm9
+ movups 80(%rsp), %xmm10
+ movups 64(%rsp), %xmm11
+ movups 48(%rsp), %xmm12
+ movups 32(%rsp), %xmm13
+ movups 16(%rsp), %xmm14
+ movups (%rsp), %xmm15
+ movq 136(%rsp), %rsi
+ movq 128(%rsp), %rdi
+ movq 168(%rsp), %r12
+ cfi_restore (%r12)
+ movq 160(%rsp), %r13
+ cfi_restore (%r13)
+ movq 152(%rsp), %r14
+ cfi_restore (%r14)
+ movq 144(%rsp), %r15
+ cfi_restore (%r15)
+ movups 256(%rsp), %xmm0
+ jmp .LBL_1_2
+
+/* Odd lane of the pair (offset 196 = 192 + 4). */
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ movss 196(%rsp,%r15,8), %xmm0
+
+ call cosf@PLT
+
+ movss %xmm0, 260(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+/* Even lane of the pair. */
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ movss 192(%rsp,%r15,8), %xmm0
+
+ call cosf@PLT
+
+ movss %xmm0, 256(%rsp,%r15,8)
+ jmp .LBL_1_7
+END (_ZGVbN4v_cosf_sse4)
diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S
similarity index 50%
copy from sysdeps/x86/fpu/bits/math-vector.h
copy to sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S
index 27294ce..e14bba4 100644
--- a/sysdeps/x86/fpu/bits/math-vector.h
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S
@@ -1,4 +1,4 @@
-/* Platform-specific SIMD declarations of math functions.
+/* Multiple versions of vectorized cosf, vector length is 8.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -10,25 +10,29 @@
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
+ Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#ifndef _MATH_H
-# error "Never include <bits/math-vector.h> directly;\
- include <math.h> instead."
-#endif
+#include <sysdep.h>
+#include <init-arch.h>
-/* Get default empty definitions for simd declarations. */
-#include <bits/libm-simd-decl-stubs.h>
+ .text
+ENTRY (_ZGVdN8v_cosf)
+/* IFUNC resolver for the 8-lane cosf: return the AVX2
+ implementation when usable, otherwise the SSE-based wrapper.
+ __init_cpu_features is called lazily on first use. */
+ .type _ZGVdN8v_cosf, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq _ZGVdN8v_cosf_avx2(%rip), %rax
+ testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+ jz 2f
+ ret
+2: leaq _ZGVdN8v_cosf_sse_wrapper(%rip), %rax
+ ret
+END (_ZGVdN8v_cosf)
+libmvec_hidden_def (_ZGVdN8v_cosf)
-#if defined __x86_64__ && defined __FAST_MATH__
-# if defined _OPENMP && _OPENMP >= 201307
-/* OpenMP case. */
-# define __DECL_SIMD_x86_64 _Pragma ("omp declare simd notinbranch")
-# undef __DECL_SIMD_cos
-# define __DECL_SIMD_cos __DECL_SIMD_x86_64
-# endif
-#endif
+#define _ZGVdN8v_cosf _ZGVdN8v_cosf_sse_wrapper
+#include "../svml_s_cosf8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S
new file mode 100644
index 0000000..6c25e14
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S
@@ -0,0 +1,215 @@
+/* Function cosf vectorized with AVX2.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+#include "svml_s_cosf_data.h"
+
+ .text
+ENTRY (_ZGVdN8v_cosf_avx2)
+/*
+ ALGORITHM DESCRIPTION:
+
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ a) We remove sign using AND operation
+ b) Add Pi/2 value to argument X for Cos to Sin transformation
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position
+ f) Subtract "Right Shifter" value
+ g) Subtract 0.5 from result for octant correction
+ h) Subtract Y*PI from X argument, where PI divided to 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + .....
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+/* 448-byte scratch frame: ymm15..ymm8 spills at 0..224, rdi/rsi at
+ 256/264, r15..r12 at 272..296, input vector at 320, result at 384. */
+ subq $448, %rsp
+ movq __svml_scos_data@GOTPCREL(%rip), %rax
+ vmovaps %ymm0, %ymm2
+ vmovups __sRShifter(%rax), %ymm5
+ vmovups __sPI1_FMA(%rax), %ymm7
+
+/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
+ vaddps __sHalfPI(%rax), %ymm2, %ymm4
+
+/*
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" (0x4B000000) value
+ */
+ vfmadd132ps __sInvPI(%rax), %ymm5, %ymm4
+
+/* f) Subtract "Right Shifter" (0x4B000000) value */
+ vsubps %ymm5, %ymm4, %ymm6
+
+/*
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position (S << 31)
+ */
+ vpslld $31, %ymm4, %ymm0
+
+/* g) Subtract 0.5 from result for octant correction */
+ vsubps __sOneHalf(%rax), %ymm6, %ymm4
+
+/* Check for large and special arguments */
+ vandps __sAbsMask(%rax), %ymm2, %ymm3
+/* ymm1 lanes become all-ones where |x| is not <= the threshold
+ (unordered compare also catches NaN). */
+ vcmpnle_uqps __sRangeReductionVal(%rax), %ymm3, %ymm1
+
+/*
+ h) Subtract Y*PI from X argument, where PI divided to 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3
+ */
+ vmovaps %ymm2, %ymm3
+ vfnmadd231ps %ymm4, %ymm7, %ymm3
+ vfnmadd231ps __sPI2_FMA(%rax), %ymm4, %ymm3
+ vfnmadd132ps __sPI3_FMA(%rax), %ymm3, %ymm4
+
+/* a) Calculate X^2 = X * X */
+ vmulps %ymm4, %ymm4, %ymm5
+
+/*
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S );
+ */
+ vxorps %ymm0, %ymm4, %ymm6
+ vmovups __sA9_FMA(%rax), %ymm0
+
+/*
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))))
+ */
+ vfmadd213ps __sA7_FMA(%rax), %ymm5, %ymm0
+ vfmadd213ps __sA5_FMA(%rax), %ymm5, %ymm0
+ vfmadd213ps __sA3(%rax), %ymm5, %ymm0
+ vmulps %ymm5, %ymm0, %ymm0
+/* ecx = 8-bit mask of special lanes. */
+ vmovmskps %ymm1, %ecx
+ vfmadd213ps %ymm6, %ymm6, %ymm0
+ testl %ecx, %ecx
+ jne .LBL_1_3
+
+.LBL_1_2:
+ cfi_remember_state
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+
+.LBL_1_3:
+ cfi_restore_state
+/* Slow path: spill input and vector result, save call-clobbered
+ state, then recompute flagged lanes with scalar cosf. */
+ vmovups %ymm2, 320(%rsp)
+ vmovups %ymm0, 384(%rsp)
+/* NOTE(review): this je is never taken — the block is reached only
+ via the jne above and vmovups does not modify flags; it appears to
+ be dead compiler-generated code. */
+ je .LBL_1_2
+
+ xorb %dl, %dl
+ xorl %eax, %eax
+ vmovups %ymm8, 224(%rsp)
+ vmovups %ymm9, 192(%rsp)
+ vmovups %ymm10, 160(%rsp)
+ vmovups %ymm11, 128(%rsp)
+ vmovups %ymm12, 96(%rsp)
+ vmovups %ymm13, 64(%rsp)
+ vmovups %ymm14, 32(%rsp)
+ vmovups %ymm15, (%rsp)
+ movq %rsi, 264(%rsp)
+ movq %rdi, 256(%rsp)
+ movq %r12, 296(%rsp)
+ cfi_offset_rel_rsp (12, 296)
+ movb %dl, %r12b
+ movq %r13, 288(%rsp)
+ cfi_offset_rel_rsp (13, 288)
+ movl %ecx, %r13d
+ movq %r14, 280(%rsp)
+ cfi_offset_rel_rsp (14, 280)
+ movl %eax, %r14d
+ movq %r15, 272(%rsp)
+ cfi_offset_rel_rsp (15, 272)
+ cfi_remember_state
+
+/* Fixup loop: two lanes per pass; runs 16 iterations although only
+ mask bits 0..7 can be set (vmovmskps of a ymm yields 8 bits), so
+ the later iterations are no-ops. */
+.LBL_1_6:
+ btl %r14d, %r13d
+ jc .LBL_1_12
+
+.LBL_1_7:
+ lea 1(%r14), %esi
+ btl %esi, %r13d
+ jc .LBL_1_10
+
+.LBL_1_8:
+ incb %r12b
+ addl $2, %r14d
+ cmpb $16, %r12b
+ jb .LBL_1_6
+
+ vmovups 224(%rsp), %ymm8
+ vmovups 192(%rsp), %ymm9
+ vmovups 160(%rsp), %ymm10
+ vmovups 128(%rsp), %ymm11
+ vmovups 96(%rsp), %ymm12
+ vmovups 64(%rsp), %ymm13
+ vmovups 32(%rsp), %ymm14
+ vmovups (%rsp), %ymm15
+ vmovups 384(%rsp), %ymm0
+ movq 264(%rsp), %rsi
+ movq 256(%rsp), %rdi
+ movq 296(%rsp), %r12
+ cfi_restore (%r12)
+ movq 288(%rsp), %r13
+ cfi_restore (%r13)
+ movq 280(%rsp), %r14
+ cfi_restore (%r14)
+ movq 272(%rsp), %r15
+ cfi_restore (%r15)
+ jmp .LBL_1_2
+
+/* Odd lane of the pair (offset 324 = 320 + 4); vzeroupper avoids the
+ AVX/SSE transition penalty before calling scalar cosf. */
+.LBL_1_10:
+ cfi_restore_state
+ movzbl %r12b, %r15d
+ vmovss 324(%rsp,%r15,8), %xmm0
+ vzeroupper
+
+ call cosf@PLT
+
+ vmovss %xmm0, 388(%rsp,%r15,8)
+ jmp .LBL_1_8
+
+/* Even lane of the pair. */
+.LBL_1_12:
+ movzbl %r12b, %r15d
+ vmovss 320(%rsp,%r15,8), %xmm0
+ vzeroupper
+
+ call cosf@PLT
+
+ vmovss %xmm0, 384(%rsp,%r15,8)
+ jmp .LBL_1_7
+
+END (_ZGVdN8v_cosf_avx2)
diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86_64/fpu/svml_s_cosf16_core.S
similarity index 59%
copy from sysdeps/x86/fpu/bits/math-vector.h
copy to sysdeps/x86_64/fpu/svml_s_cosf16_core.S
index 27294ce..e623df5 100644
--- a/sysdeps/x86/fpu/bits/math-vector.h
+++ b/sysdeps/x86_64/fpu/svml_s_cosf16_core.S
@@ -1,4 +1,4 @@
-/* Platform-specific SIMD declarations of math functions.
+/* Function cosf vectorized with AVX-512. Wrapper to AVX2 version.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,19 +16,10 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#ifndef _MATH_H
-# error "Never include <bits/math-vector.h> directly;\
- include <math.h> instead."
-#endif
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
-/* Get default empty definitions for simd declarations. */
-#include <bits/libm-simd-decl-stubs.h>
-
-#if defined __x86_64__ && defined __FAST_MATH__
-# if defined _OPENMP && _OPENMP >= 201307
-/* OpenMP case. */
-# define __DECL_SIMD_x86_64 _Pragma ("omp declare simd notinbranch")
-# undef __DECL_SIMD_cos
-# define __DECL_SIMD_cos __DECL_SIMD_x86_64
-# endif
-#endif
+ .text
+/* Generic build (no multiarch): 16-lane cosf as two calls of the
+ 8-lane version via the AVX512 wrapper macro. */
+ENTRY (_ZGVeN16v_cosf)
+WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
+END (_ZGVeN16v_cosf)
diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86_64/fpu/svml_s_cosf4_core.S
similarity index 60%
copy from sysdeps/x86/fpu/bits/math-vector.h
copy to sysdeps/x86_64/fpu/svml_s_cosf4_core.S
index 27294ce..9875cd7 100644
--- a/sysdeps/x86/fpu/bits/math-vector.h
+++ b/sysdeps/x86_64/fpu/svml_s_cosf4_core.S
@@ -1,4 +1,4 @@
-/* Platform-specific SIMD declarations of math functions.
+/* Function cosf vectorized with SSE2, wrapper version.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,19 +16,14 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#ifndef _MATH_H
-# error "Never include <bits/math-vector.h> directly;\
- include <math.h> instead."
-#endif
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
-/* Get default empty definitions for simd declarations. */
-#include <bits/libm-simd-decl-stubs.h>
+ .text
+/* Generic build: 4-lane cosf as four scalar cosf calls via the SSE2
+ wrapper macro. */
+ENTRY (_ZGVbN4v_cosf)
+WRAPPER_IMPL_SSE2 cosf
+END (_ZGVbN4v_cosf)
-#if defined __x86_64__ && defined __FAST_MATH__
-# if defined _OPENMP && _OPENMP >= 201307
-/* OpenMP case. */
-# define __DECL_SIMD_x86_64 _Pragma ("omp declare simd notinbranch")
-# undef __DECL_SIMD_cos
-# define __DECL_SIMD_cos __DECL_SIMD_x86_64
-# endif
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVbN4v_cosf)
#endif
diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86_64/fpu/svml_s_cosf8_core.S
similarity index 60%
copy from sysdeps/x86/fpu/bits/math-vector.h
copy to sysdeps/x86_64/fpu/svml_s_cosf8_core.S
index 27294ce..376ee35 100644
--- a/sysdeps/x86/fpu/bits/math-vector.h
+++ b/sysdeps/x86_64/fpu/svml_s_cosf8_core.S
@@ -1,4 +1,4 @@
-/* Platform-specific SIMD declarations of math functions.
+/* Function cosf vectorized with AVX2, wrapper version.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,19 +16,14 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#ifndef _MATH_H
-# error "Never include <bits/math-vector.h> directly;\
- include <math.h> instead."
-#endif
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
-/* Get default empty definitions for simd declarations. */
-#include <bits/libm-simd-decl-stubs.h>
+ .text
+/* Generic build: 8-lane cosf as two calls of the 4-lane version via
+ the AVX wrapper macro. */
+ENTRY (_ZGVdN8v_cosf)
+WRAPPER_IMPL_AVX _ZGVbN4v_cosf
+END (_ZGVdN8v_cosf)
-#if defined __x86_64__ && defined __FAST_MATH__
-# if defined _OPENMP && _OPENMP >= 201307
-/* OpenMP case. */
-# define __DECL_SIMD_x86_64 _Pragma ("omp declare simd notinbranch")
-# undef __DECL_SIMD_cos
-# define __DECL_SIMD_cos __DECL_SIMD_x86_64
-# endif
+#ifndef USE_MULTIARCH
+ libmvec_hidden_def (_ZGVdN8v_cosf)
#endif
diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S
similarity index 59%
copy from sysdeps/x86/fpu/bits/math-vector.h
copy to sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S
index 27294ce..a443fd2 100644
--- a/sysdeps/x86/fpu/bits/math-vector.h
+++ b/sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S
@@ -1,4 +1,4 @@
-/* Platform-specific SIMD declarations of math functions.
+/* Function cosf vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,19 +16,10 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#ifndef _MATH_H
-# error "Never include <bits/math-vector.h> directly;\
- include <math.h> instead."
-#endif
+#include <sysdep.h>
+#include "svml_s_wrapper_impl.h"
-/* Get default empty definitions for simd declarations. */
-#include <bits/libm-simd-decl-stubs.h>
-
-#if defined __x86_64__ && defined __FAST_MATH__
-# if defined _OPENMP && _OPENMP >= 201307
-/* OpenMP case. */
-# define __DECL_SIMD_x86_64 _Pragma ("omp declare simd notinbranch")
-# undef __DECL_SIMD_cos
-# define __DECL_SIMD_cos __DECL_SIMD_x86_64
-# endif
-#endif
+ .text
+ENTRY (_ZGVcN8v_cosf)
+WRAPPER_IMPL_AVX _ZGVbN4v_cosf
+END (_ZGVcN8v_cosf)
diff --git a/sysdeps/x86_64/fpu/svml_s_cosf_data.S b/sysdeps/x86_64/fpu/svml_s_cosf_data.S
new file mode 100644
index 0000000..2f7303c
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_cosf_data.S
@@ -0,0 +1,1130 @@
+/* Data for function cosf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "svml_s_cosf_data.h"
+
+ .section .rodata, "a"
+ .align 64
+
+/* Data table for vector implementations of function cosf.
+ The table may contain polynomial, reduction, lookup coefficients
+ and other macro_names obtained through different methods
+ of research and experimental work. */
+
+ .globl __svml_scos_data
+__svml_scos_data:
+
+/* Lookup table for high accuracy version (CHL,SHi,SLo,Sigma). */
+.if .-__svml_scos_data != __dT
+.err
+.endif
+ .long 0x00000000
+ .long 0x3f800000
+ .long 0x00000000
+ .long 0x00000000
+ .long 0x3bdbd541
+ .long 0x3f7fec43
+ .long 0x3084cd0d
+ .long 0xbd000000
+ .long 0x3c5c1342
+ .long 0x3f7fb10f
+ .long 0x31de5b5f
+ .long 0xbd800000
+ .long 0xbc354825
+ .long 0x3f7f4e6d
+ .long 0x32d01884
+ .long 0xbd800000
+ .long 0x3cdd0b28
+ .long 0x3f7ec46d
+ .long 0x31f44949
+ .long 0xbe000000
+ .long 0x3b29b1a9
+ .long 0x3f7e1324
+ .long 0xb2f1e603
+ .long 0xbe000000
+ .long 0xbcb2041c
+ .long 0x3f7d3aac
+ .long 0xb0f75ae9
+ .long 0xbe000000
+ .long 0xbd3c4289
+ .long 0x3f7c3b28
+ .long 0xb231d68b
+ .long 0xbe000000
+ .long 0x3d60e8f8
+ .long 0x3f7b14be
+ .long 0x32ff75cb
+ .long 0xbe800000
+ .long 0x3cfd1f65
+ .long 0x3f79c79d
+ .long 0x32c64e59
+ .long 0xbe800000
+ .long 0x3be60685
+ .long 0x3f7853f8
+ .long 0xb20db9e5
+ .long 0xbe800000
+ .long 0xbc88e931
+ .long 0x3f76ba07
+ .long 0x326d092c
+ .long 0xbe800000
+ .long 0xbd25018c
+ .long 0x3f74fa0b
+ .long 0xb2939d22
+ .long 0xbe800000
+ .long 0xbd826b93
+ .long 0x3f731447
+ .long 0x32c48e11
+ .long 0xbe800000
+ .long 0xbdb1f34f
+ .long 0x3f710908
+ .long 0x321ed0dd
+ .long 0xbe800000
+ .long 0x3e0f77ad
+ .long 0x3f6ed89e
+ .long 0xb29333dc
+ .long 0xbf000000
+ .long 0x3df043ab
+ .long 0x3f6c835e
+ .long 0x32f328d4
+ .long 0xbf000000
+ .long 0x3dc210d8
+ .long 0x3f6a09a7
+ .long 0xb2eb236c
+ .long 0xbf000000
+ .long 0x3d945dff
+ .long 0x3f676bd8
+ .long 0xb2bc3389
+ .long 0xbf000000
+ .long 0x3d4e645a
+ .long 0x3f64aa59
+ .long 0x311a08fa
+ .long 0xbf000000
+ .long 0x3cea5164
+ .long 0x3f61c598
+ .long 0xb2e7f425
+ .long 0xbf000000
+ .long 0x3be8b648
+ .long 0x3f5ebe05
+ .long 0x32c6f953
+ .long 0xbf000000
+ .long 0xbc670f32
+ .long 0x3f5b941a
+ .long 0x32232dc8
+ .long 0xbf000000
+ .long 0xbd0f59aa
+ .long 0x3f584853
+ .long 0xb27d5fc0
+ .long 0xbf000000
+ .long 0xbd639d9d
+ .long 0x3f54db31
+ .long 0x3290ea1a
+ .long 0xbf000000
+ .long 0xbd9b4153
+ .long 0x3f514d3d
+ .long 0x300c4f04
+ .long 0xbf000000
+ .long 0xbdc3fdff
+ .long 0x3f4d9f02
+ .long 0x327e70e8
+ .long 0xbf000000
+ .long 0xbdebfe8a
+ .long 0x3f49d112
+ .long 0x32992640
+ .long 0xbf000000
+ .long 0xbe099e65
+ .long 0x3f45e403
+ .long 0x32b15174
+ .long 0xbf000000
+ .long 0xbe1cd957
+ .long 0x3f41d870
+ .long 0x32bff977
+ .long 0xbf000000
+ .long 0xbe2fad27
+ .long 0x3f3daef9
+ .long 0x319aabec
+ .long 0xbf000000
+ .long 0xbe4216eb
+ .long 0x3f396842
+ .long 0xb2810007
+ .long 0xbf000000
+ .long 0x3e95f61a
+ .long 0x3f3504f3
+ .long 0x324fe77a
+ .long 0xbf800000
+ .long 0x3e8d2f7d
+ .long 0x3f3085bb
+ .long 0xb2ae2d32
+ .long 0xbf800000
+ .long 0x3e84a20e
+ .long 0x3f2beb4a
+ .long 0xb2b73136
+ .long 0xbf800000
+ .long 0x3e789e3f
+ .long 0x3f273656
+ .long 0xb2038343
+ .long 0xbf800000
+ .long 0x3e686ff3
+ .long 0x3f226799
+ .long 0x322123bb
+ .long 0xbf800000
+ .long 0x3e58bbb7
+ .long 0x3f1d7fd1
+ .long 0x3292050c
+ .long 0xbf800000
+ .long 0x3e4983f7
+ .long 0x3f187fc0
+ .long 0xb1c7a3f3
+ .long 0xbf800000
+ .long 0x3e3acb0c
+ .long 0x3f13682a
+ .long 0x32cdd12e
+ .long 0xbf800000
+ .long 0x3e2c933b
+ .long 0x3f0e39da
+ .long 0xb24a32e7
+ .long 0xbf800000
+ .long 0x3e1edeb5
+ .long 0x3f08f59b
+ .long 0xb2be4b4e
+ .long 0xbf800000
+ .long 0x3e11af97
+ .long 0x3f039c3d
+ .long 0xb25ba002
+ .long 0xbf800000
+ .long 0x3e0507ea
+ .long 0x3efc5d27
+ .long 0xb180eca9
+ .long 0xbf800000
+ .long 0x3df1d344
+ .long 0x3ef15aea
+ .long 0xb1ff2139
+ .long 0xbf800000
+ .long 0x3ddaad38
+ .long 0x3ee63375
+ .long 0xb1d9c774
+ .long 0xbf800000
+ .long 0x3dc4a143
+ .long 0x3edae880
+ .long 0x321e15cc
+ .long 0xbf800000
+ .long 0x3dafb2cc
+ .long 0x3ecf7bca
+ .long 0x316a3b63
+ .long 0xbf800000
+ .long 0x3d9be50c
+ .long 0x3ec3ef15
+ .long 0x31d5d52c
+ .long 0xbf800000
+ .long 0x3d893b12
+ .long 0x3eb8442a
+ .long 0xb2705ba6
+ .long 0xbf800000
+ .long 0x3d6f6f7e
+ .long 0x3eac7cd4
+ .long 0xb2254e02
+ .long 0xbf800000
+ .long 0x3d4ebb8a
+ .long 0x3ea09ae5
+ .long 0xb23e89a0
+ .long 0xbf800000
+ .long 0x3d305f55
+ .long 0x3e94a031
+ .long 0x326d59f0
+ .long 0xbf800000
+ .long 0x3d145f8c
+ .long 0x3e888e93
+ .long 0x312c7d9e
+ .long 0xbf800000
+ .long 0x3cf58104
+ .long 0x3e78cfcc
+ .long 0xb11bd41d
+ .long 0xbf800000
+ .long 0x3cc70c54
+ .long 0x3e605c13
+ .long 0x31a7e4f6
+ .long 0xbf800000
+ .long 0x3c9d6830
+ .long 0x3e47c5c2
+ .long 0xb0e5967d
+ .long 0xbf800000
+ .long 0x3c71360b
+ .long 0x3e2f10a2
+ .long 0x311167f9
+ .long 0xbf800000
+ .long 0x3c315502
+ .long 0x3e164083
+ .long 0x31e8e614
+ .long 0xbf800000
+ .long 0x3bf66e3c
+ .long 0x3dfab273
+ .long 0xb11568cf
+ .long 0xbf800000
+ .long 0x3b9dc971
+ .long 0x3dc8bd36
+ .long 0xb07592f5
+ .long 0xbf800000
+ .long 0x3b319298
+ .long 0x3d96a905
+ .long 0xb1531e61
+ .long 0xbf800000
+ .long 0x3a9de1c8
+ .long 0x3d48fb30
+ .long 0xb0ef227f
+ .long 0xbf800000
+ .long 0x399de7df
+ .long 0x3cc90ab0
+ .long 0xb005c998
+ .long 0xbf800000
+ .long 0x00000000
+ .long 0x00000000
+ .long 0x00000000
+ .long 0xbf800000
+ .long 0x399de7df
+ .long 0xbcc90ab0
+ .long 0x3005c998
+ .long 0xbf800000
+ .long 0x3a9de1c8
+ .long 0xbd48fb30
+ .long 0x30ef227f
+ .long 0xbf800000
+ .long 0x3b319298
+ .long 0xbd96a905
+ .long 0x31531e61
+ .long 0xbf800000
+ .long 0x3b9dc971
+ .long 0xbdc8bd36
+ .long 0x307592f5
+ .long 0xbf800000
+ .long 0x3bf66e3c
+ .long 0xbdfab273
+ .long 0x311568cf
+ .long 0xbf800000
+ .long 0x3c315502
+ .long 0xbe164083
+ .long 0xb1e8e614
+ .long 0xbf800000
+ .long 0x3c71360b
+ .long 0xbe2f10a2
+ .long 0xb11167f9
+ .long 0xbf800000
+ .long 0x3c9d6830
+ .long 0xbe47c5c2
+ .long 0x30e5967d
+ .long 0xbf800000
+ .long 0x3cc70c54
+ .long 0xbe605c13
+ .long 0xb1a7e4f6
+ .long 0xbf800000
+ .long 0x3cf58104
+ .long 0xbe78cfcc
+ .long 0x311bd41d
+ .long 0xbf800000
+ .long 0x3d145f8c
+ .long 0xbe888e93
+ .long 0xb12c7d9e
+ .long 0xbf800000
+ .long 0x3d305f55
+ .long 0xbe94a031
+ .long 0xb26d59f0
+ .long 0xbf800000
+ .long 0x3d4ebb8a
+ .long 0xbea09ae5
+ .long 0x323e89a0
+ .long 0xbf800000
+ .long 0x3d6f6f7e
+ .long 0xbeac7cd4
+ .long 0x32254e02
+ .long 0xbf800000
+ .long 0x3d893b12
+ .long 0xbeb8442a
+ .long 0x32705ba6
+ .long 0xbf800000
+ .long 0x3d9be50c
+ .long 0xbec3ef15
+ .long 0xb1d5d52c
+ .long 0xbf800000
+ .long 0x3dafb2cc
+ .long 0xbecf7bca
+ .long 0xb16a3b63
+ .long 0xbf800000
+ .long 0x3dc4a143
+ .long 0xbedae880
+ .long 0xb21e15cc
+ .long 0xbf800000
+ .long 0x3ddaad38
+ .long 0xbee63375
+ .long 0x31d9c774
+ .long 0xbf800000
+ .long 0x3df1d344
+ .long 0xbef15aea
+ .long 0x31ff2139
+ .long 0xbf800000
+ .long 0x3e0507ea
+ .long 0xbefc5d27
+ .long 0x3180eca9
+ .long 0xbf800000
+ .long 0x3e11af97
+ .long 0xbf039c3d
+ .long 0x325ba002
+ .long 0xbf800000
+ .long 0x3e1edeb5
+ .long 0xbf08f59b
+ .long 0x32be4b4e
+ .long 0xbf800000
+ .long 0x3e2c933b
+ .long 0xbf0e39da
+ .long 0x324a32e7
+ .long 0xbf800000
+ .long 0x3e3acb0c
+ .long 0xbf13682a
+ .long 0xb2cdd12e
+ .long 0xbf800000
+ .long 0x3e4983f7
+ .long 0xbf187fc0
+ .long 0x31c7a3f3
+ .long 0xbf800000
+ .long 0x3e58bbb7
+ .long 0xbf1d7fd1
+ .long 0xb292050c
+ .long 0xbf800000
+ .long 0x3e686ff3
+ .long 0xbf226799
+ .long 0xb22123bb
+ .long 0xbf800000
+ .long 0x3e789e3f
+ .long 0xbf273656
+ .long 0x32038343
+ .long 0xbf800000
+ .long 0x3e84a20e
+ .long 0xbf2beb4a
+ .long 0x32b73136
+ .long 0xbf800000
+ .long 0x3e8d2f7d
+ .long 0xbf3085bb
+ .long 0x32ae2d32
+ .long 0xbf800000
+ .long 0x3e95f61a
+ .long 0xbf3504f3
+ .long 0xb24fe77a
+ .long 0xbf800000
+ .long 0xbe4216eb
+ .long 0xbf396842
+ .long 0x32810007
+ .long 0xbf000000
+ .long 0xbe2fad27
+ .long 0xbf3daef9
+ .long 0xb19aabec
+ .long 0xbf000000
+ .long 0xbe1cd957
+ .long 0xbf41d870
+ .long 0xb2bff977
+ .long 0xbf000000
+ .long 0xbe099e65
+ .long 0xbf45e403
+ .long 0xb2b15174
+ .long 0xbf000000
+ .long 0xbdebfe8a
+ .long 0xbf49d112
+ .long 0xb2992640
+ .long 0xbf000000
+ .long 0xbdc3fdff
+ .long 0xbf4d9f02
+ .long 0xb27e70e8
+ .long 0xbf000000
+ .long 0xbd9b4153
+ .long 0xbf514d3d
+ .long 0xb00c4f04
+ .long 0xbf000000
+ .long 0xbd639d9d
+ .long 0xbf54db31
+ .long 0xb290ea1a
+ .long 0xbf000000
+ .long 0xbd0f59aa
+ .long 0xbf584853
+ .long 0x327d5fc0
+ .long 0xbf000000
+ .long 0xbc670f32
+ .long 0xbf5b941a
+ .long 0xb2232dc8
+ .long 0xbf000000
+ .long 0x3be8b648
+ .long 0xbf5ebe05
+ .long 0xb2c6f953
+ .long 0xbf000000
+ .long 0x3cea5164
+ .long 0xbf61c598
+ .long 0x32e7f425
+ .long 0xbf000000
+ .long 0x3d4e645a
+ .long 0xbf64aa59
+ .long 0xb11a08fa
+ .long 0xbf000000
+ .long 0x3d945dff
+ .long 0xbf676bd8
+ .long 0x32bc3389
+ .long 0xbf000000
+ .long 0x3dc210d8
+ .long 0xbf6a09a7
+ .long 0x32eb236c
+ .long 0xbf000000
+ .long 0x3df043ab
+ .long 0xbf6c835e
+ .long 0xb2f328d4
+ .long 0xbf000000
+ .long 0x3e0f77ad
+ .long 0xbf6ed89e
+ .long 0x329333dc
+ .long 0xbf000000
+ .long 0xbdb1f34f
+ .long 0xbf710908
+ .long 0xb21ed0dd
+ .long 0xbe800000
+ .long 0xbd826b93
+ .long 0xbf731447
+ .long 0xb2c48e11
+ .long 0xbe800000
+ .long 0xbd25018c
+ .long 0xbf74fa0b
+ .long 0x32939d22
+ .long 0xbe800000
+ .long 0xbc88e931
+ .long 0xbf76ba07
+ .long 0xb26d092c
+ .long 0xbe800000
+ .long 0x3be60685
+ .long 0xbf7853f8
+ .long 0x320db9e5
+ .long 0xbe800000
+ .long 0x3cfd1f65
+ .long 0xbf79c79d
+ .long 0xb2c64e59
+ .long 0xbe800000
+ .long 0x3d60e8f8
+ .long 0xbf7b14be
+ .long 0xb2ff75cb
+ .long 0xbe800000
+ .long 0xbd3c4289
+ .long 0xbf7c3b28
+ .long 0x3231d68b
+ .long 0xbe000000
+ .long 0xbcb2041c
+ .long 0xbf7d3aac
+ .long 0x30f75ae9
+ .long 0xbe000000
+ .long 0x3b29b1a9
+ .long 0xbf7e1324
+ .long 0x32f1e603
+ .long 0xbe000000
+ .long 0x3cdd0b28
+ .long 0xbf7ec46d
+ .long 0xb1f44949
+ .long 0xbe000000
+ .long 0xbc354825
+ .long 0xbf7f4e6d
+ .long 0xb2d01884
+ .long 0xbd800000
+ .long 0x3c5c1342
+ .long 0xbf7fb10f
+ .long 0xb1de5b5f
+ .long 0xbd800000
+ .long 0x3bdbd541
+ .long 0xbf7fec43
+ .long 0xb084cd0d
+ .long 0xbd000000
+ .long 0x00000000
+ .long 0xbf800000
+ .long 0x00000000
+ .long 0x00000000
+ .long 0xbbdbd541
+ .long 0xbf7fec43
+ .long 0xb084cd0d
+ .long 0x3d000000
+ .long 0xbc5c1342
+ .long 0xbf7fb10f
+ .long 0xb1de5b5f
+ .long 0x3d800000
+ .long 0x3c354825
+ .long 0xbf7f4e6d
+ .long 0xb2d01884
+ .long 0x3d800000
+ .long 0xbcdd0b28
+ .long 0xbf7ec46d
+ .long 0xb1f44949
+ .long 0x3e000000
+ .long 0xbb29b1a9
+ .long 0xbf7e1324
+ .long 0x32f1e603
+ .long 0x3e000000
+ .long 0x3cb2041c
+ .long 0xbf7d3aac
+ .long 0x30f75ae9
+ .long 0x3e000000
+ .long 0x3d3c4289
+ .long 0xbf7c3b28
+ .long 0x3231d68b
+ .long 0x3e000000
+ .long 0xbd60e8f8
+ .long 0xbf7b14be
+ .long 0xb2ff75cb
+ .long 0x3e800000
+ .long 0xbcfd1f65
+ .long 0xbf79c79d
+ .long 0xb2c64e59
+ .long 0x3e800000
+ .long 0xbbe60685
+ .long 0xbf7853f8
+ .long 0x320db9e5
+ .long 0x3e800000
+ .long 0x3c88e931
+ .long 0xbf76ba07
+ .long 0xb26d092c
+ .long 0x3e800000
+ .long 0x3d25018c
+ .long 0xbf74fa0b
+ .long 0x32939d22
+ .long 0x3e800000
+ .long 0x3d826b93
+ .long 0xbf731447
+ .long 0xb2c48e11
+ .long 0x3e800000
+ .long 0x3db1f34f
+ .long 0xbf710908
+ .long 0xb21ed0dd
+ .long 0x3e800000
+ .long 0xbe0f77ad
+ .long 0xbf6ed89e
+ .long 0x329333dc
+ .long 0x3f000000
+ .long 0xbdf043ab
+ .long 0xbf6c835e
+ .long 0xb2f328d4
+ .long 0x3f000000
+ .long 0xbdc210d8
+ .long 0xbf6a09a7
+ .long 0x32eb236c
+ .long 0x3f000000
+ .long 0xbd945dff
+ .long 0xbf676bd8
+ .long 0x32bc3389
+ .long 0x3f000000
+ .long 0xbd4e645a
+ .long 0xbf64aa59
+ .long 0xb11a08fa
+ .long 0x3f000000
+ .long 0xbcea5164
+ .long 0xbf61c598
+ .long 0x32e7f425
+ .long 0x3f000000
+ .long 0xbbe8b648
+ .long 0xbf5ebe05
+ .long 0xb2c6f953
+ .long 0x3f000000
+ .long 0x3c670f32
+ .long 0xbf5b941a
+ .long 0xb2232dc8
+ .long 0x3f000000
+ .long 0x3d0f59aa
+ .long 0xbf584853
+ .long 0x327d5fc0
+ .long 0x3f000000
+ .long 0x3d639d9d
+ .long 0xbf54db31
+ .long 0xb290ea1a
+ .long 0x3f000000
+ .long 0x3d9b4153
+ .long 0xbf514d3d
+ .long 0xb00c4f04
+ .long 0x3f000000
+ .long 0x3dc3fdff
+ .long 0xbf4d9f02
+ .long 0xb27e70e8
+ .long 0x3f000000
+ .long 0x3debfe8a
+ .long 0xbf49d112
+ .long 0xb2992640
+ .long 0x3f000000
+ .long 0x3e099e65
+ .long 0xbf45e403
+ .long 0xb2b15174
+ .long 0x3f000000
+ .long 0x3e1cd957
+ .long 0xbf41d870
+ .long 0xb2bff977
+ .long 0x3f000000
+ .long 0x3e2fad27
+ .long 0xbf3daef9
+ .long 0xb19aabec
+ .long 0x3f000000
+ .long 0x3e4216eb
+ .long 0xbf396842
+ .long 0x32810007
+ .long 0x3f000000
+ .long 0xbe95f61a
+ .long 0xbf3504f3
+ .long 0xb24fe77a
+ .long 0x3f800000
+ .long 0xbe8d2f7d
+ .long 0xbf3085bb
+ .long 0x32ae2d32
+ .long 0x3f800000
+ .long 0xbe84a20e
+ .long 0xbf2beb4a
+ .long 0x32b73136
+ .long 0x3f800000
+ .long 0xbe789e3f
+ .long 0xbf273656
+ .long 0x32038343
+ .long 0x3f800000
+ .long 0xbe686ff3
+ .long 0xbf226799
+ .long 0xb22123bb
+ .long 0x3f800000
+ .long 0xbe58bbb7
+ .long 0xbf1d7fd1
+ .long 0xb292050c
+ .long 0x3f800000
+ .long 0xbe4983f7
+ .long 0xbf187fc0
+ .long 0x31c7a3f3
+ .long 0x3f800000
+ .long 0xbe3acb0c
+ .long 0xbf13682a
+ .long 0xb2cdd12e
+ .long 0x3f800000
+ .long 0xbe2c933b
+ .long 0xbf0e39da
+ .long 0x324a32e7
+ .long 0x3f800000
+ .long 0xbe1edeb5
+ .long 0xbf08f59b
+ .long 0x32be4b4e
+ .long 0x3f800000
+ .long 0xbe11af97
+ .long 0xbf039c3d
+ .long 0x325ba002
+ .long 0x3f800000
+ .long 0xbe0507ea
+ .long 0xbefc5d27
+ .long 0x3180eca9
+ .long 0x3f800000
+ .long 0xbdf1d344
+ .long 0xbef15aea
+ .long 0x31ff2139
+ .long 0x3f800000
+ .long 0xbddaad38
+ .long 0xbee63375
+ .long 0x31d9c774
+ .long 0x3f800000
+ .long 0xbdc4a143
+ .long 0xbedae880
+ .long 0xb21e15cc
+ .long 0x3f800000
+ .long 0xbdafb2cc
+ .long 0xbecf7bca
+ .long 0xb16a3b63
+ .long 0x3f800000
+ .long 0xbd9be50c
+ .long 0xbec3ef15
+ .long 0xb1d5d52c
+ .long 0x3f800000
+ .long 0xbd893b12
+ .long 0xbeb8442a
+ .long 0x32705ba6
+ .long 0x3f800000
+ .long 0xbd6f6f7e
+ .long 0xbeac7cd4
+ .long 0x32254e02
+ .long 0x3f800000
+ .long 0xbd4ebb8a
+ .long 0xbea09ae5
+ .long 0x323e89a0
+ .long 0x3f800000
+ .long 0xbd305f55
+ .long 0xbe94a031
+ .long 0xb26d59f0
+ .long 0x3f800000
+ .long 0xbd145f8c
+ .long 0xbe888e93
+ .long 0xb12c7d9e
+ .long 0x3f800000
+ .long 0xbcf58104
+ .long 0xbe78cfcc
+ .long 0x311bd41d
+ .long 0x3f800000
+ .long 0xbcc70c54
+ .long 0xbe605c13
+ .long 0xb1a7e4f6
+ .long 0x3f800000
+ .long 0xbc9d6830
+ .long 0xbe47c5c2
+ .long 0x30e5967d
+ .long 0x3f800000
+ .long 0xbc71360b
+ .long 0xbe2f10a2
+ .long 0xb11167f9
+ .long 0x3f800000
+ .long 0xbc315502
+ .long 0xbe164083
+ .long 0xb1e8e614
+ .long 0x3f800000
+ .long 0xbbf66e3c
+ .long 0xbdfab273
+ .long 0x311568cf
+ .long 0x3f800000
+ .long 0xbb9dc971
+ .long 0xbdc8bd36
+ .long 0x307592f5
+ .long 0x3f800000
+ .long 0xbb319298
+ .long 0xbd96a905
+ .long 0x31531e61
+ .long 0x3f800000
+ .long 0xba9de1c8
+ .long 0xbd48fb30
+ .long 0x30ef227f
+ .long 0x3f800000
+ .long 0xb99de7df
+ .long 0xbcc90ab0
+ .long 0x3005c998
+ .long 0x3f800000
+ .long 0x00000000
+ .long 0x00000000
+ .long 0x00000000
+ .long 0x3f800000
+ .long 0xb99de7df
+ .long 0x3cc90ab0
+ .long 0xb005c998
+ .long 0x3f800000
+ .long 0xba9de1c8
+ .long 0x3d48fb30
+ .long 0xb0ef227f
+ .long 0x3f800000
+ .long 0xbb319298
+ .long 0x3d96a905
+ .long 0xb1531e61
+ .long 0x3f800000
+ .long 0xbb9dc971
+ .long 0x3dc8bd36
+ .long 0xb07592f5
+ .long 0x3f800000
+ .long 0xbbf66e3c
+ .long 0x3dfab273
+ .long 0xb11568cf
+ .long 0x3f800000
+ .long 0xbc315502
+ .long 0x3e164083
+ .long 0x31e8e614
+ .long 0x3f800000
+ .long 0xbc71360b
+ .long 0x3e2f10a2
+ .long 0x311167f9
+ .long 0x3f800000
+ .long 0xbc9d6830
+ .long 0x3e47c5c2
+ .long 0xb0e5967d
+ .long 0x3f800000
+ .long 0xbcc70c54
+ .long 0x3e605c13
+ .long 0x31a7e4f6
+ .long 0x3f800000
+ .long 0xbcf58104
+ .long 0x3e78cfcc
+ .long 0xb11bd41d
+ .long 0x3f800000
+ .long 0xbd145f8c
+ .long 0x3e888e93
+ .long 0x312c7d9e
+ .long 0x3f800000
+ .long 0xbd305f55
+ .long 0x3e94a031
+ .long 0x326d59f0
+ .long 0x3f800000
+ .long 0xbd4ebb8a
+ .long 0x3ea09ae5
+ .long 0xb23e89a0
+ .long 0x3f800000
+ .long 0xbd6f6f7e
+ .long 0x3eac7cd4
+ .long 0xb2254e02
+ .long 0x3f800000
+ .long 0xbd893b12
+ .long 0x3eb8442a
+ .long 0xb2705ba6
+ .long 0x3f800000
+ .long 0xbd9be50c
+ .long 0x3ec3ef15
+ .long 0x31d5d52c
+ .long 0x3f800000
+ .long 0xbdafb2cc
+ .long 0x3ecf7bca
+ .long 0x316a3b63
+ .long 0x3f800000
+ .long 0xbdc4a143
+ .long 0x3edae880
+ .long 0x321e15cc
+ .long 0x3f800000
+ .long 0xbddaad38
+ .long 0x3ee63375
+ .long 0xb1d9c774
+ .long 0x3f800000
+ .long 0xbdf1d344
+ .long 0x3ef15aea
+ .long 0xb1ff2139
+ .long 0x3f800000
+ .long 0xbe0507ea
+ .long 0x3efc5d27
+ .long 0xb180eca9
+ .long 0x3f800000
+ .long 0xbe11af97
+ .long 0x3f039c3d
+ .long 0xb25ba002
+ .long 0x3f800000
+ .long 0xbe1edeb5
+ .long 0x3f08f59b
+ .long 0xb2be4b4e
+ .long 0x3f800000
+ .long 0xbe2c933b
+ .long 0x3f0e39da
+ .long 0xb24a32e7
+ .long 0x3f800000
+ .long 0xbe3acb0c
+ .long 0x3f13682a
+ .long 0x32cdd12e
+ .long 0x3f800000
+ .long 0xbe4983f7
+ .long 0x3f187fc0
+ .long 0xb1c7a3f3
+ .long 0x3f800000
+ .long 0xbe58bbb7
+ .long 0x3f1d7fd1
+ .long 0x3292050c
+ .long 0x3f800000
+ .long 0xbe686ff3
+ .long 0x3f226799
+ .long 0x322123bb
+ .long 0x3f800000
+ .long 0xbe789e3f
+ .long 0x3f273656
+ .long 0xb2038343
+ .long 0x3f800000
+ .long 0xbe84a20e
+ .long 0x3f2beb4a
+ .long 0xb2b73136
+ .long 0x3f800000
+ .long 0xbe8d2f7d
+ .long 0x3f3085bb
+ .long 0xb2ae2d32
+ .long 0x3f800000
+ .long 0xbe95f61a
+ .long 0x3f3504f3
+ .long 0x324fe77a
+ .long 0x3f800000
+ .long 0x3e4216eb
+ .long 0x3f396842
+ .long 0xb2810007
+ .long 0x3f000000
+ .long 0x3e2fad27
+ .long 0x3f3daef9
+ .long 0x319aabec
+ .long 0x3f000000
+ .long 0x3e1cd957
+ .long 0x3f41d870
+ .long 0x32bff977
+ .long 0x3f000000
+ .long 0x3e099e65
+ .long 0x3f45e403
+ .long 0x32b15174
+ .long 0x3f000000
+ .long 0x3debfe8a
+ .long 0x3f49d112
+ .long 0x32992640
+ .long 0x3f000000
+ .long 0x3dc3fdff
+ .long 0x3f4d9f02
+ .long 0x327e70e8
+ .long 0x3f000000
+ .long 0x3d9b4153
+ .long 0x3f514d3d
+ .long 0x300c4f04
+ .long 0x3f000000
+ .long 0x3d639d9d
+ .long 0x3f54db31
+ .long 0x3290ea1a
+ .long 0x3f000000
+ .long 0x3d0f59aa
+ .long 0x3f584853
+ .long 0xb27d5fc0
+ .long 0x3f000000
+ .long 0x3c670f32
+ .long 0x3f5b941a
+ .long 0x32232dc8
+ .long 0x3f000000
+ .long 0xbbe8b648
+ .long 0x3f5ebe05
+ .long 0x32c6f953
+ .long 0x3f000000
+ .long 0xbcea5164
+ .long 0x3f61c598
+ .long 0xb2e7f425
+ .long 0x3f000000
+ .long 0xbd4e645a
+ .long 0x3f64aa59
+ .long 0x311a08fa
+ .long 0x3f000000
+ .long 0xbd945dff
+ .long 0x3f676bd8
+ .long 0xb2bc3389
+ .long 0x3f000000
+ .long 0xbdc210d8
+ .long 0x3f6a09a7
+ .long 0xb2eb236c
+ .long 0x3f000000
+ .long 0xbdf043ab
+ .long 0x3f6c835e
+ .long 0x32f328d4
+ .long 0x3f000000
+ .long 0xbe0f77ad
+ .long 0x3f6ed89e
+ .long 0xb29333dc
+ .long 0x3f000000
+ .long 0x3db1f34f
+ .long 0x3f710908
+ .long 0x321ed0dd
+ .long 0x3e800000
+ .long 0x3d826b93
+ .long 0x3f731447
+ .long 0x32c48e11
+ .long 0x3e800000
+ .long 0x3d25018c
+ .long 0x3f74fa0b
+ .long 0xb2939d22
+ .long 0x3e800000
+ .long 0x3c88e931
+ .long 0x3f76ba07
+ .long 0x326d092c
+ .long 0x3e800000
+ .long 0xbbe60685
+ .long 0x3f7853f8
+ .long 0xb20db9e5
+ .long 0x3e800000
+ .long 0xbcfd1f65
+ .long 0x3f79c79d
+ .long 0x32c64e59
+ .long 0x3e800000
+ .long 0xbd60e8f8
+ .long 0x3f7b14be
+ .long 0x32ff75cb
+ .long 0x3e800000
+ .long 0x3d3c4289
+ .long 0x3f7c3b28
+ .long 0xb231d68b
+ .long 0x3e000000
+ .long 0x3cb2041c
+ .long 0x3f7d3aac
+ .long 0xb0f75ae9
+ .long 0x3e000000
+ .long 0xbb29b1a9
+ .long 0x3f7e1324
+ .long 0xb2f1e603
+ .long 0x3e000000
+ .long 0xbcdd0b28
+ .long 0x3f7ec46d
+ .long 0x31f44949
+ .long 0x3e000000
+ .long 0x3c354825
+ .long 0x3f7f4e6d
+ .long 0x32d01884
+ .long 0x3d800000
+ .long 0xbc5c1342
+ .long 0x3f7fb10f
+ .long 0x31de5b5f
+ .long 0x3d800000
+ .long 0xbbdbd541
+ .long 0x3f7fec43
+ .long 0x3084cd0d
+ .long 0x3d000000
+
+/* General purpose constants:
+ absolute value mask */
+float_vector __sAbsMask 0x7fffffff
+
+/* threshold for out-of-range values */
+float_vector __sRangeReductionVal 0x461c4000
+
+/* +INF */
+float_vector __sRangeVal 0x7f800000
+
+/* High Accuracy version polynomial coefficients:
+ S1 = -1.66666666664728165763e-01 */
+float_vector __sS1 0xbe2aaaab
+
+/* S2 = 8.33329173045453069014e-03 */
+float_vector __sS2 0x3c08885c
+
+/* C1 = -5.00000000000000000000e-01 */
+float_vector __sC1 0xbf000000
+
+/* C2 = 4.16638942914469202550e-02 */
+float_vector __sC2 0x3d2aaa7c
+
+/* Range reduction PI-based constants:
+ PI high part */
+float_vector __sPI1 0x40490000
+
+/* PI mid part 1 */
+float_vector __sPI2 0x3a7da000
+
+/* PI mid part 2 */
+float_vector __sPI3 0x34222000
+
+/* PI low part */
+float_vector __sPI4 0x2cb4611a
+
+/* PI1, PI2, and PI3 when FMA is available
+ PI high part (when FMA available) */
+float_vector __sPI1_FMA 0x40490fdb
+
+/* PI mid part (when FMA available) */
+float_vector __sPI2_FMA 0xb3bbbd2e
+
+/* PI low part (when FMA available) */
+float_vector __sPI3_FMA 0xa7772ced
+
+/* Polynomial constants for work w/o FMA, relative error ~ 2^(-26.625) */
+float_vector __sA3 0xbe2aaaa6
+float_vector __sA5 0x3c08876a
+float_vector __sA7 0xb94fb7ff
+float_vector __sA9 0x362edef8
+
+/* Polynomial constants, work with FMA, relative error ~ 2^(-26.417) */
+float_vector __sA5_FMA 0x3c088768
+float_vector __sA7_FMA 0xb94fb6cf
+float_vector __sA9_FMA 0x362ec335
+
+/* 1/PI */
+float_vector __sInvPI 0x3ea2f983
+
+/* right-shifter constant */
+float_vector __sRShifter 0x4b400000
+
+/* PI/2 */
+float_vector __sHalfPI 0x3fc90fdb
+
+/* 1/2 */
+float_vector __sOneHalf 0x3f000000
+ .type __svml_scos_data,@object
+ .size __svml_scos_data,.-__svml_scos_data
diff --git a/sysdeps/x86_64/fpu/svml_s_cosf_data.h b/sysdeps/x86_64/fpu/svml_s_cosf_data.h
new file mode 100644
index 0000000..1e25c5a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_cosf_data.h
@@ -0,0 +1,58 @@
+/* Offsets for data table for vectorized cosf.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef S_COSF_DATA_H
+#define S_COSF_DATA_H
+
+.macro float_vector offset value
+.if .-__svml_scos_data != \offset
+.err
+.endif
+.rept 16
+.long \value
+.endr
+.endm
+
+#define __dT 0
+#define __sAbsMask 4096
+#define __sRangeReductionVal 4160
+#define __sRangeVal 4224
+#define __sS1 4288
+#define __sS2 4352
+#define __sC1 4416
+#define __sC2 4480
+#define __sPI1 4544
+#define __sPI2 4608
+#define __sPI3 4672
+#define __sPI4 4736
+#define __sPI1_FMA 4800
+#define __sPI2_FMA 4864
+#define __sPI3_FMA 4928
+#define __sA3 4992
+#define __sA5 5056
+#define __sA7 5120
+#define __sA9 5184
+#define __sA5_FMA 5248
+#define __sA7_FMA 5312
+#define __sA9_FMA 5376
+#define __sInvPI 5440
+#define __sRShifter 5504
+#define __sHalfPI 5568
+#define __sOneHalf 5632
+
+#endif
diff --git a/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
new file mode 100644
index 0000000..d5b62ee
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
@@ -0,0 +1,111 @@
+/* Wrapper implementations of vector math functions.
+ Copyright (C) 2014-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* SSE2 ISA version as wrapper to scalar. */
+.macro WRAPPER_IMPL_SSE2 callee
+ subq $40, %rsp
+ cfi_adjust_cfa_offset(40)
+ movaps %xmm0, (%rsp)
+ call \callee@PLT
+ movss %xmm0, 16(%rsp)
+ movss 4(%rsp), %xmm0
+ call \callee@PLT
+ movss %xmm0, 20(%rsp)
+ movss 8(%rsp), %xmm0
+ call \callee@PLT
+ movss %xmm0, 24(%rsp)
+ movss 12(%rsp), %xmm0
+ call \callee@PLT
+ movss 16(%rsp), %xmm3
+ movss 20(%rsp), %xmm2
+ movss 24(%rsp), %xmm1
+ movss %xmm0, 28(%rsp)
+ unpcklps %xmm1, %xmm3
+ unpcklps %xmm0, %xmm2
+ unpcklps %xmm2, %xmm3
+ movaps %xmm3, %xmm0
+ addq $40, %rsp
+ cfi_adjust_cfa_offset(-40)
+ ret
+.endm
+
+/* AVX/AVX2 ISA version as wrapper to SSE ISA version. */
+.macro WRAPPER_IMPL_AVX callee
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-32, %rsp
+ subq $32, %rsp
+ vextractf128 $1, %ymm0, (%rsp)
+ vzeroupper
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovaps %xmm0, 16(%rsp)
+ vmovaps (%rsp), %xmm0
+ call HIDDEN_JUMPTARGET(\callee)
+ vmovaps %xmm0, %xmm1
+ vmovaps 16(%rsp), %xmm0
+ vinsertf128 $1, %xmm1, %ymm0, %ymm0
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+.endm
+
+/* AVX512 ISA version as wrapper to AVX2 ISA version. */
+.macro WRAPPER_IMPL_AVX512 callee
+ pushq %rbp
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (%rbp, 0)
+ movq %rsp, %rbp
+ cfi_def_cfa_register (%rbp)
+ andq $-64, %rsp
+ subq $64, %rsp
+/* Below is encoding for vmovaps %zmm0, (%rsp). */
+ .byte 0x62
+ .byte 0xf1
+ .byte 0x7c
+ .byte 0x48
+ .byte 0x29
+ .byte 0x04
+ .byte 0x24
+/* Below is encoding for vmovaps (%rsp), %ymm0. */
+ .byte 0xc5
+ .byte 0xfc
+ .byte 0x28
+ .byte 0x04
+ .byte 0x24
+ call HIDDEN_JUMPTARGET(\callee)
+/* Below is encoding for vmovaps 32(%rsp), %ymm0. */
+ .byte 0xc5
+ .byte 0xfc
+ .byte 0x28
+ .byte 0x44
+ .byte 0x24
+ .byte 0x20
+ call HIDDEN_JUMPTARGET(\callee)
+ movq %rbp, %rsp
+ cfi_def_cfa_register (%rsp)
+ popq %rbp
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (%rbp)
+ ret
+.endm
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 37 +
NEWS | 2 +-
math/Makefile | 9 +
math/test-double-vlen2.h | 4 +-
math/test-float-vlen16.h | 71 ++
math/test-float-vlen4.h | 71 ++
math/test-float-vlen8.h | 71 ++
sysdeps/unix/sysv/linux/x86_64/libmvec.abilist | 4 +
sysdeps/x86/fpu/bits/math-vector.h | 2 +
sysdeps/x86_64/fpu/Makefile | 17 +-
sysdeps/x86_64/fpu/Versions | 1 +
sysdeps/x86_64/fpu/libm-test-ulps | 8 +
sysdeps/x86_64/fpu/multiarch/Makefile | 3 +-
sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S | 39 +
.../fpu/multiarch/svml_s_cosf16_core_avx512.S | 460 ++++++++
sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S | 38 +
.../x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S | 227 ++++
sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S | 38 +
.../x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S | 215 ++++
sysdeps/x86_64/fpu/svml_s_cosf16_core.S | 25 +
sysdeps/x86_64/fpu/svml_s_cosf4_core.S | 29 +
sysdeps/x86_64/fpu/svml_s_cosf8_core.S | 29 +
sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S | 25 +
sysdeps/x86_64/fpu/svml_s_cosf_data.S | 1130 ++++++++++++++++++++
sysdeps/x86_64/fpu/svml_s_cosf_data.h | 58 +
sysdeps/x86_64/fpu/svml_s_wrapper_impl.h | 111 ++
sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c | 25 +
sysdeps/x86_64/fpu/test-float-vlen16.c | 25 +
sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c | 25 +
sysdeps/x86_64/fpu/test-float-vlen4.c | 23 +
.../x86_64/fpu/test-float-vlen8-avx2-wrappers.c | 28 +
sysdeps/x86_64/fpu/test-float-vlen8-avx2.c | 28 +
sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c | 25 +
sysdeps/x86_64/fpu/test-float-vlen8.c | 23 +
34 files changed, 2919 insertions(+), 7 deletions(-)
create mode 100644 math/test-float-vlen16.h
create mode 100644 math/test-float-vlen4.h
create mode 100644 math/test-float-vlen8.h
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S
create mode 100644 sysdeps/x86_64/fpu/svml_s_cosf16_core.S
create mode 100644 sysdeps/x86_64/fpu/svml_s_cosf4_core.S
create mode 100644 sysdeps/x86_64/fpu/svml_s_cosf8_core.S
create mode 100644 sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S
create mode 100644 sysdeps/x86_64/fpu/svml_s_cosf_data.S
create mode 100644 sysdeps/x86_64/fpu/svml_s_cosf_data.h
create mode 100644 sysdeps/x86_64/fpu/svml_s_wrapper_impl.h
create mode 100644 sysdeps/x86_64/fpu/test-float-vlen16-wrappers.c
create mode 100644 sysdeps/x86_64/fpu/test-float-vlen16.c
create mode 100644 sysdeps/x86_64/fpu/test-float-vlen4-wrappers.c
create mode 100644 sysdeps/x86_64/fpu/test-float-vlen4.c
create mode 100644 sysdeps/x86_64/fpu/test-float-vlen8-avx2-wrappers.c
create mode 100644 sysdeps/x86_64/fpu/test-float-vlen8-avx2.c
create mode 100644 sysdeps/x86_64/fpu/test-float-vlen8-wrappers.c
create mode 100644 sysdeps/x86_64/fpu/test-float-vlen8.c
hooks/post-receive
--
GNU C Library master sources