From 2193311288b97cf11dfabf1be22eac89b4ff7366 Mon Sep 17 00:00:00 2001 From: Andrew Senkevich Date: Tue, 9 Jun 2015 14:25:49 +0300 Subject: [PATCH] Start of series of patches with x86_64 vector math functions. Here is implementation of cos containing SSE, AVX, AVX2 and AVX512 versions according to Vector ABI which had been discussed in . Vector math library build and ABI testing enabled by default for x86_64. * sysdeps/x86_64/fpu/Makefile: New file. * sysdeps/x86_64/fpu/Versions: New file. * sysdeps/x86_64/fpu/svml_d_cos_data.S: New file. * sysdeps/x86_64/fpu/svml_d_cos_data.h: New file. * sysdeps/x86_64/fpu/svml_d_cos2_core.S: New file. * sysdeps/x86_64/fpu/svml_d_cos4_core.S: New file. * sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S: New file. * sysdeps/x86_64/fpu/svml_d_cos8_core.S: New file. * sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: New file. * sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S: New file. * sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S: New file. * sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S: New file. * sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S: New file. * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: New file. * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S: New file. * sysdeps/x86_64/fpu/multiarch/Makefile (libmvec-sysdep_routines): Added build of SSE, AVX2 and AVX512 IFUNC versions. * sysdeps/x86/fpu/bits/math-vector.h: Added SIMD declaration for cos. * math/bits/mathcalls.h: Added cos declaration with __MATHCALL_VEC. * sysdeps/x86_64/configure.ac: Options for libmvec build. * sysdeps/x86_64/configure: Regenerated. * sysdeps/x86_64/sysdep.h (cfi_offset_rel_rsp): New macro. * sysdeps/unix/sysv/linux/x86_64/libmvec.abilist: New file. * manual/install.texi (Configuring and compiling): Document --disable-mathvec. * INSTALL: Regenerated. * NEWS: Mention addition of libmvec and x86_64 vector cos. 
--- ChangeLog | 30 ++ INSTALL | 4 + NEWS | 8 + manual/install.texi | 4 + math/bits/mathcalls.h | 2 +- .../unix/sysv/linux/x86_64/libmvec.abilist | 6 + sysdeps/x86/fpu/bits/math-vector.h | 34 ++ sysdeps/x86_64/configure | 4 + sysdeps/x86_64/configure.ac | 4 + sysdeps/x86_64/fpu/Makefile | 5 + sysdeps/x86_64/fpu/Versions | 5 + sysdeps/x86_64/fpu/multiarch/Makefile | 5 + .../x86_64/fpu/multiarch/svml_d_cos2_core.S | 38 ++ .../fpu/multiarch/svml_d_cos2_core_sse4.S | 223 +++++++++ .../x86_64/fpu/multiarch/svml_d_cos4_core.S | 38 ++ .../fpu/multiarch/svml_d_cos4_core_avx2.S | 207 ++++++++ .../x86_64/fpu/multiarch/svml_d_cos8_core.S | 39 ++ .../fpu/multiarch/svml_d_cos8_core_avx512.S | 463 ++++++++++++++++++ sysdeps/x86_64/fpu/svml_d_cos2_core.S | 30 ++ sysdeps/x86_64/fpu/svml_d_cos4_core.S | 30 ++ sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S | 25 + sysdeps/x86_64/fpu/svml_d_cos8_core.S | 26 + sysdeps/x86_64/fpu/svml_d_cos_data.S | 114 +++++ sysdeps/x86_64/fpu/svml_d_cos_data.h | 48 ++ sysdeps/x86_64/fpu/svml_d_wrapper_impl.h | 101 ++++ sysdeps/x86_64/sysdep.h | 7 + 26 files changed, 1499 insertions(+), 1 deletion(-) create mode 100644 sysdeps/unix/sysv/linux/x86_64/libmvec.abilist create mode 100644 sysdeps/x86/fpu/bits/math-vector.h create mode 100644 sysdeps/x86_64/fpu/Makefile create mode 100644 sysdeps/x86_64/fpu/Versions create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S create mode 100644 sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S create mode 100644 sysdeps/x86_64/fpu/svml_d_cos2_core.S create mode 100644 sysdeps/x86_64/fpu/svml_d_cos4_core.S create mode 100644 sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S create mode 100644 sysdeps/x86_64/fpu/svml_d_cos8_core.S create 
mode 100644 sysdeps/x86_64/fpu/svml_d_cos_data.S create mode 100644 sysdeps/x86_64/fpu/svml_d_cos_data.h create mode 100644 sysdeps/x86_64/fpu/svml_d_wrapper_impl.h diff --git a/ChangeLog b/ChangeLog index 63dc9982b7..0877e05289 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,33 @@ +2015-06-09 Andrew Senkevich + + * sysdeps/x86_64/fpu/Makefile: New file. + * sysdeps/x86_64/fpu/Versions: New file. + * sysdeps/x86_64/fpu/svml_d_cos_data.S: New file. + * sysdeps/x86_64/fpu/svml_d_cos_data.h: New file. + * sysdeps/x86_64/fpu/svml_d_cos2_core.S: New file. + * sysdeps/x86_64/fpu/svml_d_cos4_core.S: New file. + * sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S: New file. + * sysdeps/x86_64/fpu/svml_d_cos8_core.S: New file. + * sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: New file. + * sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S: New file. + * sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S: New file. + * sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S: New file. + * sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S: New file. + * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: New file. + * sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S: New file. + * sysdeps/x86_64/fpu/multiarch/Makefile (libmvec-sysdep_routines): Added + build of SSE, AVX2 and AVX512 IFUNC versions. + * sysdeps/x86/fpu/bits/math-vector.h: Added SIMD declaration for cos. + * math/bits/mathcalls.h: Added cos declaration with __MATHCALL_VEC. + * sysdeps/x86_64/configure.ac: Options for libmvec build. + * sysdeps/x86_64/configure: Regenerated. + * sysdeps/x86_64/sysdep.h (cfi_offset_rel_rsp): New macro. + * sysdeps/unix/sysv/linux/x86_64/libmvec.abilist: New file. + * manual/install.texi (Configuring and compiling): Document + --disable-mathvec. + * INSTALL: Regenerated. + * NEWS: Mention addition of libmvec and x86_64 vector cos. + 2015-06-09 Marko Myllynen * locale/C-ctype.c (PREDEFINED_CLASSES): Remove. 
diff --git a/INSTALL b/INSTALL index 18a47f618e..8e13f2cef7 100644 --- a/INSTALL +++ b/INSTALL @@ -149,6 +149,10 @@ will be used, and CFLAGS sets optimization options for the compiler. with, so new warnings cause the build with '-Werror' to fail), you can configure with '--disable-werror'. +'--disable-mathvec' + By default for x86_64, the GNU C Library is built with the vector math + library. Use this option to disable the vector math library. + '--build=BUILD-SYSTEM' '--host=HOST-SYSTEM' These options are for cross-compiling. If you specify both options diff --git a/NEWS b/NEWS index 881e61c4f2..5e223a1e39 100644 --- a/NEWS +++ b/NEWS @@ -50,6 +50,14 @@ Version 2.22 * CVE-2014-8121 The NSS backends shared internal state between the getXXent and getXXbyYY NSS calls for the same database, causing a denial-of-service condition in some applications. + +* Added vector math library named libmvec with the following vectorized x86_64 + implementations: cos. + The library can be disabled with --disable-mathvec. Use of the functions is + enabled with -fopenmp -ffast-math starting from -O1 for GCC version >= 4.9.0. + The library is linked in as needed when using -lm (no need to specify -lmvec + explicitly). + Visit <https://sourceware.org/glibc/wiki/libmvec> for detailed information. Version 2.21 diff --git a/manual/install.texi b/manual/install.texi index bb09199fa6..42ee467463 100644 --- a/manual/install.texi +++ b/manual/install.texi @@ -181,6 +181,10 @@ version of GCC than this version of @theglibc{} was tested with, so new warnings cause the build with @option{-Werror} to fail), you can configure with @option{--disable-werror}. +@item --disable-mathvec +By default for x86_64, @theglibc{} is built with the vector math library. +Use this option to disable the vector math library. + @item --build=@var{build-system} @itemx --host=@var{host-system} These options are for cross-compiling. 
If you specify both options and diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h index e8e5577038..85a6a95a7c 100644 --- a/math/bits/mathcalls.h +++ b/math/bits/mathcalls.h @@ -60,7 +60,7 @@ __MATHCALL (atan,, (_Mdouble_ __x)); __MATHCALL (atan2,, (_Mdouble_ __y, _Mdouble_ __x)); /* Cosine of X. */ -__MATHCALL (cos,, (_Mdouble_ __x)); +__MATHCALL_VEC (cos,, (_Mdouble_ __x)); /* Sine of X. */ __MATHCALL (sin,, (_Mdouble_ __x)); /* Tangent of X. */ diff --git a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist new file mode 100644 index 0000000000..be6eaedafd --- /dev/null +++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist @@ -0,0 +1,6 @@ +GLIBC_2.22 + GLIBC_2.22 A + _ZGVbN2v_cos F + _ZGVcN4v_cos F + _ZGVdN4v_cos F + _ZGVeN8v_cos F diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86/fpu/bits/math-vector.h new file mode 100644 index 0000000000..27294ce9fa --- /dev/null +++ b/sysdeps/x86/fpu/bits/math-vector.h @@ -0,0 +1,34 @@ +/* Platform-specific SIMD declarations of math functions. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef _MATH_H +# error "Never include directly;\ + include instead." +#endif + +/* Get default empty definitions for simd declarations. 
*/ +#include + +#if defined __x86_64__ && defined __FAST_MATH__ +# if defined _OPENMP && _OPENMP >= 201307 +/* OpenMP case. */ +# define __DECL_SIMD_x86_64 _Pragma ("omp declare simd notinbranch") +# undef __DECL_SIMD_cos +# define __DECL_SIMD_cos __DECL_SIMD_x86_64 +# endif +#endif diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure index 7d4dadd4fd..1493523e1c 100644 --- a/sysdeps/x86_64/configure +++ b/sysdeps/x86_64/configure @@ -275,6 +275,10 @@ fi config_vars="$config_vars config-cflags-avx2 = $libc_cv_cc_avx2" +if test x"$build_mathvec" = xnotset; then + build_mathvec=yes +fi + $as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h # work around problem with autoconf and empty lines at the end of files diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac index c9f9a51f72..1c2b35fe92 100644 --- a/sysdeps/x86_64/configure.ac +++ b/sysdeps/x86_64/configure.ac @@ -99,6 +99,10 @@ if test $libc_cv_cc_avx2 = yes; then fi LIBC_CONFIG_VAR([config-cflags-avx2], [$libc_cv_cc_avx2]) +if test x"$build_mathvec" = xnotset; then + build_mathvec=yes +fi + dnl It is always possible to access static and hidden symbols in an dnl position independent way. 
AC_DEFINE(PI_STATIC_AND_HIDDEN) diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile new file mode 100644 index 0000000000..2f16323f78 --- /dev/null +++ b/sysdeps/x86_64/fpu/Makefile @@ -0,0 +1,5 @@ +ifeq ($(subdir),mathvec) +libmvec-support += svml_d_cos2_core svml_d_cos4_core_avx \ + svml_d_cos4_core svml_d_cos8_core \ + svml_d_cos_data init-arch +endif diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions new file mode 100644 index 0000000000..4290e73589 --- /dev/null +++ b/sysdeps/x86_64/fpu/Versions @@ -0,0 +1,5 @@ +libmvec { + GLIBC_2.22 { + _ZGVbN2v_cos; _ZGVcN4v_cos; _ZGVdN4v_cos; _ZGVeN8v_cos; + } +} diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile index 12b0526e50..b2f3266490 100644 --- a/sysdeps/x86_64/fpu/multiarch/Makefile +++ b/sysdeps/x86_64/fpu/multiarch/Makefile @@ -51,3 +51,8 @@ CFLAGS-slowexp-avx.c = -msse2avx -DSSE2AVX CFLAGS-s_tan-avx.c = -msse2avx -DSSE2AVX endif endif + +ifeq ($(subdir),mathvec) +libmvec-sysdep_routines += svml_d_cos2_core_sse4 svml_d_cos4_core_avx2 \ + svml_d_cos8_core_avx512 +endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S new file mode 100644 index 0000000000..5f67d83bd4 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S @@ -0,0 +1,38 @@ +/* Multiple versions of vectorized cos, vector length is 2. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+        .text
+ENTRY (_ZGVbN2v_cos)        /* IFUNC resolver: returns in %rax the address of the variant to use. */
+        .type   _ZGVbN2v_cos, @gnu_indirect_function
+        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
+        jne     1f          /* Field is non-zero: skip the __init_cpu_features call. */
+        call    __init_cpu_features
+1:      leaq    _ZGVbN2v_cos_sse4(%rip), %rax       /* Tentatively pick the SSE4 variant. */
+        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+        jz      2f          /* SSE4.1 bit clear: fall back to the SSE2 variant. */
+        ret
+2:      leaq    _ZGVbN2v_cos_sse2(%rip), %rax
+        ret
+END (_ZGVbN2v_cos)
+libmvec_hidden_def (_ZGVbN2v_cos)
+
+#define _ZGVbN2v_cos _ZGVbN2v_cos_sse2  /* The generic file included below is built under the _sse2 name. */
+#include "../svml_d_cos2_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S
new file mode 100644
index 0000000000..11348a37c5
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core_sse4.S
@@ -0,0 +1,223 @@
+/* Function cos vectorized with SSE4.
+   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>. 
*/
+
+#include <sysdep.h>
+#include "svml_d_cos_data.h"
+
+        .text
+ENTRY (_ZGVbN2v_cos_sse4)
+/* ALGORITHM DESCRIPTION:
+
+        ( low accuracy ( < 4ulp ) or enhanced performance
+         ( half of correct mantissa ) implementation )
+
+        Argument representation:
+        arg + Pi/2 = (N*Pi + R)
+
+        Result calculation:
+        cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R)
+        sin(R) is approximated by corresponding polynomial
+ */
+        pushq         %rbp
+        cfi_adjust_cfa_offset (8)
+        cfi_rel_offset (%rbp, 0)
+        movq          %rsp, %rbp
+        cfi_def_cfa_register (%rbp)
+        andq          $-64, %rsp                /* 64-byte aligned frame; %rbp keeps the old %rsp. */
+        subq          $320, %rsp
+        movaps        %xmm0, %xmm3              /* xmm3 = original argument, spilled for the scalar fallback. */
+        movq          __svml_dcos_data@GOTPCREL(%rip), %rax   /* rax = base of the constant table. */
+        movups        __dHalfPI(%rax), %xmm2
+
+/* ARGUMENT RANGE REDUCTION:
+   Add Pi/2 to argument: X' = X+Pi/2
+ */
+        addpd         %xmm3, %xmm2
+        movups        __dInvPI(%rax), %xmm5
+        movups        __dAbsMask(%rax), %xmm4
+
+/* Get absolute argument value: X' = |X'| */
+        andps         %xmm2, %xmm4
+
+/* Y = X'*InvPi + RS : right shifter add */
+        mulpd         %xmm5, %xmm2
+
+/* Check for large arguments path */
+        cmpnlepd      __dRangeVal(%rax), %xmm4
+        movups        __dRShifter(%rax), %xmm6
+        addpd         %xmm6, %xmm2
+        movmskpd      %xmm4, %ecx               /* ecx = one bit per lane that failed the range check. */
+
+/* N = Y - RS : right shifter sub */
+        movaps        %xmm2, %xmm1
+
+/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
+        psllq         $63, %xmm2
+        subpd         %xmm6, %xmm1
+
+/* N = N - 0.5 */
+        subpd         __dOneHalf(%rax), %xmm1
+        movups        __dPI1(%rax), %xmm7
+
+/* R = X - N*Pi1 */
+        mulpd         %xmm1, %xmm7
+        movups        __dPI2(%rax), %xmm4
+
+/* R = R - N*Pi2 */
+        mulpd         %xmm1, %xmm4
+        subpd         %xmm7, %xmm0
+        movups        __dPI3(%rax), %xmm5
+
+/* R = R - N*Pi3 */
+        mulpd         %xmm1, %xmm5
+        subpd         %xmm4, %xmm0
+
+/* R = R - N*Pi4 */
+        movups        __dPI4(%rax), %xmm6
+        mulpd         %xmm6, %xmm1
+        subpd         %xmm5, %xmm0
+        subpd         %xmm1, %xmm0
+
+/* POLYNOMIAL APPROXIMATION: R2 = R*R */
+        movaps        %xmm0, %xmm4
+        mulpd         %xmm0, %xmm4
+        movups        __dC7(%rax), %xmm1
+        mulpd         %xmm4, %xmm1
+        addpd         __dC6(%rax), %xmm1
+        mulpd         %xmm4, %xmm1
+        addpd         __dC5(%rax), %xmm1
+        mulpd         %xmm4, %xmm1
+        addpd         __dC4(%rax), %xmm1
+
+/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
+        mulpd         %xmm4, %xmm1
+        addpd         __dC3(%rax), %xmm1
+
+/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */
+        mulpd         %xmm4, %xmm1
+        addpd         __dC2(%rax), %xmm1
+        mulpd         %xmm4, %xmm1
+        addpd         __dC1(%rax), %xmm1
+        mulpd         %xmm1, %xmm4
+        mulpd         %xmm0, %xmm4
+        addpd         %xmm4, %xmm0
+
+/* RECONSTRUCTION:
+   Final sign setting: Res = Poly^SignRes */
+        xorps         %xmm2, %xmm0
+        testl         %ecx, %ecx
+        jne           .LBL_1_3                  /* Some lane is out of range: patch it with scalar cos. */
+
+.LBL_1_2:
+        cfi_remember_state
+        movq          %rbp, %rsp
+        cfi_def_cfa_register (%rsp)
+        popq          %rbp
+        cfi_adjust_cfa_offset (-8)
+        cfi_restore (%rbp)
+        ret
+
+.LBL_1_3:
+        cfi_restore_state
+        movups        %xmm3, 192(%rsp)          /* 192(%rsp): input lanes. */
+        movups        %xmm0, 256(%rsp)          /* 256(%rsp): result lanes, patched in place below. */
+        je            .LBL_1_2                  /* Never taken: reached only via the jne above, and movups leaves flags alone. */
+
+        xorb          %dl, %dl
+        xorl          %eax, %eax
+        movups        %xmm8, 112(%rsp)          /* Spill xmm8-xmm15, %rsi and %rdi around the scalar calls. */
+        movups        %xmm9, 96(%rsp)
+        movups        %xmm10, 80(%rsp)
+        movups        %xmm11, 64(%rsp)
+        movups        %xmm12, 48(%rsp)
+        movups        %xmm13, 32(%rsp)
+        movups        %xmm14, 16(%rsp)
+        movups        %xmm15, (%rsp)
+        movq          %rsi, 136(%rsp)
+        movq          %rdi, 128(%rsp)
+        movq          %r12, 168(%rsp)
+        cfi_offset_rel_rsp (12, 168)
+        movb          %dl, %r12b                /* r12b = lane-pair counter, starts at 0. */
+        movq          %r13, 160(%rsp)
+        cfi_offset_rel_rsp (13, 160)
+        movl          %ecx, %r13d               /* r13d = mask of lanes needing the scalar path. */
+        movq          %r14, 152(%rsp)
+        cfi_offset_rel_rsp (14, 152)
+        movl          %eax, %r14d               /* r14d = bit index of the even lane of the pair. */
+        movq          %r15, 144(%rsp)
+        cfi_offset_rel_rsp (15, 144)
+        cfi_remember_state
+
+.LBL_1_6:
+        btl           %r14d, %r13d              /* Even lane of this pair flagged? */
+        jc            .LBL_1_12
+
+.LBL_1_7:
+        lea           1(%r14), %esi
+        btl           %esi, %r13d               /* Odd lane of this pair flagged? */
+        jc            .LBL_1_10
+
+.LBL_1_8:
+        incb          %r12b
+        addl          $2, %r14d
+        cmpb          $16, %r12b                /* Scans 16 pairs (mask bits 0..31); only the movmskpd bits can be set. */
+        jb            .LBL_1_6
+
+        movups        112(%rsp), %xmm8
+        movups        96(%rsp), %xmm9
+        movups        80(%rsp), %xmm10
+        movups        64(%rsp), %xmm11
+        movups        48(%rsp), %xmm12
+        movups        32(%rsp), %xmm13
+        movups        16(%rsp), %xmm14
+        movups        (%rsp), %xmm15
+        movq          136(%rsp), %rsi
+        movq          128(%rsp), %rdi
+        movq          168(%rsp), %r12
+        cfi_restore (%r12)
+        movq          160(%rsp), %r13
+        cfi_restore (%r13)
+        movq          152(%rsp), %r14
+        cfi_restore (%r14)
+        movq          144(%rsp), %r15
+        cfi_restore (%r15)
+        movups        256(%rsp), %xmm0          /* Reload the patched result vector. */
+        jmp           .LBL_1_2
+
+.LBL_1_10:
+        cfi_restore_state
+        movzbl        %r12b, %r15d
+        shlq          $4, %r15                  /* r15 = pair counter * 16 = byte offset of the pair. */
+        movsd         200(%rsp,%r15), %xmm0     /* Odd lane input (192 + 8). */
+
+        call          cos@PLT
+
+        movsd         %xmm0, 264(%rsp,%r15)     /* Odd lane result (256 + 8). */
+        jmp           .LBL_1_8
+
+.LBL_1_12:
+        movzbl        %r12b, %r15d
+        shlq          $4, %r15
+        movsd         192(%rsp,%r15), %xmm0     /* Even lane input. */
+
+        call          cos@PLT
+
+        movsd         %xmm0, 256(%rsp,%r15)     /* Even lane result. */
+        jmp           .LBL_1_7
+
+END (_ZGVbN2v_cos_sse4)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S
new file mode 100644
index 0000000000..5babb834ad
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S
@@ -0,0 +1,38 @@
+/* Multiple versions of vectorized cos, vector length is 4.
+   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>. 
*/
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+        .text
+ENTRY (_ZGVdN4v_cos)        /* IFUNC resolver: returns in %rax the address of the variant to use. */
+        .type   _ZGVdN4v_cos, @gnu_indirect_function
+        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
+        jne     1f          /* Field is non-zero: skip the __init_cpu_features call. */
+        call    __init_cpu_features
+1:      leaq    _ZGVdN4v_cos_avx2(%rip), %rax       /* Tentatively pick the AVX2 variant. */
+        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+        jz      2f          /* AVX2 not usable: fall back to the SSE wrapper. */
+        ret
+2:      leaq    _ZGVdN4v_cos_sse_wrapper(%rip), %rax
+        ret
+END (_ZGVdN4v_cos)
+libmvec_hidden_def (_ZGVdN4v_cos)
+
+#define _ZGVdN4v_cos _ZGVdN4v_cos_sse_wrapper  /* The generic file included below is built under the wrapper name. */
+#include "../svml_d_cos4_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S
new file mode 100644
index 0000000000..f192ba022e
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core_avx2.S
@@ -0,0 +1,207 @@
+/* Function cos vectorized with AVX2.
+   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>. 
*/
+
+#include <sysdep.h>
+#include "svml_d_cos_data.h"
+
+        .text
+ENTRY (_ZGVdN4v_cos_avx2)
+
+/* ALGORITHM DESCRIPTION:
+
+      ( low accuracy ( < 4ulp ) or enhanced performance
+       ( half of correct mantissa ) implementation )
+
+      Argument representation:
+      arg + Pi/2 = (N*Pi + R)
+
+      Result calculation:
+      cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R)
+      sin(R) is approximated by corresponding polynomial
+ */
+        pushq         %rbp
+        cfi_adjust_cfa_offset (8)
+        cfi_rel_offset (%rbp, 0)
+        movq          %rsp, %rbp
+        cfi_def_cfa_register (%rbp)
+        andq          $-64, %rsp                /* 64-byte aligned frame; %rbp keeps the old %rsp. */
+        subq          $448, %rsp
+        movq          __svml_dcos_data@GOTPCREL(%rip), %rax   /* rax = base of the constant table. */
+        vmovapd       %ymm0, %ymm1              /* ymm1 = original argument, spilled for the scalar fallback. */
+        vmovupd       __dInvPI(%rax), %ymm4
+        vmovupd       __dRShifter(%rax), %ymm5
+
+/*
+   ARGUMENT RANGE REDUCTION:
+   Add Pi/2 to argument: X' = X+Pi/2
+ */
+        vaddpd        __dHalfPI(%rax), %ymm1, %ymm7
+
+/* Get absolute argument value: X' = |X'| */
+        vandpd        __dAbsMask(%rax), %ymm7, %ymm2
+
+/* Y = X'*InvPi + RS : right shifter add */
+        vfmadd213pd   %ymm5, %ymm4, %ymm7
+        vmovupd       __dC7(%rax), %ymm4
+
+/* Check for large arguments path */
+        vcmpnle_uqpd  __dRangeVal(%rax), %ymm2, %ymm3
+
+/* N = Y - RS : right shifter sub */
+        vsubpd        %ymm5, %ymm7, %ymm6
+        vmovupd       __dPI1_FMA(%rax), %ymm2
+
+/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
+        vpsllq        $63, %ymm7, %ymm7
+
+/* N = N - 0.5 */
+        vsubpd        __dOneHalf(%rax), %ymm6, %ymm0
+        vmovmskpd     %ymm3, %ecx               /* ecx = one bit per lane that failed the range check. */
+
+/* R = X - N*Pi1 */
+        vmovapd       %ymm1, %ymm3
+        vfnmadd231pd  %ymm0, %ymm2, %ymm3
+
+/* R = R - N*Pi2 */
+        vfnmadd231pd  __dPI2_FMA(%rax), %ymm0, %ymm3
+
+/* R = R - N*Pi3 */
+        vfnmadd132pd  __dPI3_FMA(%rax), %ymm3, %ymm0
+
+/* POLYNOMIAL APPROXIMATION: R2 = R*R */
+        vmulpd        %ymm0, %ymm0, %ymm5
+        vfmadd213pd   __dC6(%rax), %ymm5, %ymm4
+        vfmadd213pd   __dC5(%rax), %ymm5, %ymm4
+        vfmadd213pd   __dC4(%rax), %ymm5, %ymm4
+
+/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
+        vfmadd213pd   __dC3(%rax), %ymm5, %ymm4
+
+/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */
+        vfmadd213pd   __dC2(%rax), %ymm5, %ymm4
+        vfmadd213pd   __dC1(%rax), %ymm5, %ymm4
+        vmulpd        %ymm5, %ymm4, %ymm6
+        vfmadd213pd   %ymm0, %ymm0, %ymm6
+
+/*
+   RECONSTRUCTION:
+   Final sign setting: Res = Poly^SignRes */
+        vxorpd        %ymm7, %ymm6, %ymm0
+        testl         %ecx, %ecx
+        jne           .LBL_1_3                  /* Some lane is out of range: patch it with scalar cos. */
+
+.LBL_1_2:
+        cfi_remember_state
+        movq          %rbp, %rsp
+        cfi_def_cfa_register (%rsp)
+        popq          %rbp
+        cfi_adjust_cfa_offset (-8)
+        cfi_restore (%rbp)
+        ret
+
+.LBL_1_3:
+        cfi_restore_state
+        vmovupd       %ymm1, 320(%rsp)          /* 320(%rsp): input lanes. */
+        vmovupd       %ymm0, 384(%rsp)          /* 384(%rsp): result lanes, patched in place below. */
+        je            .LBL_1_2                  /* Never taken: reached only via the jne above, and vmovupd leaves flags alone. */
+
+        xorb          %dl, %dl
+        xorl          %eax, %eax
+        vmovups       %ymm8, 224(%rsp)          /* Spill ymm8-ymm15 (vzeroupper below clears their upper halves), %rsi and %rdi. */
+        vmovups       %ymm9, 192(%rsp)
+        vmovups       %ymm10, 160(%rsp)
+        vmovups       %ymm11, 128(%rsp)
+        vmovups       %ymm12, 96(%rsp)
+        vmovups       %ymm13, 64(%rsp)
+        vmovups       %ymm14, 32(%rsp)
+        vmovups       %ymm15, (%rsp)
+        movq          %rsi, 264(%rsp)
+        movq          %rdi, 256(%rsp)
+        movq          %r12, 296(%rsp)
+        cfi_offset_rel_rsp (12, 296)
+        movb          %dl, %r12b                /* r12b = lane-pair counter, starts at 0. */
+        movq          %r13, 288(%rsp)
+        cfi_offset_rel_rsp (13, 288)
+        movl          %ecx, %r13d               /* r13d = mask of lanes needing the scalar path. */
+        movq          %r14, 280(%rsp)
+        cfi_offset_rel_rsp (14, 280)
+        movl          %eax, %r14d               /* r14d = bit index of the even lane of the pair. */
+        movq          %r15, 272(%rsp)
+        cfi_offset_rel_rsp (15, 272)
+        cfi_remember_state
+
+.LBL_1_6:
+        btl           %r14d, %r13d              /* Even lane of this pair flagged? */
+        jc            .LBL_1_12
+
+.LBL_1_7:
+        lea           1(%r14), %esi
+        btl           %esi, %r13d               /* Odd lane of this pair flagged? */
+        jc            .LBL_1_10
+
+.LBL_1_8:
+        incb          %r12b
+        addl          $2, %r14d
+        cmpb          $16, %r12b                /* Scans 16 pairs (mask bits 0..31); only the vmovmskpd bits can be set. */
+        jb            .LBL_1_6
+
+        vmovups       224(%rsp), %ymm8
+        vmovups       192(%rsp), %ymm9
+        vmovups       160(%rsp), %ymm10
+        vmovups       128(%rsp), %ymm11
+        vmovups       96(%rsp), %ymm12
+        vmovups       64(%rsp), %ymm13
+        vmovups       32(%rsp), %ymm14
+        vmovups       (%rsp), %ymm15
+        vmovupd       384(%rsp), %ymm0          /* Reload the patched result vector. */
+        movq          264(%rsp), %rsi
+        movq          256(%rsp), %rdi
+        movq          296(%rsp), %r12
+        cfi_restore (%r12)
+        movq          288(%rsp), %r13
+        cfi_restore (%r13)
+        movq          280(%rsp), %r14
+        cfi_restore (%r14)
+        movq          272(%rsp), %r15
+        cfi_restore (%r15)
+        jmp           .LBL_1_2
+
+.LBL_1_10:
+        cfi_restore_state
+        movzbl        %r12b, %r15d
+        shlq          $4, %r15                  /* r15 = pair counter * 16 = byte offset of the pair. */
+        vmovsd        328(%rsp,%r15), %xmm0     /* Odd lane input (320 + 8). */
+        vzeroupper
+
+        call          cos@PLT
+
+        vmovsd        %xmm0, 392(%rsp,%r15)     /* Odd lane result (384 + 8). */
+        jmp           .LBL_1_8
+
+.LBL_1_12:
+        movzbl        %r12b, %r15d
+        shlq          $4, %r15
+        vmovsd        320(%rsp,%r15), %xmm0     /* Even lane input. */
+        vzeroupper
+
+        call          cos@PLT
+
+        vmovsd        %xmm0, 384(%rsp,%r15)     /* Even lane result. */
+        jmp           .LBL_1_7
+
+END (_ZGVdN4v_cos_avx2)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
new file mode 100644
index 0000000000..ba3b66f69f
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
@@ -0,0 +1,39 @@
+/* Multiple versions of vectorized cos, vector length is 8.
+   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>. 
*/
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+        .text
+ENTRY (_ZGVeN8v_cos)        /* IFUNC resolver: returns in %rax the address of the variant to use. */
+        .type   _ZGVeN8v_cos, @gnu_indirect_function
+        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
+        jne     1f          /* Field is non-zero: skip the __init_cpu_features call. */
+        call    __init_cpu_features
+1:      leaq    _ZGVeN8v_cos_skx(%rip), %rax        /* Tentatively pick the SKX variant. */
+        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+        jnz     3f          /* AVX512DQ usable: return the SKX variant. */
+2:      leaq    _ZGVeN8v_cos_knl(%rip), %rax
+        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+        jnz     3f          /* AVX512F usable: return the KNL variant. */
+        leaq    _ZGVeN8v_cos_avx2_wrapper(%rip), %rax       /* Otherwise use the AVX2 wrapper. */
+3:      ret                 /* Branches above use the "Nf" local-label form; a bare number would be an absolute address. */
+END (_ZGVeN8v_cos)
+
+#define _ZGVeN8v_cos _ZGVeN8v_cos_avx2_wrapper  /* The generic file included below is built under the wrapper name. */
+#include "../svml_d_cos8_core.S"
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
new file mode 100644
index 0000000000..14695ec3c9
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
@@ -0,0 +1,463 @@
+/* Function cos vectorized with AVX-512, KNL and SKX versions.
+   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>. 
*/
+
+#include <sysdep.h>
+#include "svml_d_cos_data.h"
+#include "svml_d_wrapper_impl.h"
+
+        .text
+ENTRY (_ZGVeN8v_cos_knl)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
+#else
+/*
+  ALGORITHM DESCRIPTION:
+
+       ( low accuracy ( < 4ulp ) or enhanced performance
+        ( half of correct mantissa ) implementation )
+
+        Argument representation:
+        arg + Pi/2 = (N*Pi + R)
+
+        Result calculation:
+        cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R)
+        sin(R) is approximated by corresponding polynomial
+ */
+        pushq         %rbp
+        cfi_adjust_cfa_offset (8)
+        cfi_rel_offset (%rbp, 0)
+        movq          %rsp, %rbp
+        cfi_def_cfa_register (%rbp)
+        andq          $-64, %rsp                /* 64-byte aligned frame; %rbp keeps the old %rsp. */
+        subq          $1280, %rsp
+        movq          __svml_dcos_data@GOTPCREL(%rip), %rax   /* rax = base of the constant table. */
+
+/* R = X - N*Pi1 */
+        vmovaps       %zmm0, %zmm7
+
+/* Check for large arguments path */
+        movq          $-1, %rcx
+
+/*
+  ARGUMENT RANGE REDUCTION:
+  Add Pi/2 to argument: X' = X+Pi/2
+ */
+        vaddpd        __dHalfPI(%rax), %zmm0, %zmm5
+        vmovups       __dInvPI(%rax), %zmm3
+
+/* Get absolute argument value: X' = |X'| */
+        vpandq        __dAbsMask(%rax), %zmm5, %zmm1
+
+/* Y = X'*InvPi + RS : right shifter add */
+        vfmadd213pd   __dRShifter(%rax), %zmm3, %zmm5
+        vmovups       __dPI1_FMA(%rax), %zmm6
+
+/* N = Y - RS : right shifter sub */
+        vsubpd        __dRShifter(%rax), %zmm5, %zmm4
+
+/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
+        vpsllq        $63, %zmm5, %zmm12
+        vmovups       __dC7(%rax), %zmm8
+
+/* N = N - 0.5 */
+        vsubpd        __dOneHalf(%rax), %zmm4, %zmm10
+        vcmppd        $22, __dRangeVal(%rax), %zmm1, %k1
+        vpbroadcastq  %rcx, %zmm2{%k1}{z}       /* All-ones in lanes selected by k1, zero elsewhere. */
+        vfnmadd231pd  %zmm10, %zmm6, %zmm7
+        vptestmq      %zmm2, %zmm2, %k0
+
+/* R = R - N*Pi2 */
+        vfnmadd231pd  __dPI2_FMA(%rax), %zmm10, %zmm7
+        kmovw         %k0, %ecx
+        movzbl        %cl, %ecx                 /* ecx = 8-bit mask of lanes needing the scalar path. */
+
+/* R = R - N*Pi3 */
+        vfnmadd132pd  __dPI3_FMA(%rax), %zmm7, %zmm10
+
+/*
+  POLYNOMIAL APPROXIMATION:
+  R2 = R*R
+ */
+        vmulpd        %zmm10, %zmm10, %zmm9
+        vfmadd213pd   __dC6(%rax), %zmm9, %zmm8
+        vfmadd213pd   __dC5(%rax), %zmm9, %zmm8
+        vfmadd213pd   __dC4(%rax), %zmm9, %zmm8
+
+/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
+        vfmadd213pd   __dC3(%rax), %zmm9, %zmm8
+
+/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */
+        vfmadd213pd   __dC2(%rax), %zmm9, %zmm8
+        vfmadd213pd   __dC1(%rax), %zmm9, %zmm8
+        vmulpd        %zmm9, %zmm8, %zmm11
+        vfmadd213pd   %zmm10, %zmm10, %zmm11
+
+/*
+  RECONSTRUCTION:
+  Final sign setting: Res = Poly^SignRes
+ */
+        vpxorq        %zmm12, %zmm11, %zmm1
+        testl         %ecx, %ecx
+        jne           .LBL_1_3                  /* Some lane is out of range: patch it with scalar cos. */
+
+.LBL_1_2:
+        cfi_remember_state
+        vmovaps       %zmm1, %zmm0              /* Result is kept in zmm1 until here. */
+        movq          %rbp, %rsp
+        cfi_def_cfa_register (%rsp)
+        popq          %rbp
+        cfi_adjust_cfa_offset (-8)
+        cfi_restore (%rbp)
+        ret
+
+.LBL_1_3:
+        cfi_restore_state
+        vmovups       %zmm0, 1152(%rsp)         /* 1152(%rsp): input lanes. */
+        vmovups       %zmm1, 1216(%rsp)         /* 1216(%rsp): result lanes, patched in place below. */
+        je            .LBL_1_2                  /* Never taken: reached only via the jne above, and vmovups leaves flags alone. */
+
+        xorb          %dl, %dl
+        kmovw         %k4, 1048(%rsp)           /* Spill k4-k7, zmm16-zmm31, %rsi and %rdi around the scalar calls. */
+        xorl          %eax, %eax
+        kmovw         %k5, 1040(%rsp)
+        kmovw         %k6, 1032(%rsp)
+        kmovw         %k7, 1024(%rsp)
+        vmovups       %zmm16, 960(%rsp)
+        vmovups       %zmm17, 896(%rsp)
+        vmovups       %zmm18, 832(%rsp)
+        vmovups       %zmm19, 768(%rsp)
+        vmovups       %zmm20, 704(%rsp)
+        vmovups       %zmm21, 640(%rsp)
+        vmovups       %zmm22, 576(%rsp)
+        vmovups       %zmm23, 512(%rsp)
+        vmovups       %zmm24, 448(%rsp)
+        vmovups       %zmm25, 384(%rsp)
+        vmovups       %zmm26, 320(%rsp)
+        vmovups       %zmm27, 256(%rsp)
+        vmovups       %zmm28, 192(%rsp)
+        vmovups       %zmm29, 128(%rsp)
+        vmovups       %zmm30, 64(%rsp)
+        vmovups       %zmm31, (%rsp)
+        movq          %rsi, 1064(%rsp)
+        movq          %rdi, 1056(%rsp)
+        movq          %r12, 1096(%rsp)
+        cfi_offset_rel_rsp (12, 1096)
+        movb          %dl, %r12b                /* r12b = lane-pair counter, starts at 0. */
+        movq          %r13, 1088(%rsp)
+        cfi_offset_rel_rsp (13, 1088)
+        movl          %ecx, %r13d               /* r13d = mask of lanes needing the scalar path. */
+        movq          %r14, 1080(%rsp)
+        cfi_offset_rel_rsp (14, 1080)
+        movl          %eax, %r14d               /* r14d = bit index of the even lane of the pair. */
+        movq          %r15, 1072(%rsp)
+        cfi_offset_rel_rsp (15, 1072)
+        cfi_remember_state
+
+.LBL_1_6:
+        btl           %r14d, %r13d              /* Even lane of this pair flagged? */
+        jc            .LBL_1_12
+
+.LBL_1_7:
+        lea           1(%r14), %esi
+        btl           %esi, %r13d               /* Odd lane of this pair flagged? */
+        jc            .LBL_1_10
+
+.LBL_1_8:
+        addb          $1, %r12b
+        addl          $2, %r14d
+        cmpb          $16, %r12b                /* Scans 16 pairs (mask bits 0..31); only the low 8 bits can be set. */
+        jb            .LBL_1_6
+
+        kmovw         1048(%rsp), %k4
+        movq          1064(%rsp), %rsi
+        kmovw         1040(%rsp), %k5
+        movq          1056(%rsp), %rdi
+        kmovw         1032(%rsp), %k6
+        movq          1096(%rsp), %r12
+        cfi_restore (%r12)
+        movq          1088(%rsp), %r13
+        cfi_restore (%r13)
+        kmovw         1024(%rsp), %k7
+        vmovups       960(%rsp), %zmm16
+        vmovups       896(%rsp), %zmm17
+        vmovups       832(%rsp), %zmm18
+        vmovups       768(%rsp), %zmm19
+        vmovups       704(%rsp), %zmm20
+        vmovups       640(%rsp), %zmm21
+        vmovups       576(%rsp), %zmm22
+        vmovups       512(%rsp), %zmm23
+        vmovups       448(%rsp), %zmm24
+        vmovups       384(%rsp), %zmm25
+        vmovups       320(%rsp), %zmm26
+        vmovups       256(%rsp), %zmm27
+        vmovups       192(%rsp), %zmm28
+        vmovups       128(%rsp), %zmm29
+        vmovups       64(%rsp), %zmm30
+        vmovups       (%rsp), %zmm31
+        movq          1080(%rsp), %r14
+        cfi_restore (%r14)
+        movq          1072(%rsp), %r15
+        cfi_restore (%r15)
+        vmovups       1216(%rsp), %zmm1         /* Reload the patched result vector. */
+        jmp           .LBL_1_2
+
+.LBL_1_10:
+        cfi_restore_state
+        movzbl        %r12b, %r15d
+        shlq          $4, %r15                  /* r15 = pair counter * 16 = byte offset of the pair. */
+        vmovsd        1160(%rsp,%r15), %xmm0    /* Odd lane input (1152 + 8). */
+        call          cos@PLT
+        vmovsd        %xmm0, 1224(%rsp,%r15)    /* Odd lane result (1216 + 8). */
+        jmp           .LBL_1_8
+
+.LBL_1_12:
+        movzbl        %r12b, %r15d
+        shlq          $4, %r15
+        vmovsd        1152(%rsp,%r15), %xmm0    /* Even lane input. */
+        call          cos@PLT
+        vmovsd        %xmm0, 1216(%rsp,%r15)    /* Even lane result. */
+        jmp           .LBL_1_7
+#endif
+END (_ZGVeN8v_cos_knl)
+
+ENTRY (_ZGVeN8v_cos_skx)
+#ifndef HAVE_AVX512_ASM_SUPPORT
+WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
+#else
+/*
+  ALGORITHM DESCRIPTION:
+
+       ( low accuracy ( < 4ulp ) or enhanced performance
+        ( half of correct mantissa ) implementation )
+
+        Argument representation:
+        arg + Pi/2 = (N*Pi + R)
+
+        Result calculation:
+        cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R)
+        sin(R) is approximated by corresponding polynomial
+ */
+        pushq         %rbp
+        cfi_adjust_cfa_offset (8)
+        cfi_rel_offset (%rbp, 0)
+        movq          %rsp, %rbp
+        cfi_def_cfa_register (%rbp)
+        andq          $-64, %rsp
+        subq          $1280, %rsp
+        movq          __svml_dcos_data@GOTPCREL(%rip), %rax
+
+/* R = X - N*Pi1 */
+        vmovaps       %zmm0, %zmm8
+
+/* Check for large arguments path */
+        vpbroadcastq  .L_2il0floatpacket.16(%rip), %zmm2
+
+/*
+  ARGUMENT RANGE REDUCTION:
+  Add Pi/2 to argument: X' = X+Pi/2
+ */
+        vaddpd        __dHalfPI(%rax), %zmm0, %zmm6
+        vmovups       __dInvPI(%rax), %zmm3
+        vmovups       __dRShifter(%rax), %zmm4
+        vmovups       __dPI1_FMA(%rax), %zmm7
+        vmovups       __dC7(%rax), %zmm9
+
+/* Get absolute 
argument value: X' = |X'| */ + vandpd __dAbsMask(%rax), %zmm6, %zmm1 + +/* Y = X'*InvPi + RS : right shifter add */ + vfmadd213pd %zmm4, %zmm3, %zmm6 + vcmppd $18, __dRangeVal(%rax), %zmm1, %k1 + +/* SignRes = Y<<63 : shift LSB to MSB place for result sign */ + vpsllq $63, %zmm6, %zmm13 + +/* N = Y - RS : right shifter sub */ + vsubpd %zmm4, %zmm6, %zmm5 + +/* N = N - 0.5 */ + vsubpd __dOneHalf(%rax), %zmm5, %zmm11 + vfnmadd231pd %zmm11, %zmm7, %zmm8 + +/* R = R - N*Pi2 */ + vfnmadd231pd __dPI2_FMA(%rax), %zmm11, %zmm8 + +/* R = R - N*Pi3 */ + vfnmadd132pd __dPI3_FMA(%rax), %zmm8, %zmm11 + +/* + POLYNOMIAL APPROXIMATION: + R2 = R*R + */ + vmulpd %zmm11, %zmm11, %zmm10 + vfmadd213pd __dC6(%rax), %zmm10, %zmm9 + vfmadd213pd __dC5(%rax), %zmm10, %zmm9 + vfmadd213pd __dC4(%rax), %zmm10, %zmm9 + +/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */ + vfmadd213pd __dC3(%rax), %zmm10, %zmm9 + +/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))) */ + vfmadd213pd __dC2(%rax), %zmm10, %zmm9 + vfmadd213pd __dC1(%rax), %zmm10, %zmm9 + vmulpd %zmm10, %zmm9, %zmm12 + vfmadd213pd %zmm11, %zmm11, %zmm12 + vpandnq %zmm1, %zmm1, %zmm2{%k1} + vcmppd $3, %zmm2, %zmm2, %k0 + +/* + RECONSTRUCTION: + Final sign setting: Res = Poly^SignRes + */ + vxorpd %zmm13, %zmm12, %zmm1 + kmovw %k0, %ecx + testl %ecx, %ecx + jne .LBL_2_3 + +.LBL_2_2: + cfi_remember_state + vmovaps %zmm1, %zmm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret + +.LBL_2_3: + cfi_restore_state + vmovups %zmm0, 1152(%rsp) + vmovups %zmm1, 1216(%rsp) + je .LBL_2_2 + + xorb %dl, %dl + xorl %eax, %eax + kmovw %k4, 1048(%rsp) + kmovw %k5, 1040(%rsp) + kmovw %k6, 1032(%rsp) + kmovw %k7, 1024(%rsp) + vmovups %zmm16, 960(%rsp) + vmovups %zmm17, 896(%rsp) + vmovups %zmm18, 832(%rsp) + vmovups %zmm19, 768(%rsp) + vmovups %zmm20, 704(%rsp) + vmovups %zmm21, 640(%rsp) + vmovups %zmm22, 576(%rsp) + vmovups %zmm23, 512(%rsp) + vmovups %zmm24, 448(%rsp) + vmovups %zmm25, 384(%rsp) + 
vmovups %zmm26, 320(%rsp) + vmovups %zmm27, 256(%rsp) + vmovups %zmm28, 192(%rsp) + vmovups %zmm29, 128(%rsp) + vmovups %zmm30, 64(%rsp) + vmovups %zmm31, (%rsp) + movq %rsi, 1064(%rsp) + movq %rdi, 1056(%rsp) + movq %r12, 1096(%rsp) + cfi_offset_rel_rsp (12, 1096) + movb %dl, %r12b + movq %r13, 1088(%rsp) + cfi_offset_rel_rsp (13, 1088) + movl %ecx, %r13d + movq %r14, 1080(%rsp) + cfi_offset_rel_rsp (14, 1080) + movl %eax, %r14d + movq %r15, 1072(%rsp) + cfi_offset_rel_rsp (15, 1072) + cfi_remember_state + +.LBL_2_6: + btl %r14d, %r13d + jc .LBL_2_12 + +.LBL_2_7: + lea 1(%r14), %esi + btl %esi, %r13d + jc .LBL_2_10 + +.LBL_2_8: + incb %r12b + addl $2, %r14d + cmpb $16, %r12b + jb .LBL_2_6 + + kmovw 1048(%rsp), %k4 + kmovw 1040(%rsp), %k5 + kmovw 1032(%rsp), %k6 + kmovw 1024(%rsp), %k7 + vmovups 960(%rsp), %zmm16 + vmovups 896(%rsp), %zmm17 + vmovups 832(%rsp), %zmm18 + vmovups 768(%rsp), %zmm19 + vmovups 704(%rsp), %zmm20 + vmovups 640(%rsp), %zmm21 + vmovups 576(%rsp), %zmm22 + vmovups 512(%rsp), %zmm23 + vmovups 448(%rsp), %zmm24 + vmovups 384(%rsp), %zmm25 + vmovups 320(%rsp), %zmm26 + vmovups 256(%rsp), %zmm27 + vmovups 192(%rsp), %zmm28 + vmovups 128(%rsp), %zmm29 + vmovups 64(%rsp), %zmm30 + vmovups (%rsp), %zmm31 + vmovups 1216(%rsp), %zmm1 + movq 1064(%rsp), %rsi + movq 1056(%rsp), %rdi + movq 1096(%rsp), %r12 + cfi_restore (%r12) + movq 1088(%rsp), %r13 + cfi_restore (%r13) + movq 1080(%rsp), %r14 + cfi_restore (%r14) + movq 1072(%rsp), %r15 + cfi_restore (%r15) + jmp .LBL_2_2 + +.LBL_2_10: + cfi_restore_state + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1160(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1160(%rsp,%r15), %xmm0 + + call cos@PLT + + vmovsd %xmm0, 1224(%rsp,%r15) + jmp .LBL_2_8 + +.LBL_2_12: + movzbl %r12b, %r15d + shlq $4, %r15 + vmovsd 1152(%rsp,%r15), %xmm0 + vzeroupper + vmovsd 1152(%rsp,%r15), %xmm0 + + call cos@PLT + + vmovsd %xmm0, 1216(%rsp,%r15) + jmp .LBL_2_7 +#endif +END (_ZGVeN8v_cos_skx) + + .section .rodata, "a" 
+/* All-ones quadword that _ZGVeN8v_cos_skx broadcasts into %zmm2 before its + range check. */ +.L_2il0floatpacket.16: + .long 0xffffffff,0xffffffff + .type .L_2il0floatpacket.16,@object diff --git a/sysdeps/x86_64/fpu/svml_d_cos2_core.S b/sysdeps/x86_64/fpu/svml_d_cos2_core.S new file mode 100644 index 0000000000..a1c5bee935 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_cos2_core.S @@ -0,0 +1,30 @@ +/* Function cos vectorized with SSE2. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include "svml_d_cos_data.h" +#include "svml_d_wrapper_impl.h" + + .text +/* WRAPPER_IMPL_SSE2 calls scalar cos once for each double in %xmm0. */ +ENTRY (_ZGVbN2v_cos) +WRAPPER_IMPL_SSE2 cos +END (_ZGVbN2v_cos) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVbN2v_cos) +#endif diff --git a/sysdeps/x86_64/fpu/svml_d_cos4_core.S b/sysdeps/x86_64/fpu/svml_d_cos4_core.S new file mode 100644 index 0000000000..a505b44cc2 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_cos4_core.S @@ -0,0 +1,30 @@ +/* Function cos vectorized with AVX2, wrapper version. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version.
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include "svml_d_cos_data.h" +#include "svml_d_wrapper_impl.h" + + .text +/* WRAPPER_IMPL_AVX runs _ZGVbN2v_cos on each 128-bit half of %ymm0. */ +ENTRY (_ZGVdN4v_cos) +WRAPPER_IMPL_AVX _ZGVbN2v_cos +END (_ZGVdN4v_cos) + +#ifndef USE_MULTIARCH + libmvec_hidden_def (_ZGVdN4v_cos) +#endif diff --git a/sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S b/sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S new file mode 100644 index 0000000000..bf10b01cc5 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_cos4_core_avx.S @@ -0,0 +1,25 @@ +/* Function cos vectorized in AVX ISA as wrapper to SSE4 ISA version. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + .
*/ + +#include +#include "svml_d_wrapper_impl.h" + + .text +/* WRAPPER_IMPL_AVX runs _ZGVbN2v_cos on each 128-bit half of %ymm0. */ +ENTRY (_ZGVcN4v_cos) +WRAPPER_IMPL_AVX _ZGVbN2v_cos +END (_ZGVcN4v_cos) diff --git a/sysdeps/x86_64/fpu/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/svml_d_cos8_core.S new file mode 100644 index 0000000000..c7507dbef0 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_cos8_core.S @@ -0,0 +1,26 @@ +/* Function cos vectorized with AVX-512, wrapper to AVX2. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include "svml_d_cos_data.h" +#include "svml_d_wrapper_impl.h" + + .text +/* WRAPPER_IMPL_AVX512 calls _ZGVdN4v_cos on each 256-bit half of %zmm0. */ +ENTRY (_ZGVeN8v_cos) +WRAPPER_IMPL_AVX512 _ZGVdN4v_cos +END (_ZGVeN8v_cos) diff --git a/sysdeps/x86_64/fpu/svml_d_cos_data.S b/sysdeps/x86_64/fpu/svml_d_cos_data.S new file mode 100644 index 0000000000..c9bfd63840 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_cos_data.S @@ -0,0 +1,114 @@ +/* Data for vectorized cos. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version.
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include "svml_d_cos_data.h" + +/* Emit \value as 8 quadwords (one 64-byte vector) and raise an assembly + error unless the entry starts \offset bytes into __svml_dcos_data. */ +.macro double_vector offset value +.if .-__svml_dcos_data != \offset +.err +.endif +.rept 8 +.quad \value +.endr +.endm + + .section .rodata, "a" + .align 64 + +/* Data table for vector implementations of function cos. + The table may contain polynomial, reduction, lookup + coefficients and other constants obtained through different + methods of research and experimental work. + */ + .globl __svml_dcos_data +__svml_dcos_data: + +/* General purpose constants: + absolute value mask + */ +double_vector __dAbsMask 0x7fffffffffffffff + +/* working range threshold (2^23) */ +double_vector __dRangeVal 0x4160000000000000 + +/* PI/2 */ +double_vector __dHalfPI 0x3ff921fb54442d18 + +/* 1/PI */ +double_vector __dInvPI 0x3fd45f306dc9c883 + +/* right-shifter constant (1.5 * 2^52) */ +double_vector __dRShifter 0x4338000000000000 + +/* 0.5 */ +double_vector __dOneHalf 0x3fe0000000000000 + +/* Range reduction PI-based constants: + PI high part + */ +double_vector __dPI1 0x400921fb40000000 + +/* PI mid part 1 */ +double_vector __dPI2 0x3e84442d00000000 + +/* PI mid part 2 */ +double_vector __dPI3 0x3d08469880000000 + +/* PI low part */ +double_vector __dPI4 0x3b88cc51701b839a + +/* Range reduction PI-based constants if FMA available: + PI high part (FMA available) + */ +double_vector __dPI1_FMA 0x400921fb54442d18 + +/* PI mid part (FMA available) */ +double_vector __dPI2_FMA 0x3ca1a62633145c06 + +/* PI low part (FMA available) */ +double_vector __dPI3_FMA 0x395c1cd129024e09 + +/* Polynomial coefficients (relative error 2^(-52.115)): */ +double_vector __dC1 0xbfc55555555554a7 +double_vector
__dC2 0x3f8111111110a4a8 +double_vector __dC3 0xbf2a01a019a5b86d +double_vector __dC4 0x3ec71de38030fea0 +double_vector __dC5 0xbe5ae63546002231 +double_vector __dC6 0x3de60e6857a2f220 +double_vector __dC7 0xbd69f0d60811aac8 + +/* + Additional constants: + absolute value mask + */ +double_vector __dAbsMask_la 0x7fffffffffffffff + +/* 1/PI */ +double_vector __dInvPI_la 0x3fd45f306dc9c883 + +/* right-shifter for low accuracy version (2^52) */ +double_vector __dRShifter_la 0x4330000000000000 + +/* right-shifter - 0.5 for low accuracy version (2^52 - 0.5) */ +double_vector __dRShifterm5_la 0x432fffffffffffff + +/* right-shifter with low mask for low accuracy version */ +double_vector __dRXmax_la 0x43300000007ffffe + + .type __svml_dcos_data,@object + .size __svml_dcos_data,.-__svml_dcos_data diff --git a/sysdeps/x86_64/fpu/svml_d_cos_data.h b/sysdeps/x86_64/fpu/svml_d_cos_data.h new file mode 100644 index 0000000000..4d28e6eda5 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_cos_data.h @@ -0,0 +1,48 @@ +/* Offsets for data table for vectorized cos. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + .
*/ + +#ifndef D_COS_DATA_H +#define D_COS_DATA_H + +/* Byte offsets of the entries of __svml_dcos_data; consecutive entries + are 64 bytes apart. */ +#define __dAbsMask 0 +#define __dRangeVal 64 +#define __dHalfPI 128 +#define __dInvPI 192 +#define __dRShifter 256 +#define __dOneHalf 320 +#define __dPI1 384 +#define __dPI2 448 +#define __dPI3 512 +#define __dPI4 576 +#define __dPI1_FMA 640 +#define __dPI2_FMA 704 +#define __dPI3_FMA 768 +#define __dC1 832 +#define __dC2 896 +#define __dC3 960 +#define __dC4 1024 +#define __dC5 1088 +#define __dC6 1152 +#define __dC7 1216 +#define __dAbsMask_la 1280 +#define __dInvPI_la 1344 +#define __dRShifter_la 1408 +#define __dRShifterm5_la 1472 +#define __dRXmax_la 1536 + +#endif diff --git a/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h new file mode 100644 index 0000000000..4b2e9f5e80 --- /dev/null +++ b/sysdeps/x86_64/fpu/svml_d_wrapper_impl.h @@ -0,0 +1,101 @@ +/* Wrapper implementations of vector math functions. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +/* SSE2 ISA version as wrapper to scalar.
*/ +/* \callee is called once per double of %xmm0 and the two scalar results + are repacked into %xmm0. */ +.macro WRAPPER_IMPL_SSE2 callee + subq $40, %rsp + cfi_adjust_cfa_offset(40) + movaps %xmm0, (%rsp) + call \callee@PLT + movsd %xmm0, 16(%rsp) + movsd 8(%rsp), %xmm0 + call \callee@PLT + movsd 16(%rsp), %xmm1 + movsd %xmm0, 24(%rsp) + unpcklpd %xmm0, %xmm1 + movaps %xmm1, %xmm0 + addq $40, %rsp + cfi_adjust_cfa_offset(-40) + ret +.endm + +/* AVX/AVX2 ISA version as wrapper to SSE ISA version. + \callee handles the low 128 bits of %ymm0 first, then the high 128 bits + that were spilled to the stack; both results are merged into %ymm0. */ +.macro WRAPPER_IMPL_AVX callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-32, %rsp + subq $32, %rsp + vextractf128 $1, %ymm0, (%rsp) + vzeroupper + call HIDDEN_JUMPTARGET(\callee) + vmovapd %xmm0, 16(%rsp) + vmovaps (%rsp), %xmm0 + call HIDDEN_JUMPTARGET(\callee) + vmovapd %xmm0, %xmm1 + vmovapd 16(%rsp), %xmm0 + vinsertf128 $1, %xmm1, %ymm0, %ymm0 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm + +/* AVX512 ISA version as wrapper to AVX2 ISA version. + %zmm0 is spilled to a 64-byte aligned slot; \callee is applied to each + 256-bit half, every half result is written back over its (already + consumed) input, and the slot is reloaded into %zmm0 as the result. + The vector moves are emitted as raw bytes, presumably so the macro + still assembles without AVX-512 assembler support. */ +.macro WRAPPER_IMPL_AVX512 callee + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $64, %rsp +/* Below is encoding for vmovaps %zmm0, (%rsp). */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x29 + .byte 0x04 + .byte 0x24 +/* Below is encoding for vmovapd (%rsp), %ymm0. */ + .byte 0xc5 + .byte 0xfd + .byte 0x28 + .byte 0x04 + .byte 0x24 + call HIDDEN_JUMPTARGET(\callee) +/* Keep the low-half result: below is encoding for vmovapd %ymm0, (%rsp). */ + .byte 0xc5 + .byte 0xfd + .byte 0x29 + .byte 0x04 + .byte 0x24 +/* Below is encoding for vmovapd 32(%rsp), %ymm0.
*/ + .byte 0xc5 + .byte 0xfd + .byte 0x28 + .byte 0x44 + .byte 0x24 + .byte 0x20 + call HIDDEN_JUMPTARGET(\callee) +/* Keep the high-half result: below is encoding for + vmovapd %ymm0, 32(%rsp). */ + .byte 0xc5 + .byte 0xfd + .byte 0x29 + .byte 0x44 + .byte 0x24 + .byte 0x20 +/* Reload both halves: below is encoding for vmovaps (%rsp), %zmm0. */ + .byte 0x62 + .byte 0xf1 + .byte 0x7c + .byte 0x48 + .byte 0x28 + .byte 0x04 + .byte 0x24 + movq %rbp, %rsp + cfi_def_cfa_register (%rsp) + popq %rbp + cfi_adjust_cfa_offset (-8) + cfi_restore (%rbp) + ret +.endm diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h index e652171064..e79a3974fd 100644 --- a/sysdeps/x86_64/sysdep.h +++ b/sysdeps/x86_64/sysdep.h @@ -25,6 +25,13 @@ /* Syntactic details of assembler. */ +/* This macro is for setting proper CFI with DW_CFA_expression describing + the register as saved relative to %rsp instead of relative to the CFA. + Expression is DW_OP_drop, DW_OP_breg7 (%rsp is register 7), sleb128 offset + from %rsp. */ +#define cfi_offset_rel_rsp(regn, off) .cfi_escape 0x10, regn, 0x4, 0x13, \ + 0x77, off & 0x7F | 0x80, off >> 7 + /* ELF uses byte-counts for .align, most others use log2 of count of bytes. */ #define ALIGNARG(log2) 1<