diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h
index 8a94a7e..f7e5e39 100644
--- a/math/bits/mathcalls.h
+++ b/math/bits/mathcalls.h
@@ -46,6 +46,17 @@
 # error "Never include <bits/mathcalls.h> directly; include <math.h> instead."
 #endif
 
+#undef __DECL_SIMD
+
+/* For now we have vectorized version only for _Mdouble_ case; expand to
+   nothing otherwise so that the annotated declarations still parse.  */
+#if !defined _Mfloat_ && !defined _Mlong_double_ \
+    && defined _OPENMP && _OPENMP >= 201307
+# define __DECL_SIMD _Pragma ("omp declare simd")
+#else
+# define __DECL_SIMD
+#endif
+
 
 /* Trigonometric functions.  */
 
@@ -60,6 +71,7 @@ __MATHCALL (atan,, (_Mdouble_ __x));
 __MATHCALL (atan2,, (_Mdouble_ __y, _Mdouble_ __x));
 
 /* Cosine of X.  */
+__DECL_SIMD
 __MATHCALL (cos,, (_Mdouble_ __x));
 /* Sine of X.  */
 __MATHCALL (sin,, (_Mdouble_ __x));
diff --git a/sysdeps/unix/sysv/linux/x86_64/64/libm.abilist b/sysdeps/unix/sysv/linux/x86_64/64/libm.abilist
index 2390934..bb791ea 100644
--- a/sysdeps/unix/sysv/linux/x86_64/64/libm.abilist
+++ b/sysdeps/unix/sysv/linux/x86_64/64/libm.abilist
@@ -402,5 +402,8 @@ GLIBC_2.2.5
  yn F
  ynf F
  ynl F
+GLIBC_2.21
+ GLIBC_2.21 A
+ _ZGVdN4v_cos F
 GLIBC_2.4
  GLIBC_2.4 A
diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
new file mode 100644
index 0000000..1cb3ec5
--- /dev/null
+++ b/sysdeps/x86_64/fpu/Makefile
@@ -0,0 +1,3 @@
+ifeq ($(subdir),math)
+libm-support += svml_d_cos4_core svml_d_cos_data
+endif
diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions
new file mode 100644
index 0000000..1717a7a
--- /dev/null
+++ b/sysdeps/x86_64/fpu/Versions
@@ -0,0 +1,7 @@
+libm {
+  GLIBC_2.21 {
+    # AVX2 vectorized cos operating on four packed doubles; implemented
+    # in svml_d_cos4_core.S.
+    _ZGVdN4v_cos;
+  }
+}
diff --git a/sysdeps/x86_64/fpu/svml_d_cos4_core.S b/sysdeps/x86_64/fpu/svml_d_cos4_core.S
new file mode 100644
index 0000000..8334875
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_d_cos4_core.S
@@ -0,0 +1,185 @@
+/* Function cos vectorized with AVX2.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY(_ZGVdN4v_cos)
+
+/* ALGORITHM DESCRIPTION:
+ *   In: %ymm0 = four packed doubles X.  Out: %ymm0 = cos (X) per lane.
+ *   ( low accuracy ( < 4ulp ) or enhanced performance ( half of correct mantissa ) implementation )
+ *
+ *   Argument representation:
+ *   arg + Pi/2 = (N*Pi + R)
+ *
+ *   Result calculation:
+ *   cos(arg) = sin(arg+Pi/2) = sin(N*Pi + R) = (-1)^N * sin(R)
+ *   sin(R) is approximated by corresponding polynomial
+ */
+	pushq	%rbp				/* Frame pointer keeps the entry %rsp.  */
+	movq	%rsp, %rbp
+	andq	$-64, %rsp			/* Align the scratch area to 64 bytes.  */
+	subq	$448, %rsp			/* 448 bytes of scratch (layout below).  */
+	movq	__gnu_svml_dcos_data@GOTPCREL(%rip), %rax	/* rax = constant table.  */
+	vmovapd	%ymm0, %ymm1			/* ymm1 = X, kept for reduction/fallback.  */
+	vmovupd	192(%rax), %ymm4		/* ymm4 = InvPi.  */
+	vmovupd	256(%rax), %ymm5		/* ymm5 = RS (right shifter).  */
+
+/* ARGUMENT RANGE REDUCTION:
+ * Add Pi/2 to argument: X' = X+Pi/2
+ */
+	vaddpd	128(%rax), %ymm1, %ymm7
+
+/* Get absolute argument value: X' = |X'| */
+	vandpd	(%rax), %ymm7, %ymm2
+
+/* Y = X'*InvPi + RS : right shifter add */
+	vfmadd213pd %ymm5, %ymm4, %ymm7
+	vmovupd	1216(%rax), %ymm4		/* ymm4 = first Horner coefficient.  */
+
+/* Check for large arguments path */
+	vcmpnle_uqpd 64(%rax), %ymm2, %ymm3	/* Set where !(|X'| <= limit), NaN included.  */
+
+/* N = Y - RS : right shifter sub */
+	vsubpd	%ymm5, %ymm7, %ymm6
+	vmovupd	640(%rax), %ymm2		/* ymm2 = Pi1.  */
+
+/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
+	vpsllq	$63, %ymm7, %ymm7
+
+/* N = N - 0.5 */
+	vsubpd	320(%rax), %ymm6, %ymm0
+	vmovmskpd %ymm3, %ecx			/* ecx bit i set = lane i needs scalar cos.  */
+
+/* R = X - N*Pi1 */
+	vmovapd	%ymm1, %ymm3
+	vfnmadd231pd %ymm0, %ymm2, %ymm3
+
+/* R = R - N*Pi2 */
+	vfnmadd231pd 704(%rax), %ymm0, %ymm3
+
+/* R = R - N*Pi3 */
+	vfnmadd132pd 768(%rax), %ymm3, %ymm0	/* ymm0 = R (N is overwritten).  */
+
+/* POLYNOMIAL APPROXIMATION:
+ * R2 = R*R
+ */
+	vmulpd	%ymm0, %ymm0, %ymm5
+	vfmadd213pd 1152(%rax), %ymm5, %ymm4	/* Each step: ymm4 = ymm4*R2 + coeff.  */
+	vfmadd213pd 1088(%rax), %ymm5, %ymm4
+	vfmadd213pd 1024(%rax), %ymm5, %ymm4
+
+/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
+	vfmadd213pd 960(%rax), %ymm5, %ymm4
+	vfmadd213pd 896(%rax), %ymm5, %ymm4
+	vfmadd213pd 832(%rax), %ymm5, %ymm4
+	vmulpd	%ymm5, %ymm4, %ymm6		/* ymm6 = R2*Poly.  */
+	vfmadd213pd %ymm0, %ymm0, %ymm6		/* ymm6 = R + R*(R2*Poly).  */
+
+/* RECONSTRUCTION:
+ * Final sign setting: Res = Poly^SignRes
+ */
+	vxorpd	%ymm7, %ymm6, %ymm0
+	testl	%ecx, %ecx
+	jne	_LBL_1_3			/* Some lane needs the scalar fallback.  */
+
+_LBL_1_2:
+	movq	%rbp, %rsp			/* Common exit: drop the aligned scratch area.  */
+	popq	%rbp
+	ret
+
+_LBL_1_3:
+	vmovupd	%ymm1, 320(%rsp)		/* 320..351(%rsp) = the four inputs.  */
+	vmovupd	%ymm0, 384(%rsp)		/* 384..415(%rsp) = fast-path results.  */
+	je	_LBL_1_2			/* Never taken: ZF is still clear from the jne above.  */
+
+	xorb	%dl, %dl
+	xorl	%eax, %eax
+	vmovups	%ymm8, 224(%rsp)		/* Preserve ymm8-ymm15 across the scalar calls.  */
+	vmovups	%ymm9, 192(%rsp)
+	vmovups	%ymm10, 160(%rsp)
+	vmovups	%ymm11, 128(%rsp)
+	vmovups	%ymm12, 96(%rsp)
+	vmovups	%ymm13, 64(%rsp)
+	vmovups	%ymm14, 32(%rsp)
+	vmovups	%ymm15, (%rsp)
+	movq	%rsi, 264(%rsp)
+	movq	%rdi, 256(%rsp)
+	movq	%r12, 296(%rsp)
+	movb	%dl, %r12b			/* r12b = lane-pair counter, starts at 0.  */
+	movq	%r13, 288(%rsp)
+	movl	%ecx, %r13d			/* r13d = fallback lane mask.  */
+	movq	%r14, 280(%rsp)
+	movl	%eax, %r14d			/* r14d = bit index of the pair's even lane.  */
+	movq	%r15, 272(%rsp)
+
+_LBL_1_6:
+	btl	%r14d, %r13d			/* Even lane of this pair flagged?  */
+	jc	_LBL_1_12
+
+_LBL_1_7:
+	lea	1(%r14), %esi
+	btl	%esi, %r13d			/* Odd lane of this pair flagged?  */
+	jc	_LBL_1_10
+
+_LBL_1_8:
+	incb	%r12b
+	addl	$2, %r14d
+	cmpb	$16, %r12b			/* NOTE(review): mask has 4 bits; pairs 2..15 are always clear.  */
+	jb	_LBL_1_6
+
+	vmovups	224(%rsp), %ymm8		/* Restore everything saved at _LBL_1_3.  */
+	vmovups	192(%rsp), %ymm9
+	vmovups	160(%rsp), %ymm10
+	vmovups	128(%rsp), %ymm11
+	vmovups	96(%rsp), %ymm12
+	vmovups	64(%rsp), %ymm13
+	vmovups	32(%rsp), %ymm14
+	vmovups	(%rsp), %ymm15
+	vmovupd	384(%rsp), %ymm0		/* Reload results, including patched lanes.  */
+	movq	264(%rsp), %rsi
+	movq	256(%rsp), %rdi
+	movq	296(%rsp), %r12
+	movq	288(%rsp), %r13
+	movq	280(%rsp), %r14
+	movq	272(%rsp), %r15
+	jmp	_LBL_1_2
+
+_LBL_1_10:
+	movzbl	%r12b, %r15d
+	shlq	$4, %r15			/* r15 = byte offset of this pair (16 per pair).  */
+	vmovsd	328(%rsp,%r15), %xmm0		/* Odd lane input: 320 + 8.  */
+	vzeroupper
+
+	call	__cos@PLT
+
+	vmovsd	%xmm0, 392(%rsp,%r15)		/* Overwrite the odd lane result: 384 + 8.  */
+	jmp	_LBL_1_8
+
+_LBL_1_12:
+	movzbl	%r12b, %r15d
+	shlq	$4, %r15			/* r15 = byte offset of this pair (16 per pair).  */
+	vmovsd	320(%rsp,%r15), %xmm0		/* Even lane input.  */
+	vzeroupper
+
+	call	__cos@PLT
+
+	vmovsd	%xmm0, 384(%rsp,%r15)		/* Overwrite the even lane result.  */
+	jmp	_LBL_1_7
+END(_ZGVdN4v_cos)
diff --git a/sysdeps/x86_64/fpu/svml_d_cos_data.S b/sysdeps/x86_64/fpu/svml_d_cos_data.S
new file mode 100644
index 0000000..7bb1aba
--- /dev/null
+++ b/sysdeps/x86_64/fpu/svml_d_cos_data.S
@@ -0,0 +1,63 @@
+/* Data for vectorized cos.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Emit one 64-byte table entry: the double whose low/high 32-bit halves
+   are LO/HI, replicated eight times.  OFFSET is the byte displacement the
+   entry must land at; assembly fails if the table layout drifts from it.  */
+	.macro double_vector offset lo hi
+	.if . - __gnu_svml_dcos_data != \offset
+	.err
+	.endif
+	.rept 8
+	.long \lo
+	.long \hi
+	.endr
+	.endm
+
+	.section .rodata, "a"
+
+	.align 64
+	.globl __gnu_svml_dcos_data
+__gnu_svml_dcos_data:
+	double_vector 0, 4294967295, 2147483647		/* Mask used to take |X'|.  */
+	double_vector 64, 0, 1096810496			/* Limit for the large-arguments path.  */
+	double_vector 128, 1413754136, 1073291771	/* Pi/2.  */
+	double_vector 192, 1841940611, 1070882608	/* InvPi.  */
+	double_vector 256, 0, 1127743488		/* RS, the right shifter.  */
+	double_vector 320, 0, 1071644672		/* 0.5.  */
+	double_vector 384, 1073741824, 1074340347	/* Not referenced by _ZGVdN4v_cos.  */
+	double_vector 448, 0, 1048855597		/* Not referenced by _ZGVdN4v_cos.  */
+	double_vector 512, 2147483648, 1023952536	/* Not referenced by _ZGVdN4v_cos.  */
+	double_vector 576, 1880851354, 998820945	/* Not referenced by _ZGVdN4v_cos.  */
+	double_vector 640, 1413754136, 1074340347	/* Pi1.  */
+	double_vector 704, 856972294, 1017226790	/* Pi2.  */
+	double_vector 768, 688016905, 962338001		/* Pi3.  */
+	double_vector 832, 1431655591, 3217380693	/* Poly coefficient, last Horner step.  */
+	double_vector 896, 286303400, 1065423121	/* Poly coefficient.  */
+	double_vector 960, 430291053, 3207201184	/* Poly coefficient.  */
+	double_vector 1024, 2150694560, 1053236707	/* Poly coefficient.  */
+	double_vector 1088, 1174413873, 3193628213	/* Poly coefficient.  */
+	double_vector 1152, 1470296608, 1038487144	/* Poly coefficient.  */
+	double_vector 1216, 135375560, 3177836758	/* Poly coefficient, first Horner step.  */
+	double_vector 1280, 4294967295, 2147483647	/* Not referenced by _ZGVdN4v_cos.  */
+	double_vector 1344, 1841940611, 1070882608	/* Not referenced by _ZGVdN4v_cos.  */
+	double_vector 1408, 0, 1127219200		/* Not referenced by _ZGVdN4v_cos.  */
+	double_vector 1472, 4294967295, 1127219199	/* Not referenced by _ZGVdN4v_cos.  */
+	double_vector 1536, 8388606, 1127219200		/* Not referenced by _ZGVdN4v_cos.  */
+	.type	__gnu_svml_dcos_data,@object
+	.size __gnu_svml_dcos_data,1600