This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch master updated. glibc-2.26-109-g24a2e65


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  24a2e6588d2e0c91b4003878b0625d4a9360e8f3 (commit)
      from  403143e1df85dadd374f304bd891be0cd7573e3b (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=24a2e6588d2e0c91b4003878b0625d4a9360e8f3

commit 24a2e6588d2e0c91b4003878b0625d4a9360e8f3
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Aug 16 08:43:35 2017 -0700

    x86-64: Optimize e_expf with FMA [BZ #21912]
    
    FMA optimized e_expf improves performance by more than 50% on Skylake.
    
    	[BZ #21912]
    	* sysdeps/x86_64/fpu/multiarch/Makefile (libm-sysdep_routines):
    	Add e_expf-fma.
    	* sysdeps/x86_64/fpu/multiarch/e_expf-fma.S: New file.
    	* sysdeps/x86_64/fpu/multiarch/e_expf.c: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/ifunc-fma.h: Likewise.

diff --git a/ChangeLog b/ChangeLog
index e308ee9..cb7547a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2017-08-16  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #21912]
+	* sysdeps/x86_64/fpu/multiarch/Makefile (libm-sysdep_routines):
+	Add e_expf-fma.
+	* sysdeps/x86_64/fpu/multiarch/e_expf-fma.S: New file.
+	* sysdeps/x86_64/fpu/multiarch/e_expf.c: Likewise.
+	* sysdeps/x86_64/fpu/multiarch/ifunc-fma.h: Likewise.
+
 2017-08-16  Andreas Schwab  <schwab@suse.de>
 
 	[BZ #16750]
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 9daf2cf..1b61795 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -35,6 +35,9 @@ CFLAGS-slowpow-fma.c = -mfma -mavx2
 CFLAGS-s_sin-fma.c = -mfma -mavx2
 CFLAGS-s_tan-fma.c = -mfma -mavx2
 
+# e_expf-fma.S implements both FMA and SSE2 versions of e_expf.
+libm-sysdep_routines += e_expf-fma
+
 libm-sysdep_routines += e_exp-fma4 e_log-fma4 e_pow-fma4 s_atan-fma4 \
 			e_asin-fma4 e_atan2-fma4 s_sin-fma4 s_tan-fma4 \
 			mplog-fma4 mpa-fma4 slowexp-fma4 slowpow-fma4 \
diff --git a/sysdeps/x86_64/fpu/multiarch/e_expf-fma.S b/sysdeps/x86_64/fpu/multiarch/e_expf-fma.S
new file mode 100644
index 0000000..43140de
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_expf-fma.S
@@ -0,0 +1,182 @@
+/* FMA/AVX2 version of IEEE 754 expf.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Short algorithm description:
+
+    Let K = 64 (table size).
+         e^x  = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
+    where
+         x = m*log(2)/K + y,    y in [0.0..log(2)/K]
+         m = n*K + j,           m,n,j - signed integer, j in [0..K-1]
+         values of 2^(j/K) are tabulated as T[j].
+
+         P(y) is a minimax polynomial approximation of expf(x)-1
+         on small interval [0.0..log(2)/K].
+
+         P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
+         z = y*y;    P(y) = (P3*z + P1)*z + (P2*z + P0)*y
+
+   Special cases:
+    expf(NaN) = NaN
+    expf(+INF) = +INF
+    expf(-INF) = 0
+    expf(x) = 1 for subnormals
+    for finite argument, only expf(0)=1 is exact
+    expf(x) overflows if x>88.7228317260742190
+    expf(x) underflows if x<-103.972076416015620
+ */
+
+	.section .text.fma,"ax",@progbits
+ENTRY(__ieee754_expf_fma)
+	/* Input: single precision x in %xmm0 */
+	vcvtss2sd %xmm0, %xmm0, %xmm1	/* Convert x to double precision */
+	vmovd	%xmm0, %ecx		/* Copy x */
+	vmovsd	L(DP_KLN2)(%rip), %xmm2	/* DP K/log(2) */
+	vfmadd213sd L(DP_RD)(%rip), %xmm1, %xmm2 /* DP x*K/log(2)+RD */
+	vmovsd	L(DP_P2)(%rip), %xmm3	/* DP P2 */
+	movl	%ecx, %eax		/* x */
+	andl	$0x7fffffff, %ecx	/* |x| */
+	lea	L(DP_T)(%rip), %rsi	/* address of table T[j] */
+	vmovsd	L(DP_P3)(%rip), %xmm4	/* DP P3 */
+
+	cmpl	$0x42ad496b, %ecx	/* |x|<125*log(2) ? */
+	jae	L(special_paths_fma)
+
+	/* Here if |x|<125*log(2) */
+	cmpl	$0x31800000, %ecx	/* |x|<2^(-28) ? */
+	jb	L(small_arg_fma)
+
+	/* Main path: here if 2^(-28)<=|x|<125*log(2) */
+						/* %xmm2 = SP x*K/log(2)+RS */
+	vmovd	  %xmm2, %eax
+	vsubsd	  L(DP_RD)(%rip), %xmm2, %xmm2 	/* DP t=round(x*K/log(2)) */
+	movl	  %eax, %edx			/* n*K+j with trash */
+	andl	  $0x3f, %eax			/* bits of j */
+	vmovsd	  (%rsi,%rax,8), %xmm5		/* T[j] */
+	andl	  $0xffffffc0, %edx		/* bits of n */
+
+	vfmadd132sd  L(DP_NLN2K)(%rip), %xmm1, %xmm2 /*  DP y=x-t*log(2)/K */
+	vmulsd	    %xmm2, %xmm2, %xmm6		/* DP z=y*y */
+
+
+	vfmadd213sd L(DP_P1)(%rip), %xmm6, %xmm4 /* DP P3*z + P1 */
+	vfmadd213sd L(DP_P0)(%rip), %xmm6, %xmm3 /* DP P2*z+P0 */
+
+	addl	    $0x1fc0, %edx		/* bits of n + SP exponent bias */
+	shll	    $17, %edx			/* SP 2^n */
+	vmovd       %edx, %xmm1			/* SP 2^n */
+
+	vmulsd      %xmm6, %xmm4, %xmm4		/* DP (P3*z+P1)*z */
+
+	vfmadd213sd %xmm4, %xmm3, %xmm2		/* DP P(Y)  (P2*z+P0)*y */
+	vfmadd213sd %xmm5, %xmm5, %xmm2		/* DP T[j]*(P(y)+1) */
+	vcvtsd2ss   %xmm2, %xmm2, %xmm0		/* SP T[j]*(P(y)+1) */
+	vmulss	    %xmm1, %xmm0, %xmm0		/* SP result=2^n*(T[j]*(P(y)+1)) */
+	ret
+
+	.p2align	4
+L(small_arg_fma):
+	/* Here if 0<=|x|<2^(-28) */
+	vaddss	L(SP_ONE)(%rip), %xmm0, %xmm0	/* 1.0 + x */
+	/* Return 1.0 with inexact raised, except for x==0 */
+	ret
+
+	.p2align	4
+L(special_paths_fma):
+	/* Here if 125*log(2)<=|x| */
+	shrl	$31, %eax		/* Get sign bit of x, and depending on it: */
+	lea	L(SP_RANGE)(%rip), %rdx	/* load over/underflow bound */
+	cmpl	(%rdx,%rax,4), %ecx	/* |x|<under/overflow bound ? */
+	jbe	L(near_under_or_overflow_fma)
+
+	/* Here if |x|>under/overflow bound */
+	cmpl	$0x7f800000, %ecx	/* |x| is finite ? */
+	jae	L(arg_inf_or_nan_fma)
+
+	/* Here if |x|>under/overflow bound, and x is finite */
+	testl	%eax, %eax		/* sign of x nonzero ? */
+	je	L(res_overflow_fma)
+
+	/* Here if -inf<x<underflow bound (x<0) */
+	vmovss	L(SP_SMALL)(%rip), %xmm0/* load small value 2^(-100) */
+	vmulss	%xmm0, %xmm0, %xmm0	/* Return underflowed result (zero or subnormal) */
+	ret
+
+	.p2align	4
+L(res_overflow_fma):
+	/* Here if overflow bound<x<inf (x>0) */
+	vmovss	L(SP_LARGE)(%rip), %xmm0/* load large value 2^100 */
+	vmulss	%xmm0, %xmm0, %xmm0	/* Return overflowed result (Inf or max normal) */
+	ret
+
+	.p2align	4
+L(arg_inf_or_nan_fma):
+	/* Here if |x| is Inf or NAN */
+	jne	L(arg_nan_fma)	/* |x| is Inf ? */
+
+	/* Here if |x| is Inf */
+	lea	L(SP_INF_0)(%rip), %rdx	/* depending on sign of x: */
+	vmovss	(%rdx,%rax,4), %xmm0	/* return zero or Inf */
+	ret
+
+	.p2align	4
+L(arg_nan_fma):
+	/* Here if |x| is NaN */
+	vaddss	%xmm0, %xmm0, %xmm0	/* Return x+x (raise invalid) */
+	ret
+
+	.p2align	4
+L(near_under_or_overflow_fma):
+	/* Here if 125*log(2)<=|x|<under/overflow bound */
+	vmovd	%xmm2, %eax		/* bits of n*K+j with trash */
+	vsubsd	L(DP_RD)(%rip), %xmm2, %xmm2 	/* DP t=round(x*K/log(2)) */
+	movl	%eax, %edx		/* n*K+j with trash */
+	andl	$0x3f, %eax		/* bits of j */
+	vmulsd	L(DP_NLN2K)(%rip),%xmm2, %xmm2/* DP -t*log(2)/K */
+	andl	$0xffffffc0, %edx	/* bits of n */
+	vaddsd	%xmm1, %xmm2, %xmm0	/* DP y=x-t*log(2)/K */
+	vmulsd	%xmm0, %xmm0, %xmm2	/* DP z=y*y */
+	addl	$0xffc0, %edx		/* bits of n + DP exponent bias */
+	vfmadd213sd L(DP_P0)(%rip), %xmm2, %xmm3/* DP P2*z+P0 */
+	shlq	$46, %rdx		/* DP 2^n */
+	vfmadd213sd L(DP_P1)(%rip), %xmm2, %xmm4/* DP P3*z+P1 */
+	vmovq	%rdx, %xmm1		/* DP 2^n */
+	vmulsd	%xmm2, %xmm4, %xmm4	/* DP (P3*z+P1)*z */
+	vfmadd213sd %xmm4, %xmm3, %xmm0	/* DP (P2*z+P0)*y */
+	vmovsd	(%rsi,%rax,8), %xmm2
+	vfmadd213sd %xmm2, %xmm2, %xmm0 /* DP T[j]*(P(y)+1) */
+	vmulsd	%xmm1, %xmm0, %xmm0	/* DP result=2^n*(T[j]*(P(y)+1)) */
+	vcvtsd2ss %xmm0, %xmm0, %xmm0	/* convert result to single precision */
+	ret
+END(__ieee754_expf_fma)
+
+	.section .rodata.cst8,"aM",@progbits,8
+	.p2align 3
+L(DP_RD): /* double precision 2^52+2^51 */
+	.long	0x00000000, 0x43380000
+	.type L(DP_RD), @object
+	ASM_SIZE_DIRECTIVE(L(DP_RD))
+
+#define __ieee754_expf __ieee754_expf_sse2
+
+#undef strong_alias
+#define strong_alias(ignored1, ignored2)
+
+#include <sysdeps/x86_64/fpu/e_expf.S>
diff --git a/sysdeps/x86_64/fpu/multiarch/e_expf.c b/sysdeps/x86_64/fpu/multiarch/e_expf.c
new file mode 100644
index 0000000..864d5f9
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_expf.c
@@ -0,0 +1,26 @@
+/* Multiple versions of IEEE 754 expf.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+extern double __redirect_ieee754_expf (double);
+
+#define SYMBOL_NAME ieee754_expf
+#include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_ieee754_expf, __ieee754_expf,
+		       IFUNC_SELECTOR ());
+strong_alias (__ieee754_expf, __expf_finite)
diff --git a/sysdeps/x86_64/fpu/multiarch/ifunc-fma.h b/sysdeps/x86_64/fpu/multiarch/ifunc-fma.h
new file mode 100644
index 0000000..383c41f
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/ifunc-fma.h
@@ -0,0 +1,34 @@
+/* Common definition for ifunc selections optimized with AVX2/FMA.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (fma) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (CPU_FEATURES_ARCH_P (cpu_features, FMA_Usable)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable))
+    return OPTIMIZE (fma);
+
+  return OPTIMIZE (sse2);
+}

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                                 |    9 ++
 sysdeps/x86_64/fpu/multiarch/Makefile     |    3 +
 sysdeps/x86_64/fpu/multiarch/e_expf-fma.S |  182 +++++++++++++++++++++++++++++
 sysdeps/x86_64/fpu/multiarch/e_expf.c     |   26 ++++
 sysdeps/x86_64/fpu/multiarch/ifunc-fma.h  |   34 ++++++
 5 files changed, 254 insertions(+), 0 deletions(-)
 create mode 100644 sysdeps/x86_64/fpu/multiarch/e_expf-fma.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/e_expf.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/ifunc-fma.h


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]