This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH 10/10] i386: Replace assembly versions of e_powf with generic e_powf.c
On Thu, Oct 19, 2017 at 10:31 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> This patch replaces i386 assembly versions of e_powf with generic
> e_powf.c. For workload-spec2017.wrf, on Nehalem, it improves
> performance by:
>
>                         Before     After      Improvement
> reciprocal-throughput   230.855    78.3358    194%
> latency                 231.685    94.1259    146%
>
> On Skylake, it improves performance by:
>
>                         Before     After      Improvement
> reciprocal-throughput   239.858    47.4713    405%
> latency                 247.57     93.8798    163%
On IvyBridge with --disable-multi-arch, it improves performance by:
                        Before     After      Improvement
reciprocal-throughput   269.078    63.3758    324%
latency                 271.473    102.091    165%
> * sysdeps/i386/fpu/e_powf.S: Removed.
> * sysdeps/i386/fpu/e_powf_log2_data.c: Likewise.
> * sysdeps/i386/fpu/w_powf.c: Likewise.
> * sysdeps/i386/fpu/libm-test-ulps: Updated for generic e_powf.c.
> * sysdeps/i386/i686/fpu/multiarch/libm-test-ulps: Likewise.
> * sysdeps/i386/i686/fpu/multiarch/Makefile (libm-sysdep_routines):
> Add e_powf-sse2.
> (CFLAGS-e_powf-sse2.c): New.
> * sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c: New file.
> * sysdeps/i386/i686/fpu/multiarch/e_powf.c: Likewise.
> ---
> sysdeps/i386/fpu/e_powf.S | 392 -------------------------
> sysdeps/i386/fpu/e_powf_log2_data.c | 1 -
> sysdeps/i386/fpu/libm-test-ulps | 6 +
> sysdeps/i386/fpu/w_powf.c | 1 -
> sysdeps/i386/i686/fpu/multiarch/Makefile | 3 +-
> sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c | 3 +
> sysdeps/i386/i686/fpu/multiarch/e_powf.c | 43 +++
> sysdeps/i386/i686/fpu/multiarch/libm-test-ulps | 18 +-
> 8 files changed, 66 insertions(+), 401 deletions(-)
> delete mode 100644 sysdeps/i386/fpu/e_powf.S
> delete mode 100644 sysdeps/i386/fpu/e_powf_log2_data.c
> delete mode 100644 sysdeps/i386/fpu/w_powf.c
> create mode 100644 sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c
> create mode 100644 sysdeps/i386/i686/fpu/multiarch/e_powf.c
>
> diff --git a/sysdeps/i386/fpu/e_powf.S b/sysdeps/i386/fpu/e_powf.S
> deleted file mode 100644
> index 467ef2380b..0000000000
> --- a/sysdeps/i386/fpu/e_powf.S
> +++ /dev/null
> @@ -1,392 +0,0 @@
> -/* ix87 specific implementation of pow function.
> - Copyright (C) 1996-2017 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> - Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <http://www.gnu.org/licenses/>. */
> -
> -#include <machine/asm.h>
> -#include <i386-math-asm.h>
> -
> - .section .rodata.cst8,"aM",@progbits,8
> -
> - .p2align 3
> - .type one,@object
> -one: .double 1.0
> - ASM_SIZE_DIRECTIVE(one)
> - .type limit,@object
> -limit: .double 0.29
> - ASM_SIZE_DIRECTIVE(limit)
> - .type p31,@object
> -p31: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41
> - ASM_SIZE_DIRECTIVE(p31)
> -
> - .section .rodata.cst16,"aM",@progbits,16
> -
> - .p2align 3
> - .type infinity,@object
> -inf_zero:
> -infinity:
> - .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f
> - ASM_SIZE_DIRECTIVE(infinity)
> - .type zero,@object
> -zero: .double 0.0
> - ASM_SIZE_DIRECTIVE(zero)
> - .type minf_mzero,@object
> -minf_mzero:
> -minfinity:
> - .byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff
> -mzero:
> - .byte 0, 0, 0, 0, 0, 0, 0, 0x80
> - ASM_SIZE_DIRECTIVE(minf_mzero)
> -DEFINE_FLT_MIN
> -
> -#ifdef PIC
> -# define MO(op) op##@GOTOFF(%ecx)
> -# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f)
> -#else
> -# define MO(op) op
> -# define MOX(op,x,f) op(,x,f)
> -#endif
> -
> - .text
> -ENTRY(__ieee754_powf)
> - flds 8(%esp) // y
> - fxam
> -
> -#ifdef PIC
> - LOAD_PIC_REG (cx)
> -#endif
> -
> - fnstsw
> - movb %ah, %dl
> - andb $0x45, %ah
> - cmpb $0x40, %ah // is y == 0 ?
> - je 11f
> -
> - cmpb $0x05, %ah // is y == ±inf ?
> - je 12f
> -
> - cmpb $0x01, %ah // is y == NaN ?
> - je 30f
> -
> - flds 4(%esp) // x : y
> -
> - subl $4, %esp
> - cfi_adjust_cfa_offset (4)
> -
> - fxam
> - fnstsw
> - movb %ah, %dh
> - andb $0x45, %ah
> - cmpb $0x40, %ah
> - je 20f // x is ±0
> -
> - cmpb $0x05, %ah
> - je 15f // x is ±inf
> -
> - cmpb $0x01, %ah
> - je 33f // x is NaN
> -
> - fxch // y : x
> -
> - /* fistpl raises invalid exception for |y| >= 1L<<31. */
> - fld %st // y : y : x
> - fabs // |y| : y : x
> - fcompl MO(p31) // y : x
> - fnstsw
> - sahf
> - jnc 2f
> -
> - /* First see whether `y' is a natural number. In this case we
> - can use a more precise algorithm. */
> - fld %st // y : y : x
> - fistpl (%esp) // y : x
> - fildl (%esp) // int(y) : y : x
> - fucomp %st(1) // y : x
> - fnstsw
> - sahf
> - jne 3f
> -
> - /* OK, we have an integer value for y. */
> - popl %edx
> - cfi_adjust_cfa_offset (-4)
> - orl $0, %edx
> - fstp %st(0) // x
> - jns 4f // y >= 0, jump
> - fdivrl MO(one) // 1/x (now referred to as x)
> - negl %edx
> -4: fldl MO(one) // 1 : x
> - fxch
> -
> - /* If y is even, take the absolute value of x. Otherwise,
> - ensure all intermediate values that might overflow have the
> - sign of x. */
> - testb $1, %dl
> - jnz 6f
> - fabs
> -
> -6: shrl $1, %edx
> - jnc 5f
> - fxch
> - fabs
> - fmul %st(1) // x : ST*x
> - fxch
> -5: fld %st // x : x : ST*x
> - fabs // |x| : x : ST*x
> - fmulp // |x|*x : ST*x
> - testl %edx, %edx
> - jnz 6b
> - fstp %st(0) // ST*x
> - FLT_NARROW_EVAL_UFLOW_NONNAN
> - ret
> -
> - /* y is ±NAN */
> -30: flds 4(%esp) // x : y
> - fldl MO(one) // 1.0 : x : y
> - fucomp %st(1) // x : y
> - fnstsw
> - sahf
> - je 31f
> - fxch // y : x
> -31: fstp %st(1)
> - ret
> -
> - cfi_adjust_cfa_offset (4)
> - .align ALIGNARG(4)
> -2: /* y is a large integer (so even). */
> - fxch // x : y
> - fabs // |x| : y
> - fxch // y : x
> - .align ALIGNARG(4)
> -3: /* y is a real number. */
> - fxch // x : y
> - fldl MO(one) // 1.0 : x : y
> - fldl MO(limit) // 0.29 : 1.0 : x : y
> - fld %st(2) // x : 0.29 : 1.0 : x : y
> - fsub %st(2) // x-1 : 0.29 : 1.0 : x : y
> - fabs // |x-1| : 0.29 : 1.0 : x : y
> - fucompp // 1.0 : x : y
> - fnstsw
> - fxch // x : 1.0 : y
> - sahf
> - ja 7f
> - fsub %st(1) // x-1 : 1.0 : y
> - fyl2xp1 // log2(x) : y
> - jmp 8f
> -
> -7: fyl2x // log2(x) : y
> -8: fmul %st(1) // y*log2(x) : y
> - fst %st(1) // y*log2(x) : y*log2(x)
> - frndint // int(y*log2(x)) : y*log2(x)
> - fsubr %st, %st(1) // int(y*log2(x)) : fract(y*log2(x))
> - fxch // fract(y*log2(x)) : int(y*log2(x))
> - f2xm1 // 2^fract(y*log2(x))-1 : int(y*log2(x))
> - faddl MO(one) // 2^fract(y*log2(x)) : int(y*log2(x))
> - fscale // 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x))
> -32: addl $4, %esp
> - cfi_adjust_cfa_offset (-4)
> - fstp %st(1) // 2^fract(y*log2(x))*2^int(y*log2(x))
> - FLT_NARROW_EVAL_UFLOW_NONNAN
> - ret
> -
> - /* x is NaN. */
> - cfi_adjust_cfa_offset (4)
> -33: addl $4, %esp
> - cfi_adjust_cfa_offset (-4)
> - fstp %st(1)
> - ret
> -
> - // pow(x,±0) = 1
> - .align ALIGNARG(4)
> -11: fstp %st(0) // pop y
> - fldl MO(one)
> - ret
> -
> - // y == ±inf
> - .align ALIGNARG(4)
> -12: fstp %st(0) // pop y
> - fldl MO(one) // 1
> - flds 4(%esp) // x : 1
> - fabs // abs(x) : 1
> - fucompp // < 1, == 1, or > 1
> - fnstsw
> - andb $0x45, %ah
> - cmpb $0x45, %ah
> - je 13f // jump if x is NaN
> -
> - cmpb $0x40, %ah
> - je 14f // jump if |x| == 1
> -
> - shlb $1, %ah
> - xorb %ah, %dl
> - andl $2, %edx
> - fldl MOX(inf_zero, %edx, 4)
> - ret
> -
> - .align ALIGNARG(4)
> -14: fldl MO(one)
> - ret
> -
> - .align ALIGNARG(4)
> -13: flds 4(%esp) // load x == NaN
> - ret
> -
> - cfi_adjust_cfa_offset (4)
> - .align ALIGNARG(4)
> - // x is ±inf
> -15: fstp %st(0) // y
> - testb $2, %dh
> - jz 16f // jump if x == +inf
> -
> - // fistpl raises invalid exception for |y| >= 1L<<31, so test
> - // that (in which case y is certainly even) before testing
> - // whether y is odd.
> - fld %st // y : y
> - fabs // |y| : y
> - fcompl MO(p31) // y
> - fnstsw
> - sahf
> - jnc 16f
> -
> - // We must find out whether y is an odd integer.
> - fld %st // y : y
> - fistpl (%esp) // y
> - fildl (%esp) // int(y) : y
> - fucompp // <empty>
> - fnstsw
> - sahf
> - jne 17f
> -
> - // OK, the value is an integer.
> - popl %edx
> - cfi_adjust_cfa_offset (-4)
> - testb $1, %dl
> - jz 18f // jump if not odd
> - // It's an odd integer.
> - shrl $31, %edx
> - fldl MOX(minf_mzero, %edx, 8)
> - ret
> -
> - cfi_adjust_cfa_offset (4)
> - .align ALIGNARG(4)
> -16: fcompl MO(zero)
> - addl $4, %esp
> - cfi_adjust_cfa_offset (-4)
> - fnstsw
> - shrl $5, %eax
> - andl $8, %eax
> - fldl MOX(inf_zero, %eax, 1)
> - ret
> -
> - cfi_adjust_cfa_offset (4)
> - .align ALIGNARG(4)
> -17: shll $30, %edx // sign bit for y in right position
> - addl $4, %esp
> - cfi_adjust_cfa_offset (-4)
> -18: shrl $31, %edx
> - fldl MOX(inf_zero, %edx, 8)
> - ret
> -
> - cfi_adjust_cfa_offset (4)
> - .align ALIGNARG(4)
> - // x is ±0
> -20: fstp %st(0) // y
> - testb $2, %dl
> - jz 21f // y > 0
> -
> - // x is ±0 and y is < 0. We must find out whether y is an odd integer.
> - testb $2, %dh
> - jz 25f
> -
> - // fistpl raises invalid exception for |y| >= 1L<<31, so test
> - // that (in which case y is certainly even) before testing
> - // whether y is odd.
> - fld %st // y : y
> - fabs // |y| : y
> - fcompl MO(p31) // y
> - fnstsw
> - sahf
> - jnc 25f
> -
> - fld %st // y : y
> - fistpl (%esp) // y
> - fildl (%esp) // int(y) : y
> - fucompp // <empty>
> - fnstsw
> - sahf
> - jne 26f
> -
> - // OK, the value is an integer.
> - popl %edx
> - cfi_adjust_cfa_offset (-4)
> - testb $1, %dl
> - jz 27f // jump if not odd
> - // It's an odd integer.
> - // Raise divide-by-zero exception and get minus infinity value.
> - fldl MO(one)
> - fdivl MO(zero)
> - fchs
> - ret
> -
> - cfi_adjust_cfa_offset (4)
> -25: fstp %st(0)
> -26: addl $4, %esp
> - cfi_adjust_cfa_offset (-4)
> -27: // Raise divide-by-zero exception and get infinity value.
> - fldl MO(one)
> - fdivl MO(zero)
> - ret
> -
> - cfi_adjust_cfa_offset (4)
> - .align ALIGNARG(4)
> - // x is ±0 and y is > 0. We must find out whether y is an odd integer.
> -21: testb $2, %dh
> - jz 22f
> -
> - // fistpl raises invalid exception for |y| >= 1L<<31, so test
> - // that (in which case y is certainly even) before testing
> - // whether y is odd.
> - fcoml MO(p31) // y
> - fnstsw
> - sahf
> - jnc 22f
> -
> - fld %st // y : y
> - fistpl (%esp) // y
> - fildl (%esp) // int(y) : y
> - fucompp // <empty>
> - fnstsw
> - sahf
> - jne 23f
> -
> - // OK, the value is an integer.
> - popl %edx
> - cfi_adjust_cfa_offset (-4)
> - testb $1, %dl
> - jz 24f // jump if not odd
> - // It's an odd integer.
> - fldl MO(mzero)
> - ret
> -
> - cfi_adjust_cfa_offset (4)
> -22: fstp %st(0)
> -23: addl $4, %esp // Don't use pop.
> - cfi_adjust_cfa_offset (-4)
> -24: fldl MO(zero)
> - ret
> -
> -END(__ieee754_powf)
> -strong_alias (__ieee754_powf, __powf_finite)
> diff --git a/sysdeps/i386/fpu/e_powf_log2_data.c b/sysdeps/i386/fpu/e_powf_log2_data.c
> deleted file mode 100644
> index 1cc8931700..0000000000
> --- a/sysdeps/i386/fpu/e_powf_log2_data.c
> +++ /dev/null
> @@ -1 +0,0 @@
> -/* Not needed. */
> diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
> index 5318f61146..83b3b8653a 100644
> --- a/sysdeps/i386/fpu/libm-test-ulps
> +++ b/sysdeps/i386/fpu/libm-test-ulps
> @@ -2370,24 +2370,30 @@ ldouble: 1
>
> Function: "pow_downward":
> double: 1
> +float: 1
> float128: 2
> idouble: 1
> +ifloat: 1
> ifloat128: 2
> ildouble: 4
> ldouble: 4
>
> Function: "pow_towardzero":
> double: 1
> +float: 1
> float128: 2
> idouble: 1
> +ifloat: 1
> ifloat128: 2
> ildouble: 4
> ldouble: 4
>
> Function: "pow_upward":
> double: 1
> +float: 1
> float128: 2
> idouble: 1
> +ifloat: 1
> ifloat128: 2
> ildouble: 4
> ldouble: 4
> diff --git a/sysdeps/i386/fpu/w_powf.c b/sysdeps/i386/fpu/w_powf.c
> deleted file mode 100644
> index d133216f5b..0000000000
> --- a/sysdeps/i386/fpu/w_powf.c
> +++ /dev/null
> @@ -1 +0,0 @@
> -#include <sysdeps/../math/w_powf.c>
> diff --git a/sysdeps/i386/i686/fpu/multiarch/Makefile b/sysdeps/i386/i686/fpu/multiarch/Makefile
> index eee3b8b1fd..c0fa9761d3 100644
> --- a/sysdeps/i386/i686/fpu/multiarch/Makefile
> +++ b/sysdeps/i386/i686/fpu/multiarch/Makefile
> @@ -1,9 +1,10 @@
> ifeq ($(subdir),math)
> libm-sysdep_routines += e_exp2f-sse2 e_expf-sse2 e_logf-sse2 e_log2f-sse2 \
> - s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
> + e_powf-sse2 s_sinf-sse2 s_cosf-sse2 s_sincosf-sse2
>
> CFLAGS-e_exp2f-sse2.c = -msse2 -mfpmath=sse
> CFLAGS-e_expf-sse2.c = -msse2 -mfpmath=sse
> CFLAGS-e_log2f-sse2.c = -msse2 -mfpmath=sse
> CFLAGS-e_logf-sse2.c = -msse2 -mfpmath=sse
> +CFLAGS-e_powf-sse2.c = -msse2 -mfpmath=sse
> endif
> diff --git a/sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c b/sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c
> new file mode 100644
> index 0000000000..c56f6ee89f
> --- /dev/null
> +++ b/sysdeps/i386/i686/fpu/multiarch/e_powf-sse2.c
> @@ -0,0 +1,3 @@
> +#define __powf __powf_sse2
> +
> +#include <sysdeps/ieee754/flt-32/e_powf.c>
> diff --git a/sysdeps/i386/i686/fpu/multiarch/e_powf.c b/sysdeps/i386/i686/fpu/multiarch/e_powf.c
> new file mode 100644
> index 0000000000..4dc4c87326
> --- /dev/null
> +++ b/sysdeps/i386/i686/fpu/multiarch/e_powf.c
> @@ -0,0 +1,43 @@
> +/* Multiple versions of powf.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#define powf __redirect_powf
> +#define __DECL_SIMD___redirect_powf
> +#include <math.h>
> +#undef powf
> +
> +#define SYMBOL_NAME powf
> +#include "ifunc-sse2.h"
> +
> +libc_ifunc_redirected (__redirect_powf, __powf, IFUNC_SELECTOR ());
> +
> +#ifdef SHARED
> +__hidden_ver1 (__powf_ia32, __GI___powf, __redirect_powf)
> + __attribute__ ((visibility ("hidden")));
> +
> +# include <shlib-compat.h>
> +versioned_symbol (libm, __powf, powf, GLIBC_2_27);
> +#else
> +weak_alias (__powf, powf)
> +#endif
> +
> +strong_alias (__powf, __ieee754_powf)
> +strong_alias (__powf, __powf_finite)
> +
> +#define __powf __powf_ia32
> +#include <sysdeps/ieee754/flt-32/e_powf.c>
> diff --git a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
> index b5d74df580..26d90ec636 100644
> --- a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
> +++ b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
> @@ -2370,24 +2370,30 @@ ldouble: 1
>
> Function: "pow_downward":
> double: 1
> +float: 1
> float128: 2
> idouble: 1
> +ifloat: 1
> ifloat128: 2
> ildouble: 4
> ldouble: 4
>
> Function: "pow_towardzero":
> double: 1
> +float: 1
> float128: 2
> idouble: 1
> +ifloat: 1
> ifloat128: 2
> ildouble: 4
> ldouble: 4
>
> Function: "pow_upward":
> double: 1
> +float: 1
> float128: 2
> idouble: 1
> +ifloat: 1
> ifloat128: 2
> ildouble: 4
> ldouble: 4
> @@ -2577,30 +2583,30 @@ ldouble: 5
>
> Function: "tgamma_downward":
> double: 3
> -float: 4
> +float: 5
> float128: 5
> idouble: 3
> -ifloat: 4
> +ifloat: 5
> ifloat128: 5
> ildouble: 5
> ldouble: 5
>
> Function: "tgamma_towardzero":
> double: 4
> -float: 4
> +float: 5
> float128: 5
> idouble: 4
> -ifloat: 4
> +ifloat: 5
> ifloat128: 5
> ildouble: 5
> ldouble: 5
>
> Function: "tgamma_upward":
> double: 4
> -float: 4
> +float: 6
> float128: 4
> idouble: 4
> -ifloat: 4
> +ifloat: 6
> ifloat128: 4
> ildouble: 5
> ldouble: 5
> --
> 2.13.6
>
--
H.J.