This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 1/3] Update s_sincosf.c and x86-64 s_sincosf-fma.c



On 21/11/2018 16:11, H.J. Lu wrote:
> Include <s_sincosf.h> in s_sincosf.c, instead of "s_sincosf.h", to allow
> x86-64 s_sincosf.h with vectorized sincosf_poly.  Update __sincosf_table
> to allow vectorized load in vectorized sincosf_poly.  On Broadwell,
> bench-sincosf shows:
> 
>        Before         After      Improvement
> max    160.273        114.198        40%
> min    6.25           5.625          11%
> mean   13.0325        10.6462        22%
> 
> Vectorized sincosf_poly shows
> 
>        Before         After      Improvement
> max    138.653        114.198        21%
> min    5.004          5.625          -11%
> mean   11.5934        10.6462        9%
> 
> 	* sysdeps/ieee754/flt-32/s_sincosf.c: Include <s_sincosf.h>
> 	instead of "s_sincosf.h".
> 	* sysdeps/ieee754/flt-32/s_sincosf.h (sincos_t): Rearranged to
> 	support vectorized load.
> 	(sincosf_poly): Don't define if HAVE_SINCOSF_POLY is defined.
> 	Updated for vectorized load.
> 	(sinf_poly): Updated for vectorized load.
> 	* sysdeps/ieee754/flt-32/s_sincosf_data.c (__sincosf_table):
> 	Rearranged to allow vectorized load.
> 	* sysdeps/x86_64/fpu/s_sincosf.h: New file.
> 	* sysdeps/x86_64/fpu/multiarch/s_sincosf-fma.c: Just include
> 	<sysdeps/ieee754/flt-32/s_sincosf.c>.
> ---
>  sysdeps/ieee754/flt-32/s_sincosf.c           |   2 +-
>  sysdeps/ieee754/flt-32/s_sincosf.h           |  38 ++-
>  sysdeps/ieee754/flt-32/s_sincosf_data.c      |  18 +-
>  sysdeps/x86_64/fpu/multiarch/s_sincosf-fma.c | 271 +------------------
>  sysdeps/x86_64/fpu/s_sincosf.h               |  57 ++++
>  5 files changed, 93 insertions(+), 293 deletions(-)
>  create mode 100644 sysdeps/x86_64/fpu/s_sincosf.h
> 
> diff --git a/sysdeps/ieee754/flt-32/s_sincosf.c b/sysdeps/ieee754/flt-32/s_sincosf.c
> index f7e3245097..28dd7530c5 100644
> --- a/sysdeps/ieee754/flt-32/s_sincosf.c
> +++ b/sysdeps/ieee754/flt-32/s_sincosf.c
> @@ -22,7 +22,7 @@
>  #include <math-barriers.h>
>  #include <libm-alias-float.h>
>  #include "math_config.h"
> -#include "s_sincosf.h"
> +#include <s_sincosf.h>
>  
>  #ifndef SINCOSF
>  # define SINCOSF_FUNC __sincosf
> diff --git a/sysdeps/ieee754/flt-32/s_sincosf.h b/sysdeps/ieee754/flt-32/s_sincosf.h
> index 1dcb04f235..a1d1639c17 100644
> --- a/sysdeps/ieee754/flt-32/s_sincosf.h
> +++ b/sysdeps/ieee754/flt-32/s_sincosf.h
> @@ -31,8 +31,24 @@ typedef struct
>    double sign[4];		/* Sign of sine in quadrants 0..3.  */
>    double hpi_inv;		/* 2 / PI ( * 2^24 if !TOINT_INTRINSICS).  */
>    double hpi;			/* PI / 2.  */
> -  double c0, c1, c2, c3, c4;	/* Cosine polynomial.  */
> -  double s1, s2, s3;		/* Sine polynomial.  */
> +  /* Cosine polynomial: c0, c1, c2, c3, c4.
> +     Sine polynomial: s1, s2, s3.  */
> +  double c0, c1;
> +  struct
> +    {
> +      double s1;
> +      double c2;
> +    } s1c2;
> +  struct
> +    {
> +      double s2;
> +      double c3;
> +    } s2c3;
> +  struct
> +    {
> +      double s3;
> +      double c4;
> +    } s3c4;
>  } sincos_t;
>  

I don't think this should be a problem for other architectures, do you see
any possible issue about changing the layout?

>  /* Polynomial data (the cosine polynomial is negated in the 2nd entry).  */
> @@ -48,6 +64,7 @@ abstop12 (float x)
>    return (asuint (x) >> 20) & 0x7ff;
>  }
>  
> +#ifndef HAVE_SINCOSF_POLY
>  /* Compute the sine and cosine of inputs X and X2 (X squared), using the
>     polynomial P and store the results in SINP and COSP.  N is the quadrant,
>     if odd the cosine and sine polynomials are swapped.  */
> @@ -59,8 +76,8 @@ sincosf_poly (double x, double x2, const sincos_t *p, int n, float *sinp,
>  
>    x4 = x2 * x2;
>    x3 = x2 * x;
> -  c2 = p->c3 + x2 * p->c4;
> -  s1 = p->s2 + x2 * p->s3;
> +  c2 = p->s2c3.c3 + x2 * p->s3c4.c4;
> +  s1 = p->s2c3.s2 + x2 * p->s3c4.s3;
>  
>    /* Swap sin/cos result based on quadrant.  */
>    float *tmp = (n & 1 ? cosp : sinp);
> @@ -71,12 +88,13 @@ sincosf_poly (double x, double x2, const sincos_t *p, int n, float *sinp,
>    x5 = x3 * x2;
>    x6 = x4 * x2;
>  
> -  s = x + x3 * p->s1;
> -  c = c1 + x4 * p->c2;
> +  s = x + x3 * p->s1c2.s1;
> +  c = c1 + x4 * p->s1c2.c2;
>  
>    *sinp = s + x5 * s1;
>    *cosp = c + x6 * c2;
>  }
> +#endif

I think current trend it to avoid such construction based on define macros 
and instead use file-based definition which can be override. For this specific 
change, wouldn't be better to create a generic s_sincosf_poly.h with 
sincosf_poly and override it on x86_64 folder? Maybe also move s_sincosf_t
to its own header, so s_sincosf_poly.h can include it as well or an
architecture can use a different layout if it requires so.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]