This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH 1/3] Update s_sincosf.c and x86-64 s_sincosf-fma.c
On 21/11/2018 16:11, H.J. Lu wrote:
> Include <s_sincosf.h> in s_sincosf.c, instead of "s_sincosf.h", to allow
> x86-64 s_sincosf.h with vectorized sincosf_poly. Update __sincosf_table
> to allow vectorized load in vectorized sincosf_poly. On Broadwell,
> bench-sincosf shows:
>
> Before After Improvement
> max 160.273 114.198 40%
> min 6.25 5.625 11%
> mean 13.0325 10.6462 22%
>
> Vectorized sincosf_poly shows
>
> Before After Improvement
> max 138.653 114.198 21%
> min 5.004 5.625 -11%
> mean 11.5934 10.6462 9%
>
> * sysdeps/ieee754/flt-32/s_sincosf.c: Include <s_sincosf.h>
> instead of "s_sincosf.h".
> * sysdeps/ieee754/flt-32/s_sincosf.h (sincos_t): Rearranged to
> support vectorized load.
> (sincosf_poly): Don't define if HAVE_SINCOSF_POLY is defined.
> Updated for vectorized load.
> (sinf_poly): Updated for vectorized load.
> * sysdeps/ieee754/flt-32/s_sincosf_data.c (__sincosf_table):
> Rearranged to allow vectorized load.
> * sysdeps/x86_64/fpu/s_sincosf.h: New file.
> * sysdeps/x86_64/fpu/multiarch/s_sincosf-fma.c: Just include
> <sysdeps/ieee754/flt-32/s_sincosf.c>.
> ---
> sysdeps/ieee754/flt-32/s_sincosf.c | 2 +-
> sysdeps/ieee754/flt-32/s_sincosf.h | 38 ++-
> sysdeps/ieee754/flt-32/s_sincosf_data.c | 18 +-
> sysdeps/x86_64/fpu/multiarch/s_sincosf-fma.c | 271 +------------------
> sysdeps/x86_64/fpu/s_sincosf.h | 57 ++++
> 5 files changed, 93 insertions(+), 293 deletions(-)
> create mode 100644 sysdeps/x86_64/fpu/s_sincosf.h
>
> diff --git a/sysdeps/ieee754/flt-32/s_sincosf.c b/sysdeps/ieee754/flt-32/s_sincosf.c
> index f7e3245097..28dd7530c5 100644
> --- a/sysdeps/ieee754/flt-32/s_sincosf.c
> +++ b/sysdeps/ieee754/flt-32/s_sincosf.c
> @@ -22,7 +22,7 @@
> #include <math-barriers.h>
> #include <libm-alias-float.h>
> #include "math_config.h"
> -#include "s_sincosf.h"
> +#include <s_sincosf.h>
>
> #ifndef SINCOSF
> # define SINCOSF_FUNC __sincosf
> diff --git a/sysdeps/ieee754/flt-32/s_sincosf.h b/sysdeps/ieee754/flt-32/s_sincosf.h
> index 1dcb04f235..a1d1639c17 100644
> --- a/sysdeps/ieee754/flt-32/s_sincosf.h
> +++ b/sysdeps/ieee754/flt-32/s_sincosf.h
> @@ -31,8 +31,24 @@ typedef struct
> double sign[4]; /* Sign of sine in quadrants 0..3. */
> double hpi_inv; /* 2 / PI ( * 2^24 if !TOINT_INTRINSICS). */
> double hpi; /* PI / 2. */
> - double c0, c1, c2, c3, c4; /* Cosine polynomial. */
> - double s1, s2, s3; /* Sine polynomial. */
> + /* Cosine polynomial: c0, c1, c2, c3, c4.
> + Sine polynomial: s1, s2, s3. */
> + double c0, c1;
> + struct
> + {
> + double s1;
> + double c2;
> + } s1c2;
> + struct
> + {
> + double s2;
> + double c3;
> + } s2c3;
> + struct
> + {
> + double s3;
> + double c4;
> + } s3c4;
> } sincos_t;
>
I don't think this should be a problem for other architectures, do you see
any possible issue about changing the layout?
> /* Polynomial data (the cosine polynomial is negated in the 2nd entry). */
> @@ -48,6 +64,7 @@ abstop12 (float x)
> return (asuint (x) >> 20) & 0x7ff;
> }
>
> +#ifndef HAVE_SINCOSF_POLY
> /* Compute the sine and cosine of inputs X and X2 (X squared), using the
> polynomial P and store the results in SINP and COSP. N is the quadrant,
> if odd the cosine and sine polynomials are swapped. */
> @@ -59,8 +76,8 @@ sincosf_poly (double x, double x2, const sincos_t *p, int n, float *sinp,
>
> x4 = x2 * x2;
> x3 = x2 * x;
> - c2 = p->c3 + x2 * p->c4;
> - s1 = p->s2 + x2 * p->s3;
> + c2 = p->s2c3.c3 + x2 * p->s3c4.c4;
> + s1 = p->s2c3.s2 + x2 * p->s3c4.s3;
>
> /* Swap sin/cos result based on quadrant. */
> float *tmp = (n & 1 ? cosp : sinp);
> @@ -71,12 +88,13 @@ sincosf_poly (double x, double x2, const sincos_t *p, int n, float *sinp,
> x5 = x3 * x2;
> x6 = x4 * x2;
>
> - s = x + x3 * p->s1;
> - c = c1 + x4 * p->c2;
> + s = x + x3 * p->s1c2.s1;
> + c = c1 + x4 * p->s1c2.c2;
>
> *sinp = s + x5 * s1;
> *cosp = c + x6 * c2;
> }
> +#endif
I think current trend it to avoid such construction based on define macros
and instead use file-based definition which can be override. For this specific
change, wouldn't be better to create a generic s_sincosf_poly.h with
sincosf_poly and override it on x86_64 folder? Maybe also move s_sincosf_t
to its own header, so s_sincosf_poly.h can include it as well or an
architecture can use a different layout if it requires so.