[PATCH 11/11] math: Use tanf from CORE-MATH
DJ Delorie
dj@redhat.com
Thu Nov 21 03:19:23 GMT 2024
A few comments below need fixing (details inline).
Otherwise LGTM.
Reviewed-by: DJ Delorie <dj@redhat.com>
Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:
> The CORE-MATH implementation is correctly rounded (for any rounding mode)
> and shows better performance to the generic tanf.
> diff --git a/SHARED-FILES b/SHARED-FILES
> index 033ce7f092..580e6b231a 100644
> --- a/SHARED-FILES
> +++ b/SHARED-FILES
> @@ -288,3 +288,9 @@ sysdeps/ieee754/flt-32/e_lgammaf_r.c:
> - remove the errno stuff (this is done by the wrapper)
> - replace 0x1p127f * 0x1p127f by math_narrow_eval (x * 0x1p127f)
> - add libm_alias_finite (__ieee754_lgammaf_r, __lgammaf_r) at the end
> +sysdeps/ieee754/flt-32/s_tanf.c:
> + (src/binary32/tan/tanf.cc in CORE-MATH)
> + - The code was adapted to use glibc code style and internal
> + functions to handle errno, overflow, and underflow. It was changed
> + to use an internal wrapper for 128 bit unsigned integer operation
> + for ABIs that do not support the type natively.
Ok.
> diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps
> diff --git a/sysdeps/alpha/fpu/libm-test-ulps b/sysdeps/alpha/fpu/libm-test-ulps
> diff --git a/sysdeps/arc/fpu/libm-test-ulps b/sysdeps/arc/fpu/libm-test-ulps
> diff --git a/sysdeps/arc/nofpu/libm-test-ulps b/sysdeps/arc/nofpu/libm-test-ulps
> diff --git a/sysdeps/arm/libm-test-ulps b/sysdeps/arm/libm-test-ulps
> diff --git a/sysdeps/csky/fpu/libm-test-ulps b/sysdeps/csky/fpu/libm-test-ulps
> diff --git a/sysdeps/csky/nofpu/libm-test-ulps b/sysdeps/csky/nofpu/libm-test-ulps
Ok.
> diff --git a/sysdeps/generic/math_int128.h b/sysdeps/generic/math_int128.h
If only unsigned is supported, this would make sense as math_uint128.h -
unless the plan is to add signed int128 in the future.
> +/* Internal 128 bit int support.
> + Copyright (C) 2024 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef _MATH_INT128_H
> +#define _MATH_INT128_H
> +
> +/* Limited support for internal 128 bit integer, used on some math
> + implementations. It uses compiler builtin type if supported, otherwise
> + it is emulate. Only unsigned and some operations are currently supported:
"it is emulated" (spelling)
> +
> + - u128_t: the 128 bit unsigned type.
> + - u128_high: return the high part of the number.
> + - u128_low: return the low part of the number.
> + - u128_from_u64: createa 128 bit number from a 64 bit one.
s/createa/create a/
> + - u128_mul: multiple two 128 bit numbers.
"multiply" (spelling)
> + - u128_add: add two 128 bit numbers.
> + - u128_lshift: left shift a number.
> + - u128_rshift: right shift a number.
> + */
Ok.
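For anyone following along, here is a minimal usage sketch of the proposed
helpers (hypothetical code, not part of the patch; it should behave the same
whether the builtin or the emulated path is selected):

  #include <stdint.h>
  #include <math_int128.h>

  /* Full 64x64 -> 128 bit unsigned product, split into two halves.  */
  static void
  mul64x64 (uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
  {
    u128 p = u128_mul (u128_from_u64 (a), u128_from_u64 (b));
    *hi = u128_high (p);
    *lo = u128_low (p);
  }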
> +#ifdef __SIZEOF_INT128__
> +typedef unsigned __int128 u128;
> +# define u128_high(__x) (uint64_t)((__x) >> 64)
> +# define u128_low(__x) (uint64_t)(__x)
> +# define u128_from_u64(__x) (u128)(__x)
> +# define u128_mul(__x, __y) (__x) * (__y)
> +# define u128_add(__x, __y) (__x) + (__y)
> +# define u128_lshift(__x, __y) (__x) << (__y)
> +# define u128_rshift(__x, __y) (__x) >> (__y)
Ok.
> +#else
> +typedef struct
> +{
> + uint64_t low;
> + uint64_t high;
> +} u128;
> +
> +# define u128_high(__x) (__x).high
> +# define u128_low(__x) (__x).low
> +
> +# define u128_from_u64(__x) (u128){.low = (__x), .high = 0}
> +
> +# define TOPBIT (UINT64_C(1) << 63)
> +# define MASK32 (UINT64_C(0xffffffff))
Ok.
> +static u128 u128_add (u128 x, u128 y)
> +{
> + uint64_t lower = (x.low & ~TOPBIT) + (y.low & ~TOPBIT);
> + bool carry = (lower >> 63) + (x.low >> 63) + (y.low >> 63) > 1;
> + return (u128) { .high = x.high + y.high + carry, .low = x.low + y.low };
> +}
Ok.
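FWIW the carry test took me a second read: it adds the two low words with
their top bits masked off (so that partial sum cannot wrap), then checks
whether the top bit of that partial sum plus the two original top bits
reaches 2, which is exactly the condition for a carry out of bit 63.  The
fallback could equivalently be written with the overflow builtin
(hypothetical sketch, assuming __builtin_add_overflow is acceptable in this
header):

  static u128
  u128_add (u128 x, u128 y)
  {
    u128 r;
    /* The carry out of the low-word addition feeds the high word.  */
    unsigned int carry = __builtin_add_overflow (x.low, y.low, &r.low);
    r.high = x.high + y.high + carry;
    return r;
  }

Either form is fine; no change requested.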
> +static u128 u128_lshift (u128 x, unsigned int n)
> +{
> + switch (n)
> + {
> + case 0: return x;
> + case 1 ... 63: return (u128) { .high = (x.high << n) | (x.low >> (64 - n)),
> + .low = x.low << n };
> + case 64 ...127: return (u128) { .high = x.low << (n - 64), .low = 0};
> + default: return (u128) { .high = 0, .low = 0 };
> + }
> +}
Ok.
> +static u128 u128_rshift (u128 x, unsigned int n)
> +{
> + switch (n)
> + {
> + case 0: return x;
> + case 1 ... 63: return (u128) { .high = x.high >> n,
> + .low = (x.high << (64 - n)) | (x.low >> n) };
> + case 64 ...127: return (u128) { .high = 0, .low = x.high >> (n - 64) };
> + default: return (u128) { .high = 0, .low = 0 };
> + }
> +}
Ok.
> +static u128 u128_mul (u128 x, u128 y)
> +{
> + if (x.high == 0 && y.high == 0)
> + {
> + uint64_t x0 = x.low & MASK32;
> + uint64_t x1 = x.low >> 32;
> + uint64_t y0 = y.low & MASK32;
> + uint64_t y1 = y.low >> 32;
> + u128 x0y0 = { .high = 0, .low = x0 * y0 };
> + u128 x0y1 = { .high = 0, .low = x0 * y1 };
> + u128 x1y0 = { .high = 0, .low = x1 * y0 };
> + u128 x1y1 = { .high = 0, .low = x1 * y1 };
> + /* x0y0 + ((x0y1 + x1y0) << 32) + (x1y1 << 64) */
> + return u128_add (u128_add (x0y0, u128_lshift (u128_add (x0y1,
> + x1y0),
> + 32)),
> + u128_lshift (x1y1, 64));
> + }
Ok.
> + else
> + {
> + uint64_t x0 = x.low & MASK32;
> + uint64_t x1 = x.low >> 32;
> + uint64_t x2 = x.high & MASK32;
> + uint64_t x3 = x.high >> 32;
> + uint64_t y0 = y.low & MASK32;
> + uint64_t y1 = y.low >> 32;
> + uint64_t y2 = y.high & MASK32;
> + uint64_t y3 = y.high >> 32;
> + u128 x0y0 = { .high = 0, .low = x0 * y0 };
> + u128 x0y1 = { .high = 0, .low = x0 * y1 };
> + u128 x0y2 = { .high = 0, .low = x0 * y2 };
> + u128 x0y3 = { .high = 0, .low = x0 * y3 };
> + u128 x1y0 = { .high = 0, .low = x1 * y0 };
> + u128 x1y1 = { .high = 0, .low = x1 * y1 };
> + u128 x1y2 = { .high = 0, .low = x1 * y2 };
> + u128 x2y0 = { .high = 0, .low = x2 * y0 };
> + u128 x2y1 = { .high = 0, .low = x2 * y1 };
> + u128 x3y0 = { .high = 0, .low = x3 * y0 };
> + /* x0y0 + ((x0y1 + x1y0) << 32) + ((x0y2 + x1y1 + x2y0) << 64) +
> + ((x0y3 + x1y2 + x2y1 + x3y0) << 96) */
> + u128 r0 = u128_add (x0y0,
> + u128_lshift (u128_add (x0y1, x1y0),
> + 32));
> + u128 r1 = u128_add (u128_lshift (u128_add (u128_add (x0y2, x1y1), x2y0),
> + 64),
> + u128_lshift (u128_add (u128_add (x0y3, x1y2),
> + u128_add (x2y1, x3y0)),
> + 96));
> + return u128_add (r0, r1);
> + }
> +}
> +#endif /* __SIZEOF_INT128__ */
> +
> +#endif
Ok.
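One optional suggestion: a tiny self-test of the fallback path would be
cheap to keep around.  A hypothetical standalone sketch (needs the right -I
paths for the internal header; the expected values are easy to verify by
hand):

  #include <assert.h>
  #include <stdint.h>
  #include <math_int128.h>

  int
  main (void)
  {
    /* (2^64 - 1)^2 = 2^128 - 2^65 + 1.  */
    u128 p = u128_mul (u128_from_u64 (UINT64_MAX),
                       u128_from_u64 (UINT64_MAX));
    assert (u128_high (p) == UINT64_C (0xfffffffffffffffe));
    assert (u128_low (p) == 1);

    /* Shifting bit 0 left by 96 lands it in bit 32 of the high word,
       and shifting back recovers the original value.  */
    u128 v = u128_lshift (u128_from_u64 (1), 96);
    assert (u128_high (v) == UINT64_C (1) << 32 && u128_low (v) == 0);
    u128 w = u128_rshift (v, 96);
    assert (u128_high (w) == 0 && u128_low (w) == 1);

    /* Carry propagation: (2^64 - 1) + 1 = 2^64.  */
    u128 s = u128_add (u128_from_u64 (UINT64_MAX), u128_from_u64 (1));
    assert (u128_high (s) == 1 && u128_low (s) == 0);
    return 0;
  }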
> diff --git a/sysdeps/hppa/fpu/libm-test-ulps b/sysdeps/hppa/fpu/libm-test-ulps
> diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps
> diff --git a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps
Ok.
> diff --git a/sysdeps/ieee754/flt-32/k_tanf.c b/sysdeps/ieee754/flt-32/k_tanf.c
> -/* k_tanf.c -- float version of k_tan.c
> - */
> . . .
> -}
> +/* Not needed. */
Ok.
> diff --git a/sysdeps/ieee754/flt-32/s_tanf.c b/sysdeps/ieee754/flt-32/s_tanf.c
> index ae6600bd57..41fd8fe496 100644
> --- a/sysdeps/ieee754/flt-32/s_tanf.c
> +++ b/sysdeps/ieee754/flt-32/s_tanf.c
> @@ -1,76 +1,176 @@
> -/* s_tanf.c -- float version of s_tan.c.
> - */
> +/* Correctly-rounded tangent of binary32 value.
>
> -/*
> - * ====================================================
> - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
> - *
> - * Developed at SunPro, a Sun Microsystems, Inc. business.
> - * Permission to use, copy, modify, and distribute this
> - * software is freely granted, provided that this notice
> - * is preserved.
> - * ====================================================
> - */
> +Copyright (c) 2022-2024 Alexei Sibidanov.
>
> -#if defined(LIBM_SCCS) && !defined(lint)
> -static char rcsid[] = "$NetBSD: s_tanf.c,v 1.4 1995/05/10 20:48:20 jtc Exp $";
> -#endif
> +The original version of this file was copied from the CORE-MATH
> +project (file src/binary32/tan/tanf.c, revision 59d21d7).
>
> -#include <errno.h>
> -#include <math.h>
> -#include <math_private.h>
> +Permission is hereby granted, free of charge, to any person obtaining a copy
> +of this software and associated documentation files (the "Software"), to deal
> +in the Software without restriction, including without limitation the rights
> +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> +copies of the Software, and to permit persons to whom the Software is
> +furnished to do so, subject to the following conditions:
> +
> +The above copyright notice and this permission notice shall be included in all
> +copies or substantial portions of the Software.
> +
> +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> +SOFTWARE.
> +*/
Ok.
> +
> +#include <array_length.h>
> +#include <stdint.h>
> #include <libm-alias-float.h>
> -#include "s_sincosf.h"
> +#include "math_config.h"
> +#include <math_int128.h>
Ok.
> -/* Reduce range of X to a multiple of PI/2. The modulo result is between
> - -PI/4 and PI/4 and returned as a high part y[0] and a low part y[1].
> - The low bit in the return value indicates the first or 2nd half of tanf. */
> -static inline int32_t
> -rem_pio2f (float x, float *y)
Ok.
> +static inline double
> +rltl (float z, int *q)
This function needs a comment saying what its purpose is; I can't guess it
from "rltl", nor from the contents or usage.
> {
> - double dx = x;
> - int n;
> - const sincos_t *p = &__sincosf_table[0];
Ok.
> + double x = z;
> + double idl = -0x1.b1bbead603d8bp-32 * x;
> + double idh = 0x1.45f306ep-1 * x;
> + double id = roundeven (idh);
> + *q = (int64_t) id;
> + return (idh - id) + idl;
> +}
Ok.
> - if (__glibc_likely (abstop12 (x) < abstop12 (120.0f)))
> - dx = reduce_fast (dx, p, &n);
> - else
Ok.
> +static double __attribute__ ((noinline))
> +rbig (uint32_t u, int *q)
Same here (comment needed). Radians Bigger? Rotate Bigwise? Rhythm
and Blues Ignores Generations?
> +{
> + static const uint64_t ipi[] =
> {
> - uint32_t xi = asuint (x);
> - int sign = xi >> 31;
> -
> - dx = reduce_large (xi, &n);
> - dx = sign ? -dx : dx;
Ok.
> + 0xfe5163abdebbc562, 0xdb6295993c439041,
> + 0xfc2757d1f534ddc0, 0xa2f9836e4e441529
> + };
> + int e = (u >> 23) & 0xff, i;
> + uint64_t m = (u & (~0u >> 9)) | 1 << 23;
> + u128 p0 = u128_mul (u128_from_u64 (m), u128_from_u64 (ipi[0]));
> + u128 p1 = u128_mul (u128_from_u64 (m), u128_from_u64 (ipi[1]));
> + p1 = u128_add (p1, u128_rshift (p0, 64));
> + u128 p2 = u128_mul (u128_from_u64 (m), u128_from_u64 (ipi[2]));
> + p2 = u128_add (p2, u128_rshift (p1, 64));
> + u128 p3 = u128_mul (u128_from_u64 (m), u128_from_u64 (ipi[3]));
> + p3 = u128_add (p3, u128_rshift (p2, 64));
> + uint64_t p3h = u128_high (p3);
> + uint64_t p3l = u128_low (p3);
> + uint64_t p2l = u128_low (p2);
> + uint64_t p1l = u128_low (p1);
> + int64_t a;
> + int k = e - 127, s = k - 23;
Ok.
> + /* in ctanf(), rbig() is called in the case 127+28 <= e < 0xff
> + thus 155 <= e <= 254, which yields 28 <= k <= 127 and 5 <= s <= 104 */
I'm guessing this comment goes with the lines above it, not the
comparison below? If so, please include a blank line here ;-)
> + if (s < 64)
> + {
> + i = p3h << s | p3l >> (64 - s);
> + a = p3l << s | p2l >> (64 - s);
> }
Ok.
> -
> - y[0] = dx;
> - y[1] = dx - y[0];
> - return n;
Ok.
> + else if (s == 64)
> + {
> + i = p3l;
> + a = p2l;
> + }
> + else
> + { /* s > 64 */
> + i = p3l << (s - 64) | p2l >> (128 - s);
> + a = p2l << (s - 64) | p1l >> (128 - s);
> + }
Ok.
> + int sgn = u;
> + sgn >>= 31;
Ok.
> + int64_t sm = a >> 63;
> + i -= sm;
> + double z = (a ^ sgn) * 0x1p-64;
> + i = (i ^ sgn) - sgn;
> + *q = i;
> + return z;
> }
Ok.
> -float __tanf(float x)
> +float
> +__tanf (float x)
> {
Ok.
> - float y[2],z=0.0;
> - int32_t n, ix;
> -
> - GET_FLOAT_WORD(ix,x);
> -
> - /* |x| ~< pi/4 */
> - ix &= 0x7fffffff;
> - if(ix <= 0x3f490fda) return __kernel_tanf(x,z,1);
> -
> - /* tan(Inf or NaN) is NaN */
> - else if (ix>=0x7f800000) {
> - if (ix==0x7f800000)
> - __set_errno (EDOM);
> - return x-x; /* NaN */
Ok.
> + uint32_t t = asuint (x);
> + int e = (t >> 23) & 0xff;
> + int i;
> + double z;
> + if (__glibc_likely (e < 127 + 28))
> + {
> + if (__glibc_unlikely (e < 115))
> + {
> + if (__glibc_unlikely (e < 102))
> + return fmaf (x, fabsf (x), x);
> + float x2 = x * x;
> + return fmaf (x, 0x1.555556p-2f * x2, x);
> }
Ok.
> -
> - /* argument reduction needed */
> - else {
> - n = rem_pio2f(x,y);
> - return __kernel_tanf(y[0],y[1],1-((n&1)<<1)); /* 1 -- n even
> - -1 -- n odd */
Ok.
> + z = rltl (x, &i);
> + }
> + else if (e < 0xff)
> + z = rbig (t, &i);
> + else
> + {
> + if (t << 9)
> + return x + x; /* nan */
> + return __math_invalidf (x);
> + }
Ok.
> + double z2 = z * z;
> + double z4 = z2 * z2;
> + static const double cn[] =
> + {
> + 0x1.921fb54442d18p+0, -0x1.fd226e573289fp-2,
> + 0x1.b7a60c8dac9f6p-6, -0x1.725beb40f33e5p-13
> + };
> + static const double cd[] =
> + {
> + 0x1p+0, -0x1.2395347fb829dp+0,
> + 0x1.2313660f29c36p-3, -0x1.9a707ab98d1c1p-9
> + };
Ok.
> + static const double s[] = { 0, 1 };
> + double n = cn[0] + z2 * cn[1];
> + double n2 = cn[2] + z2 * cn[3];
> + n += z4 * n2;
> + double d = cd[0] + z2 * cd[1];
> + double d2 = cd[2] + z2 * cd[3];
> + d += z4 * d2;
> + n *= z;
> + double s0 = s[i & 1];
> + double s1 = s[1 - (i & 1)];
> + double r1 = (n * s1 - d * s0) / (n * s0 + d * s1);
Ok.
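One note for the archive, since the s[] indexing is a bit opaque on a first
read: s0/s1 select on the parity of the quadrant count i, so r1 is n/d for
even i and -d/n for odd i.  That appears to rely on the identity
tan(a + pi/2) = -1/tan(a).  A quick standalone illustration of the identity
(hypothetical, link with -lm; nothing to change in the patch):

  #include <math.h>
  #include <stdio.h>

  int
  main (void)
  {
    /* tan (a + pi/2) == -1/tan (a); the difference printed below should
       be on the order of 1e-15 or smaller (limited by the rounding of
       M_PI_2).  */
    double a = 0.3;
    printf ("%g\n", tan (a + M_PI_2) + 1.0 / tan (a));
    return 0;
  }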
> + uint64_t tail = (asuint64 (r1) + 7) & (~UINT64_C(0) >> 35);
> + if (__glibc_unlikely (tail <= 14))
> + {
> + static const struct
> + {
> + float arg;
> + float rh;
> + float rl;
> + } st[] = {
> + { 0x1.143ec4p+0f, 0x1.ddf9f6p+0f, -0x1.891d24p-52f },
> + { 0x1.ada6aap+27f, 0x1.e80304p-3f, 0x1.419f46p-58f },
> + { 0x1.af61dap+48f, 0x1.60d1c8p-2f, -0x1.2d6c3ap-55f },
> + { 0x1.0088bcp+52f, 0x1.ca1edp+0f, 0x1.f6053p-53f },
> + { 0x1.f90dfcp+72f, 0x1.597f9cp-1f, 0x1.925978p-53f },
> + { 0x1.cc4e22p+85f, -0x1.f33584p+1f, 0x1.d7254ap-51f },
> + { 0x1.a6ce12p+86f, -0x1.c5612ep-1f, -0x1.26c33ep-53f },
> + { 0x1.6a0b76p+102f, -0x1.e42a1ep+0f, -0x1.1dc906p-52f },
> + };
> + uint32_t ax = t & (~0u >> 1);
> + uint32_t sgn = t >> 31;
> + for (int j = 0; j < array_length (st); j++)
> + {
> + if (__glibc_unlikely (asfloat (st[j].arg) == ax))
> + {
> + if (sgn)
> + return -st[j].rh - st[j].rl;
> + else
> + return st[j].rh + st[j].rl;
> + }
Ok.
> }
> + }
> + return r1;
> }
> libm_alias_float (__tan, tan)
Ok.
> diff --git a/sysdeps/loongarch/lp64/libm-test-ulps b/sysdeps/loongarch/lp64/libm-test-ulps
> diff --git a/sysdeps/microblaze/libm-test-ulps b/sysdeps/microblaze/libm-test-ulps
> diff --git a/sysdeps/mips/mips32/libm-test-ulps b/sysdeps/mips/mips32/libm-test-ulps
> diff --git a/sysdeps/mips/mips64/libm-test-ulps b/sysdeps/mips/mips64/libm-test-ulps
> diff --git a/sysdeps/nios2/libm-test-ulps b/sysdeps/nios2/libm-test-ulps
> diff --git a/sysdeps/or1k/fpu/libm-test-ulps b/sysdeps/or1k/fpu/libm-test-ulps
> diff --git a/sysdeps/or1k/nofpu/libm-test-ulps b/sysdeps/or1k/nofpu/libm-test-ulps
> diff --git a/sysdeps/powerpc/fpu/libm-test-ulps b/sysdeps/powerpc/fpu/libm-test-ulps
> diff --git a/sysdeps/powerpc/nofpu/libm-test-ulps b/sysdeps/powerpc/nofpu/libm-test-ulps
> diff --git a/sysdeps/riscv/nofpu/libm-test-ulps b/sysdeps/riscv/nofpu/libm-test-ulps
> diff --git a/sysdeps/riscv/rvd/libm-test-ulps b/sysdeps/riscv/rvd/libm-test-ulps
> diff --git a/sysdeps/s390/fpu/libm-test-ulps b/sysdeps/s390/fpu/libm-test-ulps
> diff --git a/sysdeps/sh/libm-test-ulps b/sysdeps/sh/libm-test-ulps
> diff --git a/sysdeps/sparc/fpu/libm-test-ulps b/sysdeps/sparc/fpu/libm-test-ulps
> diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
Ok.