[PATCH] Fix the inaccuracy of j0f (BZ 14469).

Paul Zimmermann Paul.Zimmermann@inria.fr
Mon Jan 18 07:13:41 GMT 2021


The largest error for all binary32 inputs is now less than 9.5 ulps with
respect to the infinite precision result, i.e., less than 9 ulps after
rounding, which is the largest error allowed. (This is for rounding to
nearest, for other rounding modes, the new code should not behave worse than
the old one.)

The new code is enabled only when there is a cancellation at the very end of
the j0f computation, thus should not give any visible slowdown on average.
When there is a cancellation, two different algorithms are used:

* around the first 64 zeros of j0, approximation polynomials of degree 3 are
  used, which were computed using the Sollya tool (https://www.sollya.org/)

* for large inputs, an asymptotic formula is used from [1]

[1] Fast and Accurate Bessel Function Computation,
John Harrison, Proceedings of Arith 19, 2009.
---
 math/auto-libm-test-in            |   3 +-
 math/auto-libm-test-out-j0        |  25 +++
 sysdeps/ieee754/flt-32/e_j0f.c    | 341 +++++++++++++++++++++++++++++-
 sysdeps/x86_64/fpu/libm-test-ulps |  14 +-
 4 files changed, 369 insertions(+), 14 deletions(-)

diff --git a/math/auto-libm-test-in b/math/auto-libm-test-in
index 73840b8bef..dbf1dca35c 100644
--- a/math/auto-libm-test-in
+++ b/math/auto-libm-test-in
@@ -5756,8 +5756,9 @@ j0 -0x1.001000001p+593
 j0 0x1p1023
 j0 0x1p16382
 j0 0x1p16383
-# the next value generates larger error bounds on x86_64 (binary32)
+# the next values generate large error bounds on x86_64 (binary32)
 j0 0x2.602774p+0 xfail-rounding:ibm128-libgcc
+j0 0x1.8bde7ep+5
 # the next value exercises the flt-32 code path for x >= 2^127
 j0 0x8.2f4ecp+124
 
diff --git a/math/auto-libm-test-out-j0 b/math/auto-libm-test-out-j0
index cc1fea074e..d659f47632 100644
--- a/math/auto-libm-test-out-j0
+++ b/math/auto-libm-test-out-j0
@@ -1359,6 +1359,31 @@ j0 0x2.602774p+0 xfail-rounding:ibm128-libgcc
 = j0 tonearest ibm128 0x2.602774p+0 : 0x3.e83779fe19911fa806cee83ab7p-8 : inexact-ok
 = j0 towardzero ibm128 0x2.602774p+0 : 0x3.e83779fe19911fa806cee83ab7p-8 : xfail:ibm128-libgcc inexact-ok
 = j0 upward ibm128 0x2.602774p+0 : 0x3.e83779fe19911fa806cee83ab8p-8 : xfail:ibm128-libgcc inexact-ok
+j0 0x1.8bde7ep+5
+= j0 downward binary32 0x3.17bcfcp+4 : 0x7.a5f028p-16 : inexact-ok
+= j0 tonearest binary32 0x3.17bcfcp+4 : 0x7.a5f03p-16 : inexact-ok
+= j0 towardzero binary32 0x3.17bcfcp+4 : 0x7.a5f028p-16 : inexact-ok
+= j0 upward binary32 0x3.17bcfcp+4 : 0x7.a5f03p-16 : inexact-ok
+= j0 downward binary64 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77cp-16 : inexact-ok
+= j0 tonearest binary64 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77cp-16 : inexact-ok
+= j0 towardzero binary64 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77cp-16 : inexact-ok
+= j0 upward binary64 0x3.17bcfcp+4 : 0x7.a5f02c0a5b78p-16 : inexact-ok
+= j0 downward intel96 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77d0a8p-16 : inexact-ok
+= j0 tonearest intel96 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77d0bp-16 : inexact-ok
+= j0 towardzero intel96 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77d0a8p-16 : inexact-ok
+= j0 upward intel96 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77d0bp-16 : inexact-ok
+= j0 downward m68k96 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77d0a8p-16 : inexact-ok
+= j0 tonearest m68k96 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77d0bp-16 : inexact-ok
+= j0 towardzero m68k96 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77d0a8p-16 : inexact-ok
+= j0 upward m68k96 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77d0bp-16 : inexact-ok
+= j0 downward binary128 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77d0af558d6935f168p-16 : inexact-ok
+= j0 tonearest binary128 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77d0af558d6935f168p-16 : inexact-ok
+= j0 towardzero binary128 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77d0af558d6935f168p-16 : inexact-ok
+= j0 upward binary128 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77d0af558d6935f16cp-16 : inexact-ok
+= j0 downward ibm128 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77d0af558d6935fp-16 : inexact-ok
+= j0 tonearest ibm128 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77d0af558d6935f2p-16 : inexact-ok
+= j0 towardzero ibm128 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77d0af558d6935fp-16 : inexact-ok
+= j0 upward ibm128 0x3.17bcfcp+4 : 0x7.a5f02c0a5b77d0af558d6935f2p-16 : inexact-ok
 j0 0x8.2f4ecp+124
 = j0 downward binary32 0x8.2f4ecp+124 : 0xd.331efp-68 : inexact-ok
 = j0 tonearest binary32 0x8.2f4ecp+124 : 0xd.331efp-68 : inexact-ok
diff --git a/sysdeps/ieee754/flt-32/e_j0f.c b/sysdeps/ieee754/flt-32/e_j0f.c
index 5d29611eb7..f6efab828c 100644
--- a/sysdeps/ieee754/flt-32/e_j0f.c
+++ b/sysdeps/ieee754/flt-32/e_j0f.c
@@ -37,6 +37,330 @@ S04  =  1.1661400734e-09; /* 0x30a045e8 */
 
 static const float zero = 0.0;
 
+#define FIRST_ZERO 2.404825f
+
+#define SMALL_SIZE 64
+
+/* The following table contains successive zeros of j0 and degree-3 polynomial
+   approximations of j0 around these zeros: P[0] for the first zero (2.404825), P[1]
+   for the second one (5.520078), and so one. Each line contains:
+              {x0, xmid, x1, p0, p1, p2, p3}
+   where [x0,x1] is the interval around the zero, xmid is the binary32 number closest
+   to the zero, and p0+p1*x+p2*x^2+p3*x^3 is the approximation polynomial.
+   Each polynomial was generated using Remez' algorithm on the interval [x0,x1]
+   around the corresponding zero where the error is larger than 9 ulps for the
+   default code below.
+   Degree 3 is enough to get an error less than 4 ulps.
+*/
+static float P[SMALL_SIZE][7] = {
+{0x2.63cb8cp+0,0x2.67a2a4p+0,0x2.6f5d28p+0,0xf.26247p-28,-0x8.4e6d9p-4,0x1.ba1c7p-4,0xe.6c06ap-8},
+/* the following polynomial was generated by Sollya */
+{0x5.83abe8p+0,0x5.8523d8p+0,0x5.8a557p+0,0x6.9205fp-28,0x5.71b98p-4,-0x7.e3e798p-8,-0xd.87d1p-8},
+{0x8.a66f1p+0,0x8.a75abp+0,0x8.a92a7p+0,0x1.bcc1cap-24,-0x4.57de6p-4,0x4.03debp-8,0xb.44a4ap-8},
+{0xb.c9905p+0,0xb.caa2p+0,0xb.ccc6bp+0,-0xf.2976fp-32,0x3.b827ccp-4,-0x2.85fdb8p-8,-0x9.c5a97p-8},
+{0xe.edb6ap+0,0xe.ee50ap+0,0xe.ef864p+0,-0x1.bd67d8p-28,-0x3.4e03ap-4,0x1.c54b8ep-8,0x8.bb70dp-8},
+{0x1.2119fp+4,0x1.212314p+4,0x1.21375p+4,0x1.62209cp-28,0x3.00efecp-4,-0x1.5467fp-8,-0x7.f5a2p-8},
+{0x1.535d28p+4,0x1.5362dep+4,0x1.536cbcp+4,-0x2.853f74p-24,-0x2.c5b274p-4,0x1.0bab0ap-8,0x7.5c05b8p-8},
+{0x1.859df6p+4,0x1.85a3bap+4,0x1.85afcap+4,0x2.19ed1cp-24,0x2.96545cp-4,-0xd.995c9p-12,-0x6.e024cp-8},
+{0x1.b7dfe6p+4,0x1.b7e54ap+4,0x1.b7ebccp+4,0xe.959aep-28,-0x2.6f5594p-4,0xb.55ff1p-12,0x6.79cf58p-8},
+{0x1.ea22bcp+4,0x1.ea275ap+4,0x1.ea2e2ep+4,0x2.0c3964p-24,0x2.4e80fcp-4,-0x9.a35abp-12,-0x6.234b08p-8},
+{0x2.1c6638p+4,0x2.1c69c4p+4,0x2.1c6d7cp+4,-0x3.642554p-24,-0x2.325e48p-4,0x8.534f1p-12,0x5.d9048p-8},
+{0x2.4ea8dcp+4,0x2.4eac7p+4,0x2.4eb39cp+4,0x1.6c015ap-24,0x2.19e7d8p-4,-0x7.4915f8p-12,-0x5.984698p-8},
+{0x2.80ec2p+4,0x2.80ef5p+4,0x2.80f72cp+4,-0x4.b18c9p-28,-0x2.046174p-4,0x6.720468p-12,0x5.5f426p-8},
+{0x2.b32e6p+4,0x2.b33258p+4,0x2.b33654p+4,-0x1.8b8792p-24,0x1.f13fbp-4,-0x5.c149dp-12,-0x5.2c935p-8},
+{0x2.e5736p+4,0x2.e5758p+4,0x2.e57894p+4,0x3.a26e0cp-24,-0x1.e018dap-4,0x5.2df918p-12,0x4.ff0f68p-8},
+{0x3.17b694p+4,0x3.17b8c4p+4,0x3.17bcecp+4,-0x2.18fabcp-24,0x1.d09b22p-4,-0x4.b1c31p-12,-0x4.d5ecd8p-8},
+{0x3.49f9d8p+4,0x3.49fc1cp+4,0x3.4a0084p+4,0x3.2370e8p-24,-0x1.c28614p-4,0x4.47bb78p-12,0x4.b08458p-8},
+{0x3.7c3d78p+4,0x3.7c3f88p+4,0x3.7c43ep+4,-0x5.9eae3p-28,0x1.b5a622p-4,-0x3.ec892cp-12,-0x4.8e4d3p-8},
+{0x3.ae80ap+4,0x3.ae83p+4,0x3.ae8528p+4,0x2.9fa1e8p-24,-0x1.a9d184p-4,0x3.9d2fa8p-12,0x4.6edccp-8},
+{0x3.e0c484p+4,0x3.e0c688p+4,0x3.e0c8a4p+4,0x9.9ac67p-28,0x1.9ee5eep-4,-0x3.57e9ep-12,-0x4.51d1e8p-8},
+{0x4.1308f8p+4,0x4.130a18p+4,0x4.130c58p+4,0xd.6ab94p-28,-0x1.94c6f6p-4,0x3.1ac03p-12,0x4.36e4bp-8},
+{0x4.454cp+4,0x4.454dbp+4,0x4.45504p+4,-0x4.4cb2d8p-24,0x1.8b5cccp-4,-0x2.e477ap-12,-0x4.1dd858p-8},
+{0x4.778f6p+4,0x4.779158p+4,0x4.779368p+4,-0x4.4aa8c8p-24,-0x1.829356p-4,0x2.b4726cp-12,0x4.0676dp-8},
+{0x4.a9d3ep+4,0x4.a9d5p+4,0x4.a9d6cp+4,0x2.077c38p-24,0x1.7a597ep-4,-0x2.891dbcp-12,-0x3.f09154p-8},
+{0x4.dc175p+4,0x4.dc18bp+4,0x4.dc1a08p+4,-0x2.6a6cd8p-24,-0x1.72a09ap-4,0x2.62315cp-12,0x3.dc034p-8},
+{0x5.0e5bb8p+4,0x5.0e5c6p+4,0x5.0e5dbp+4,-0x5.08287p-24,0x1.6b5c06p-4,-0x2.3ec48p-12,-0x3.c8a91cp-8},
+{0x5.409ebp+4,0x5.40a02p+4,0x5.40a188p+4,-0x3.4598dcp-24,-0x1.6480c4p-4,0x2.1f1798p-12,0x3.b667ccp-8},
+{0x5.72e268p+4,0x5.72e3ep+4,0x5.72e54p+4,0x5.4e74bp-24,0x1.5e0544p-4,-0x2.021254p-12,-0x3.a5248cp-8},
+{0x5.a5263p+4,0x5.a527ap+4,0x5.a528d8p+4,-0x2.05751cp-24,-0x1.57e12p-4,0x1.e7643ep-12,0x3.94c994p-8},
+{0x5.d76acp+4,0x5.d76b68p+4,0x5.d76ccp+4,0x4.c5e278p-24,0x1.520ceep-4,-0x1.cf1d4ep-12,-0x3.85428cp-8},
+{0x6.09ae88p+4,0x6.09af3p+4,0x6.09b0bp+4,-0x3.50e62cp-24,-0x1.4c822p-4,0x1.b8ab9ap-12,0x3.767f34p-8},
+{0x6.3bf21p+4,0x6.3bf2f8p+4,0x6.3bf418p+4,-0x1.c39f38p-24,0x1.473ae6p-4,-0x1.a3dccep-12,-0x3.68700cp-8},
+{0x6.6e362p+4,0x6.6e36c8p+4,0x6.6e3818p+4,-0x1.9245b6p-28,-0x1.42320ap-4,0x1.90d5f2p-12,0x3.5b0634p-8},
+{0x6.a079dp+4,0x6.a07a98p+4,0x6.a07b78p+4,-0x1.0acf8p-24,0x1.3d62e6p-4,-0x1.7f1e42p-12,-0x3.4e3678p-8},
+{0x6.d2bda8p+4,0x6.d2be68p+4,0x6.d2bfb8p+4,0x4.cd92d8p-24,-0x1.38c94ap-4,0x1.6e94e2p-12,0x3.41f4acp-8},
+{0x7.05018p+4,0x7.05024p+4,0x7.0503p+4,-0x1.37bf8ap-24,0x1.34617p-4,-0x1.5f6a22p-12,-0x3.3637c8p-8},
+{0x7.37459p+4,0x7.374618p+4,0x7.3747ap+4,-0x1.8f62dep-28,-0x1.3027fp-4,0x1.51357ap-12,0x3.2af594p-8},
+{0x7.69892p+4,0x7.6989fp+4,0x7.698b98p+4,-0x9.81e79p-28,0x1.2c19b4p-4,-0x1.43e0aep-12,-0x3.20271p-8},
+{0x7.9bccf8p+4,0x7.9bcdc8p+4,0x7.9bceap+4,0x3.103b3p-24,-0x1.2833eep-4,0x1.37580ep-12,0x3.15c484p-8},
+{0x7.ce10dp+4,0x7.ce11a8p+4,0x7.ce136p+4,0x2.07b058p-24,0x1.24740ap-4,-0x1.2bd334p-12,-0x3.0bc628p-8},
+{0x8.0054cp+4,0x8.00558p+4,0x8.00562p+4,0x3.87576cp-24,-0x1.20d7b6p-4,0x1.20af6cp-12,0x3.022738p-8},
+{0x8.32994p+4,0x8.32996p+4,0x8.329a4p+4,-0x1.691ecp-24,0x1.1d5ccap-4,-0x1.167022p-12,-0x2.f8e084p-8},
+{0x8.64dcdp+4,0x8.64dd4p+4,0x8.64dd9p+4,0x9.b406dp-28,-0x1.1a015p-4,0x1.0cbfd2p-12,0x2.efee34p-8},
+{0x8.97209p+4,0x8.97212p+4,0x8.9721bp+4,-0xf.bfd8fp-28,0x1.16c37ap-4,-0x1.039356p-12,-0x2.e74a78p-8},
+{0x8.c9649p+4,0x8.c965p+4,0x8.c965bp+4,0x2.6d50c8p-24,-0x1.13a19ep-4,0xf.ae0ap-16,0x2.def13p-8},
+{0x8.fba89p+4,0x8.fba8ep+4,0x8.fba9dp+4,-0x4.d475c8p-24,0x1.109a32p-4,-0xf.29e9cp-16,-0x2.d6de4cp-8},
+{0x9.2dec7p+4,0x9.2deccp+4,0x9.2dedp+4,0x8.1982p-24,-0x1.0dabc8p-4,0xe.ac514p-16,0x2.cf0e6p-8},
+{0x9.60306p+4,0x9.6030bp+4,0x9.60318p+4,0x4.864518p-24,0x1.0ad51p-4,-0xe.3d1fbp-16,-0x2.c77d28p-8},
+{0x9.92743p+4,0x9.92749p+4,0x9.9274ep+4,0x6.8917a8p-28,-0x1.0814d4p-4,0xd.cb25ap-16,0x2.c0283p-8},
+{0x9.c4b81p+4,0x9.c4b87p+4,0x9.c4b8ep+4,-0x5.fa18fp-24,0x1.0569fp-4,-0xd.5e6d8p-16,-0x2.b90bep-8},
+{0x9.f6fc2p+4,0x9.f6fc6p+4,0x9.f6fcep+4,-0x4.0e5c98p-24,-0x1.02d354p-4,0xc.feb37p-16,0x2.b2259p-8},
+{0xa.293f8p+4,0xa.29404p+4,0xa.29408p+4,-0x2.c3ddbp-24,0x1.005004p-4,-0xc.9b641p-16,-0x2.ab72fp-8},
+{0xa.5b83bp+4,0xa.5b843p+4,0xa.5b852p+4,-0x5.d052p-24,-0xf.ddf16p-8,0xc.444dcp-16,0x2.a4f0d4p-8},
+{0xa.8dc7ap+4,0xa.8dc81p+4,0xa.8dc87p+4,-0x2.0b97dcp-24,0xf.b7fafp-8,-0xb.e93a7p-16,-0x2.9e9dccp-8},
+{0xa.c00b4p+4,0xa.c00cp+4,0xa.c00c4p+4,-0x5.4aab5p-24,-0xf.930fep-8,0xb.99b61p-16,0x2.98774p-8},
+{0xa.f24f5p+4,0xa.f24fep+4,0xa.f2509p+4,-0x3.6dadd8p-24,0xf.6f245p-8,-0xb.45e12p-16,-0x2.927b08p-8},
+{0xb.24939p+4,0xb.2493dp+4,0xb.24948p+4,-0x2.d7e038p-24,-0xf.4c2cep-8,0xa.fd076p-16,0x2.8ca788p-8},
+{0xb.56d73p+4,0xb.56d7bp+4,0xb.56d82p+4,-0x6.977a1p-24,0xf.2a1fp-8,-0xa.af99ap-16,-0x2.86fb2p-8},
+{0xb.891b3p+4,0xb.891bap+4,0xb.891c7p+4,0x1.3cc95ep-24,-0xf.08f0ap-8,0xa.6ca59p-16,0x2.8173b8p-8},
+{0xb.bb5f5p+4,0xb.bb5f9p+4,0xb.bb5fdp+4,0x3.a4921p-24,0xe.e8986p-8,-0xa.2c5b8p-16,-0x2.7c1024p-8},
+{0xb.eda37p+4,0xb.eda37p+4,0xb.eda3bp+4,0x6.b45a7p-24,-0xe.c90d8p-8,0x9.e7307p-16,0x2.76ceacp-8},
+{0xc.1fe71p+4,0xc.1fe76p+4,0xc.1fe87p+4,-0x2.8f34a4p-24,0xe.aa478p-8,-0x9.abd8fp-16,-0x2.71adecp-8},
+{0xc.522aep+4,0xc.522b5p+4,0xc.522c4p+4,-0x1.325968p-24,-0xe.8c3eap-8,0x9.72bf7p-16,0x2.6cacd4p-8},
+{0xc.846eep+4,0xc.846f4p+4,0xc.846fap+4,0x4.96b808p-24,0xe.6eeb5p-8,-0x9.3bc53p-16,-0x2.67ca04p-8},
+};
+
+/* argument reduction mod 2pi: T[i] ~ 2^i mod (2*pi) */
+static double T[128] = {
+0x1p+0,
+0x2p+0,
+-0x2.487ed5110b462p+0,
+0x1.b7812aeef4b9fp+0,
+-0x2.d97c7f3321d24p+0,
+0x9.585d6aac7a1a8p-4,
+0x1.2b0bad558f435p+0,
+0x2.56175aab1e86ap+0,
+-0x1.9c501fbace38dp+0,
+0x3.0fde959b6ed46p+0,
+-0x2.8c1a9da2d9d3cp-4,
+-0x5.18353b45b3a78p-4,
+-0xa.306a768b674fp-4,
+-0x1.460d4ed16ce9ep+0,
+-0x2.8c1a9da2d9d3cp+0,
+0x1.304999cb579e8p+0,
+0x2.60933396af3dp+0,
+-0x1.87586de3accc3p+0,
+-0x3.0eb0dbc759986p+0,
+0x2.b1d1d8258155p-4,
+0x5.63a3b04b02aap-4,
+0xa.c74760960554p-4,
+0x1.58e8ec12c0aa8p+0,
+0x2.b1d1d8258155p+0,
+-0xe.4db24c6089bf8p-4,
+-0x1.c9b6498c1137fp+0,
+0x2.b51241f8e8d64p+0,
+-0xd.e5a511f3999bp-4,
+-0x1.bcb4a23e73336p+0,
+0x2.cf15909424df4p+0,
+-0xa.a53b3e8c1877p-4,
+-0x1.54a767d1830eep+0,
+-0x2.a94ecfa3061dcp+0,
+0xf.5e135caff0a78p-4,
+0x1.ebc26b95fe14fp+0,
+-0x2.70f9fde50f1c4p+0,
+0x1.668ad946ed0dap+0,
+0x2.cd15b28dda1b4p+0,
+-0xa.e536ff5570fbp-4,
+-0x1.5ca6dfeaae1f6p+0,
+-0x2.b94dbfd55c3ecp+0,
+0xd.5e3556652c89p-4,
+0x1.abc6aacca5912p+0,
+-0x2.f0f17f77c023ep+0,
+0x6.69bd6218afe64p-4,
+0xc.d37ac4315fcc8p-4,
+0x1.9a6f58862bf99p+0,
+-0x3.13a02404b352ep+0,
+0x2.13e8d07a4a046p-4,
+0x4.27d1a0f49408cp-4,
+0x8.4fa341e928118p-4,
+0x1.09f4683d25023p+0,
+0x2.13e8d07a4a046p+0,
+-0x2.20ad341c773d4p+0,
+0x2.07246cd81ccb8p+0,
+-0x2.3a35fb60d1af2p+0,
+0x1.d412de4f67e7ep+0,
+-0x2.a05918723b764p+0,
+0x1.07cca42c94599p+0,
+0x2.0f99485928b32p+0,
+-0x2.294c445eb9dfep+0,
+0x1.f5e64c5397865p+0,
+-0x2.5cb23c69dc396p+0,
+0x1.8f1a5c3d52d34p+0,
+0x3.1e34b87aa5a68p+0,
+-0xc.15641bbff90bp-8,
+-0x1.82ac8377ff216p-4,
+-0x3.055906effe42cp-4,
+-0x6.0ab20ddffc858p-4,
+-0xc.15641bbff90bp-4,
+-0x1.82ac8377ff216p+0,
+-0x3.055906effe42cp+0,
+0x3.dccc7310ec09ap-4,
+0x7.b998e621d8134p-4,
+0xf.7331cc43b0268p-4,
+0x1.ee6639887604dp+0,
+-0x2.6bb262001f3c6p+0,
+0x1.711a1110cccd4p+0,
+0x2.e2342221999a8p+0,
+-0x8.41690cdd81128p-4,
+-0x1.082d219bb0225p+0,
+-0x2.105a43376044ap+0,
+0x2.27ca4ea24abcep+0,
+-0x1.f8ea37cc75cc6p+0,
+0x2.56aa65781fad6p+0,
+-0x1.9b2a0a20cbeb6p+0,
+0x3.122ac0cf736f6p+0,
+-0x2.4295372246764p-4,
+-0x4.852a6e448cec8p-4,
+-0x9.0a54dc8919d9p-4,
+-0x1.214a9b91233b2p+0,
+-0x2.4295372246764p+0,
+0x1.c35466cc7e59ap+0,
+-0x2.c1d607780e92ep+0,
+0xc.4d2c620ee206p-4,
+0x1.89a58c41dc40cp+0,
+0x3.134b1883b8818p+0,
+-0x2.1e8a4099a4324p-4,
+-0x4.3d14813348648p-4,
+-0x8.7a29026690c9p-4,
+-0x1.0f45204cd2192p+0,
+-0x2.1e8a4099a4324p+0,
+0x2.0b6a53ddc2e18p+0,
+-0x2.31aa2d558583p+0,
+0x1.e52a7a6600402p+0,
+-0x2.7e29e0450ac5cp+0,
+0x1.4c2b1486f5ba8p+0,
+0x2.9856290deb75p+0,
+-0x1.17d282f5345bfp+0,
+-0x2.2fa505ea68b7ep+0,
+0x1.e934c93c39d64p+0,
+-0x2.761542989799ap+0,
+0x1.5c544fdfdc12fp+0,
+0x2.b8a89fbfb825ep+0,
+-0xd.72d95919afa58p-4,
+-0x1.ae5b2b2335f4bp+0,
+0x2.ebc87eca9f5cap+0,
+-0x7.0edd77bcc8ccp-4,
+-0xe.1dbaef799198p-4,
+-0x1.c3b75def3233p+0,
+0x2.c1101932a6e02p+0,
+-0xc.65ea2abbd85fp-4,
+-0x1.8cbd45577b0bep+0,
+-0x3.197a8aaef617cp+0,
+0x1.589bfb31f1687p-4,
+0x2.b137f663e2d0ep-4,
+0x5.626fecc7c5a1cp-4,
+0xa.c4dfd98f8b438p-4,
+};
+
+/* return h such that x - pi/4 mod (2*pi) ~ h */
+static double
+reduce_mod_twopi (double x)
+{
+  double sign = 1, h;
+  if (x < 0)
+    {
+      x = -x;
+      sign = -1;
+    }
+  h = 0; /* invariant is x+h mod (2*pi) */
+  while (x >= 4)
+    {
+      int i = ilogb (x);
+      x -= ldexp (1.0f, i);
+      h += T[i];
+    }
+  /* add the remainder x */
+  h += x;
+  /* subtract pi/4 */
+  double piover4 = 0xc.90fdaa22168cp-4;
+  h -= piover4;
+  /* reduce mod 2*pi */
+  double twopi = 0x6.487ed5110b46p+0;
+  while (fabs (h) > twopi * 0.5)
+    {
+      if (h > 0)
+        h -= twopi;
+      else
+        h += twopi;
+    }
+  return sign * h;
+}
+
+/* formula page 5 of https://www.cl.cam.ac.uk/~jrh13/papers/bessel.pdf */
+static float
+j0f_asympt (float x)
+{
+  /* the following code fails to give an error <= 9 ulps in only one case,
+     for which we tabulate the result */
+  if (x == 0x1.4665d2p+24f)
+    return 0xa.50206p-52f;
+  double y = 1.0 / (double) x;
+  double y2 = y * y;
+  /* beta0 = 1 - 16/x^2 + 53/(512*x^4) */
+  double beta0 = 1.0f + y2 * (-0x1p-4f + 0x1.a8p-4 * y2);
+  /* alpha0 = 1/(8*x) - 25/(384*x^3) */
+  double alpha0 = y * (0x2p-4 - 0x1.0aaaaap-4 * y2);
+  double h;
+  h = reduce_mod_twopi ((double) x);
+  /* subtract alpha0 */
+  h -= alpha0;
+  /* now reduce mod pi/2 */
+  double piover2 = 0x1.921fb54442d18p+0;
+  int n = 0;
+  while (fabs (h) > piover2 / 2)
+    {
+      if (h > 0)
+        {
+          h -= piover2;
+          n ++;
+        }
+      else
+        {
+          h += piover2;
+          n --;
+        }
+    }
+  /* now x - pi/4 - alpha0 = h + n*pi/2 mod (2*pi) */
+  float xr = (float) h;
+  n = n & 3;
+  float cst = 0xc.c422ap-4; /* sqrt(2/pi) rounded to nearest */
+  float t = cst / sqrtf (x) * (float) beta0;
+  if (n == 0)
+    return t * cosf (xr);
+  else if (n == 2) /* cos(x+pi) = -cos(x) */
+    return -t * cosf (xr);
+  else if (n == 1) /* cos(x+pi/2) = -sin(x) */
+    return -t * sinf (xr);
+  else             /* cos(x+3pi/2) = sin(x) */
+    return t * sinf (xr);
+}
+
+/* Special code for x near a root of j0.
+   z is the value computed by the generic code.
+   For small x, we use a polynomial approximating j0 around its root.
+   For large x, we use an asymptotic formula (j0f_asympt). */
+static float
+j0f_near_root (float x, float z)
+{
+  float index_f;
+  int index;
+  index_f = roundf ((x - FIRST_ZERO) / (float) M_PI);
+  /* j0f_asympt fails to give an error <= 9 ulps for x=0x1.324e92p+7 (index 48)
+     thus we can't reduce SMALL_SIZE below 49 */
+  if (index_f >= SMALL_SIZE)
+    return j0f_asympt (x);
+  index = (int) index_f;
+  float *p = P[index];
+  float x0 = p[0];
+  float x1 = p[2];
+  /* if we are not in the interval [x0,x1] around xmid, we return the value z */
+  if (! (x0 <= x && x <= x1))
+    return z;
+  float xmid = p[1];
+  float y = x - xmid;
+  return p[3] + y * (p[4] + y * (p[5] + y * p[6]));
+}
+
 float
 __ieee754_j0f(float x)
 {
@@ -75,12 +399,17 @@ __ieee754_j0f(float x)
 	 * j0(x) = 1/sqrt(pi) * (P(0,x)*cc - Q(0,x)*ss) / sqrt(x)
 	 * y0(x) = 1/sqrt(pi) * (P(0,x)*ss + Q(0,x)*cc) / sqrt(x)
 	 */
-		if(ix>0x5c000000) z = (invsqrtpi*cc)/sqrtf(x);
-		else {
-		    u = pzerof(x); v = qzerof(x);
-		    z = invsqrtpi*(u*cc-v*ss)/sqrtf(x);
-		}
-		return z;
+                if (ix <= 0x5c000000)
+                  {
+                    u = pzerof(x); v = qzerof(x);
+                    cc = u*cc-v*ss;
+                  }
+                z = (invsqrtpi * cc) / sqrtf(x);
+                float magic = 0x1.8p+20; /* 2^20 + 2^19 */
+                if (magic + cc != magic) /* most likely */
+                  return z;
+                else /* |cc| <= 2^-4 */
+                  return j0f_near_root (x, z);
 	}
 	if(ix<0x39000000) {	/* |x| < 2**-13 */
 	    math_force_eval(huge+x);		/* raise inexact if x != 0 */
diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps
index 633d2ab8e4..75deb1462d 100644
--- a/sysdeps/x86_64/fpu/libm-test-ulps
+++ b/sysdeps/x86_64/fpu/libm-test-ulps
@@ -1312,22 +1312,22 @@ float128: 1
 ldouble: 1
 
 Function: "j0":
-double: 2
-float: 8
-float128: 2
+double: 5
+float: 9
+float128: 4
 ldouble: 2
 
 Function: "j0_downward":
 double: 2
-float: 4
+float: 9
 float128: 4
 ldouble: 6
 
 Function: "j0_towardzero":
-double: 5
-float: 6
+double: 9
+float: 7
 float128: 4
-ldouble: 6
+ldouble: 9
 
 Function: "j0_upward":
 double: 4
-- 
2.29.2



More information about the Libc-alpha mailing list