[PATCH 3/4] Use libc_fe* macros in ldbl-128/e_expl.c.

Thu Mar 26 14:53:54 GMT 2020

On 26/03/2020 06:08, Stefan Liebler via Libc-alpha wrote:
> On 3/25/20 4:42 PM, Adhemerval Zanella via Libc-alpha wrote:
>>
>>
>> On 25/03/2020 12:07, Adhemerval Zanella wrote:
>>>
>>>
>>> On 25/03/2020 12:00, Adhemerval Zanella wrote:
>>>>
>>>>
>>>> On 25/03/2020 07:13, Stefan Liebler via Libc-alpha wrote:
>>>>> Unfortunately, this patch is responsible for testfails on x86_64:
>>>>>
>>>>> math/test-float128-exp.out:
>>>>> Failure: exp (-0x1p-10000): Exception "Underflow" set
>>>>> Failure: exp (-0x2p-16384): Exception "Underflow" set
>>>>> ...
>>>>>
>>>>> math/test-float128-cexp.out:
>>>>> Failure: Real part of: cexp (0x2p-16384 - 0x4p-1076 i): Exception "Underflow" set
>>>>> Failure: Real part of: cexp (0x2p-16384 - 0x8p-152 i): Exception "Underflow" set
>>>>
>>>> The sysdeps/x86/fpu/fenv_private.h states:
>>>>
>>>> 296 #ifdef __x86_64__
>>>> 297 /* The SSE rounding mode is used by soft-fp (libgcc and glibc) on
>>>> 298    x86_64, so that must be set for float128 computations.  */
>>>> 299 # define SET_RESTORE_ROUNDF128(RM) \
>>>> 300   SET_RESTORE_ROUND_GENERIC (RM, libc_feholdsetround_sse, libc_feresetround_sse)
>>>>
>>>> So
>>>>
>>>>>> diff --git a/sysdeps/ieee754/ldbl-128/e_expl.c b/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>>> index 37c1538c08..104ace1690 100644
>>>>>> --- a/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>>> +++ b/sysdeps/ieee754/ldbl-128/e_expl.c
>>>>>> @@ -66,6 +66,7 @@
>>>>>>    #include <inttypes.h>
>>>>>>    #include <math-barriers.h>
>>>>>>    #include <math_private.h>
>>>>>> +#include <fenv_private.h>
>>>>>>    #include <math-underflow.h>
>>>>>>    #include <stdlib.h>
>>>>>>    #include "t_expl.h"
>>>>>> @@ -146,9 +147,10 @@ __ieee754_expl (_Float128 x)
>>>>>>          union ieee854_long_double ex2_u, scale_u;
>>>>>>          fenv_t oldenv;
>>>>>>    -      feholdexcept (&oldenv);
>>>>>>    #ifdef FE_TONEAREST
>>>>>> -      fesetround (FE_TONEAREST);
>>>>>> +      libc_feholdexcept_setroundl (&oldenv, FE_TONEAREST);
>>>>
>>>> Should be libc_feholdexcept_setroundf128.
>>>
>>> But it does not see to help here, so I don't know what is failing as well.
>>
>> Ok, so what is happening __sfp_handle_exceptions always use 387 exception
>> mode for FP_EX_OVERFLOW and FP_EX_UNDERFLOW:
>>
>> config/i386/sfp-exceptions.c
>>
>>   79   if (_fex & FP_EX_OVERFLOW)
>>   80     {
>>   81       struct fenv temp;
>>   82       asm volatile ("fnstenv\t%0" : "=m" (temp));
>>   83       temp.__status_word |= FP_EX_OVERFLOW;
>>   84       asm volatile ("fldenv\t%0" : : "m" (temp));
>>   85       asm volatile ("fwait");
>>   86     }
>>   87   if (_fex & FP_EX_UNDERFLOW)
>>   88     {
>>   89       struct fenv temp;
>>   90       asm volatile ("fnstenv\t%0" : "=m" (temp));
>>   91       temp.__status_word |= FP_EX_UNDERFLOW;
>>   92       asm volatile ("fldenv\t%0" : : "m" (temp));
>>   93       asm volatile ("fwait");
>>   94     }
>> Yes this looks like the mentioned disassembly.
>> Different that FP_EX_INEXACT, for instance, where __SSE_MATH__ sets
>> whether SSE is used or not.
>>
>> So I think it is not safe to use the SSE variants for libc_*_testf128,
>> as for i387 we should use the default_* instead.
>>
> I've just switched to default_* in sysdeps/x86/fpu/fenv_private.h:
> -#ifdef __x86_64__
> +#if 0
>  /* The SSE rounding mode is used by soft-fp (libgcc and glibc) on
>     x86_64, so that must be set for float128 computations.  */
>  # define SET_RESTORE_ROUNDF128(RM) \
> 
> But now there are 7 testfails. For some of them, the max.ulp goes up (over 10), but there are even worse cases. Here are shortened excerpts of the out files:
> - math/test-float128-clog.out:
> Failure: Test: Real part of: clog_towardzero (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>  ulp       :  162259276829213363391578010288127.0000
>  max.ulp   :  3.0000
> Failure: Test: Real part of: clog_upward (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>  ulp       :  162259276829213363391578010288128.0000
>  max.ulp   :  4.0000
> 
> - math/test-float128-clog10.out:
> Failure: Test: Real part of: clog10_downward (0x3.bea2bd62e35p-4 + 0xf.8e3d619a8d11bfd30b038eep-4 i)
>  ulp       :  4.0000
>  max.ulp   :  3.0000
> Failure: Test: Real part of: clog10_towardzero (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>  ulp       :  140936617129079063283494433422698.0000
>  max.ulp   :  4.0000
> Failure: Test: Real part of: clog10_upward (0x2.82b795e420b281a934c6dd315cb2p-4 + 0xf.cd42a15bf9a361243a89663e81e8p-4 i)
>  ulp       :  140936617129079063283494433422698.0000
>  max.ulp   :  4.0000
> 
> - math/test-float128-jn.out
> - math/test-float128-lgamma.out
> - math/test-float128-tgamma.out:
> something like:
>  ulp       :  12.0000
>  max.ulp   :  4.0000
> 
> Failure: tgamma_upward (-0x6.ec00000000000008p+8): errno set to 0, expected 34 (ERANGE)
> 
> - math/test-float128-y1.out:
> Failure: Test: y1_downward (0x2p+0)
>  ulp       :  13.0000
>  max.ulp   :  4.0000
> Failure: Test: y1_towardzero (0x2p+0)
>  ulp       :  6.0000
>  max.ulp   :  2.0000
> Failure: Test: y1_upward (0x2p+0)
>  ulp       :  10.0000
>  max.ulp   :  5.0000
> 
> - math/test-float128-yn.out
> 

So it seems the issue is the mix of how the libm fenv functions, the internal
libc_fe* macros, and libgcc handle the exception registers.  The exported fenv
functions operate on both i387 and SSE (since they should work on long double
as well), and the internal libc_fe* will use SSE for float, double, and
float128, and i387 for long double (as expected).

libgcc, however, will set either SSE or i387 depending on the exception.
This breaks the assumption of libc_fe* for float128 that only one of SSE or
i387 will be used.

One option might be to force libgcc to not use its __sfp_handle_exceptions
on x86_64 and provide one that uses only SSE operations, since libgcc does
not use 'long double' on float128 operations.  The patch below does that,
and applied on top of your patches it shows no regressions.

And I think we should fix libgcc in a similar manner, since checking
config/i386/64/sfp-machine.h shows that it only supports the SSE rounding mode.

--

diff --git a/sysdeps/x86/fpu/sfp-exceptions.c b/sysdeps/x86/fpu/sfp-exceptions.c
new file mode 100644
index 0000000000..676f396bc3
--- /dev/null
+++ b/sysdeps/x86/fpu/sfp-exceptions.c
@@ -0,0 +1,49 @@
+#include <fenv.h>
+#include <float.h>
+#include <math-barriers.h>
+
+#define FP_EX_INVALID           0x01
+#define FP_EX_DENORM            0x02
+#define FP_EX_DIVZERO           0x04
+#define FP_EX_OVERFLOW          0x08
+#define FP_EX_UNDERFLOW         0x10
+#define FP_EX_INEXACT           0x20
+#define FP_EX_ALL \
+        (FP_EX_INVALID | FP_EX_DENORM | FP_EX_DIVZERO | FP_EX_OVERFLOW \
+         | FP_EX_UNDERFLOW | FP_EX_INEXACT)
+
+void
+__sfp_handle_exceptions (int _fex)
+{ /* For each FP_EX_* bit set in _fex, evaluate a float operation chosen to raise that exception.  */
+  if (_fex & FP_EX_INVALID) /* 0.0f / 0.0f is an invalid operation.  */
+    {
+      float f = 0.0f;
+      math_force_eval (f / f);
+    }
+  if (_fex & FP_EX_DENORM) /* NOTE(review): FLT_MIN / 2.0f has normal operands and a subnormal result; confirm this sets the denormal flag.  */
+    {
+      float f = FLT_MIN, g = 2.0f;
+      math_force_eval (f / g);
+    }
+  if (_fex & FP_EX_DIVZERO) /* Finite nonzero value divided by zero.  */
+    {
+      float f = 1.0f, g = 0.0f;
+      math_force_eval (f / g);
+    }
+  if (_fex & FP_EX_OVERFLOW) /* FLT_MAX squared exceeds the float range.  */
+    {
+      float force_overflow = FLT_MAX * FLT_MAX;
+      math_force_eval (force_overflow);
+    }
+  if (_fex & FP_EX_UNDERFLOW) /* FLT_MIN squared is too small for float.  */
+    {
+      float force_underflow = FLT_MIN * FLT_MIN;
+      math_force_eval (force_underflow);
+    }
+  if (_fex & FP_EX_INEXACT) /* 1/3 is not exactly representable.  */
+    {
+      float f = 1.0f, g = 3.0f;
+      math_force_eval (f / g);
+    }
+}
+strong_alias (__sfp_handle_exceptions, __wrap___sfp_handle_exceptions)
diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
index a4ff2723a8..5becb96fa3 100644
--- a/sysdeps/x86_64/fpu/Makefile
+++ b/sysdeps/x86_64/fpu/Makefile
@@ -25,6 +25,9 @@ endif
 
 # Variables for libmvec tests.
 ifeq ($(subdir),math)
+libm-routines += sfp-exceptions
+LDFLAGS-m.so += -Wl,--wrap=__sfp_handle_exceptions
+
 ifeq ($(build-mathvec),yes)
 libmvec-tests += double-vlen2 double-vlen4 double-vlen4-avx2 \
 		 float-vlen4 float-vlen8 float-vlen8-avx2