[PATCH] PowerPC: Trigonometric optimizations for POWER

Tue Oct 25 19:06:00 GMT 2011

On 09/14/2011 10:13 AM, Andreas Schwab wrote:
> You didn't run the testsuite.
>
> check-localplt.out:
> --- ../scripts/data/localplt-powerpc-linux-gnu.data	2011-09-13 18:01:15.000000000 +0000
> +++ -	2011-09-14 11:55:41.838340244 +0000
> @@ -4,4 +4,8 @@
>  libc.so: malloc
>  libc.so: memalign
>  libc.so: realloc
> +libm.so: floorf
>  libm.so: matherr
> +libm.so: sqrt
> +libm.so: sqrtf
> +libm.so: truncf
>
> Andreas.
>
I used these symbols rather than the usual GLIBC internal ones (__ieee754_sqrt,
__floorf, __truncf) so that the compiler could emit the corresponding PPC
instruction for each function (fsqrt[s], frim, friz) instead of a branch to an
internal symbol.

At first sight it works correctly for any ISA 2.03+ (POWER5+ and forward); however,
you probably built it with POWER4 optimizations, in which case the symbols generated
PLT calls.

I also found another issue related to sqrt(): although it is defined on ISA 2.02 (POWER4),
newer GCCs translate it to:

sqrt(x):
r = fsqrt(x)  // PPC instruction
if (x < 0)
  r = sqrt(x) // PLT call

Some older GCC versions (like the one on RHEL6, which I was using) translate it to just an
'fsqrt' instruction. Since in this algorithm x is never negative, we can use 'fsqrt'
directly.

You corrected it by calling the GLIBC internal symbols (__ieee754_sqrt, __floorf, __truncf).
However, this is not the best solution regarding performance: this patch creates inline
assembly functions that use the PPC instructions when the compilation target allows it, relying
on the internal GLIBC functions only when the instructions are not implemented (for instance, on POWER4).

---

[PATCH] PowerPC - Patch for assembly version of truncf/floorf

This patch creates inline assembly functions that use intrinsic PPC
floating point instructions when the platform supports them but rely on
the internal GLIBC functions when the instructions are not implemented
(for instance, on POWER4).

2011-08-01  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>

        * sysdeps/powerpc/fpu/e_hypot.c: Use PPC intrinsic instruction, when
	available, for __ieee754_sqrt.
        * sysdeps/powerpc/fpu/e_hypotf.c: Use PPC intrinsic instruction, when
	available, for __ieee754_sqrtf.
        * sysdeps/powerpc/fpu/e_rem_pio2f.c: Use PPC intrinsic instruction,
	when available, for __floorf.
        * sysdeps/powerpc/fpu/k_rem_pio2f.c: Use PPC intrinsic instructions,
	when available, for __floorf and __truncf.
        * sysdeps/powerpc/fpu/math_private.h: Implemented inline functions to
	use PPC intrinsic instructions for __ieee754_sqrt, __floorf, and
	__truncf.


 sysdeps/powerpc/fpu/e_hypot.c      |    8 ++--
 sysdeps/powerpc/fpu/e_hypotf.c     |    8 ++--
 sysdeps/powerpc/fpu/e_rem_pio2f.c  |    8 ++--
 sysdeps/powerpc/fpu/k_rem_pio2f.c  |    8 ++--
 sysdeps/powerpc/fpu/math_private.h |   83 +++++++++++++++++++++++++++++++++++-
 5 files changed, 98 insertions(+), 17 deletions(-)

diff --git a/sysdeps/powerpc/fpu/e_hypot.c b/sysdeps/powerpc/fpu/e_hypot.c
index 454a59d..e38bc49 100644
--- a/sysdeps/powerpc/fpu/e_hypot.c
+++ b/sysdeps/powerpc/fpu/e_hypot.c
@@ -96,7 +96,7 @@ __ieee754_hypot (double x, double y)
     {
       x *= twoM600;
       y *= twoM600;
-      return __ieee754_sqrt (x * x + y * y) / twoM600;
+      return __ieee754_sqrt_inline (x * x + y * y) / twoM600;
     }
   if (y < twoM500)
     {
@@ -104,14 +104,14 @@ __ieee754_hypot (double x, double y)
        {
          x *= two1022;
          y *= two1022;
-         return __ieee754_sqrt (x * x + y * y) / two1022;
+         return __ieee754_sqrt_inline (x * x + y * y) / two1022;
        }
       else
        {
          x *= two600;
          y *= two600;
-         return __ieee754_sqrt (x * x + y * y) / two600;
+         return __ieee754_sqrt_inline (x * x + y * y) / two600;
        }
     }
-  return __ieee754_sqrt (x * x + y * y);
+  return __ieee754_sqrt_inline (x * x + y * y);
 }
diff --git a/sysdeps/powerpc/fpu/e_hypotf.c b/sysdeps/powerpc/fpu/e_hypotf.c
index e3757ff..a7d03a3 100644
--- a/sysdeps/powerpc/fpu/e_hypotf.c
+++ b/sysdeps/powerpc/fpu/e_hypotf.c
@@ -97,7 +97,7 @@ __ieee754_hypotf (float x, float y)
     {
       x *= twoM60;
       y *= twoM60;
-      return __ieee754_sqrtf (x * x + y * y) / twoM60;
+      return __ieee754_sqrtf_inline (x * x + y * y) / twoM60;
     }
   if (y < twoM50)
     {
@@ -105,14 +105,14 @@ __ieee754_hypotf (float x, float y)
        {
          x *= two126;
          y *= two126;
-         return __ieee754_sqrtf (x * x + y * y) / two126;
+         return __ieee754_sqrtf_inline (x * x + y * y) / two126;
        }
       else
        {
          x *= two60;
          y *= two60;
-         return __ieee754_sqrtf (x * x + y * y) / two60;
+         return __ieee754_sqrtf_inline (x * x + y * y) / two60;
        }
     }
-  return __ieee754_sqrtf (x * x + y * y);
+  return __ieee754_sqrtf_inline (x * x + y * y);
 }
diff --git a/sysdeps/powerpc/fpu/e_rem_pio2f.c b/sysdeps/powerpc/fpu/e_rem_pio2f.c
index a0c2890..a5dd080 100644
--- a/sysdeps/powerpc/fpu/e_rem_pio2f.c
+++ b/sysdeps/powerpc/fpu/e_rem_pio2f.c
@@ -112,7 +112,7 @@ __ieee754_rem_pio2f (float x, float *y)
     }
   if (ax <= pio2_2e7)
     {
-      n = __floorf (ax * invpio2 + half);
+      n = __floorf_inline (ax * invpio2 + half);
       i = (int32_t) n;
       r = ax - n * pio2_1;
       w = n * pio2_1t;         /* 1st round good to 40 bit */
@@ -168,11 +168,11 @@ __ieee754_rem_pio2f (float x, float *y)
   e0 = __float_and8 (ax / 128.0);
   z = ax / e0;
 
-  tx[0] = __floorf (z);
+  tx[0] = __floorf_inline (z);
   z = (z - tx[0]) * two8;
-  tx[1] = __floorf (z);
+  tx[1] = __floorf_inline (z);
   z = (z - tx[1]) * two8;
-  tx[2] = __floorf (z);
+  tx[2] = __floorf_inline (z);
 
   nx = 3;
   while (tx[nx - 1] == zero)
diff --git a/sysdeps/powerpc/fpu/k_rem_pio2f.c b/sysdeps/powerpc/fpu/k_rem_pio2f.c
index edaef09..1e3cf06 100644
--- a/sysdeps/powerpc/fpu/k_rem_pio2f.c
+++ b/sysdeps/powerpc/fpu/k_rem_pio2f.c
@@ -135,16 +135,16 @@ recompute:
   /* distill q[] into iq[] reversingly */
   for (i = 0, j = jz, z = q[jz]; j > 0; i++, j--)
     {
-      fw = __truncf (twon8 * z);
+      fw = __truncf_inline (twon8 * z);
       iq[i] = (int32_t) (z - two8 * fw);
       z = q[j - 1] + fw;
     }
 
   /* compute n */
   z = __scalbnf (z, q0);       /* actual value of z */
-  z -= 8.0 * __floorf (z * 0.125);       /* trim off integer >= 8 */
+  z -= 8.0 * __floorf_inline (z * 0.125);       /* trim off integer >= 8 */
   n = (int32_t) z;
-  z -= __truncf (z);
+  z -= __truncf_inline (z);
   ih = 0;
   if (q0 > 0)
     {                          /* need iq[jz-1] to determine n */
@@ -234,7 +234,7 @@ recompute:
       z = __scalbnf (z, -q0);
       if (z >= two8)
        {
-         fw = __truncf (twon8 * z);
+         fw = __truncf_inline (twon8 * z);
          iq[jz] = (int32_t) (z - two8 * fw);
          jz += 1;
          q0 += 8;
diff --git a/sysdeps/powerpc/fpu/math_private.h b/sysdeps/powerpc/fpu/math_private.h
index 90021c6..881e5eb 100644
--- a/sysdeps/powerpc/fpu/math_private.h
+++ b/sysdeps/powerpc/fpu/math_private.h
@@ -27,9 +27,90 @@
 
 # if __WORDSIZE == 64 || defined _ARCH_PWR4
 #  define __CPU_HAS_FSQRT 1
+
+/* Double-precision square root done by a single fsqrt instruction.  */
+static inline double
+__ieee754_sqrt_inline (double __x)
+{
+  double __root;
+
+  __asm __volatile ("	fsqrt	%0,%1\n" : "=f" (__root) : "f" (__x));
+
+  return __root;
+}
+
+/* Single-precision square root done by a single fsqrts instruction.  */
+static inline float
+__ieee754_sqrtf_inline (float __x)
+{
+  float __root;
+
+  __asm __volatile ("       fsqrts  %0,%1\n" : "=f" (__root) : "f" (__x));
+
+  return __root;
+}
+
 # else
 #  define __CPU_HAS_FSQRT ((GLRO(dl_hwcap) & PPC_FEATURE_64) != 0)
-# endif
+#  define __ieee754_sqrt_inline __ieee754_sqrt
+#  define __ieee754_sqrtf_inline __ieee754_sqrtf /* needed by e_hypotf.c */
+# endif	/* __WORDSIZE == 64 || defined _ARCH_PWR4 */
+
+# if defined _ARCH_PWR5X
+
+/* Floor of __x, obtained from the frim instruction.  */
+static inline double
+__floor_inline (double __x)
+{
+  double __res;
+
+  __asm __volatile ("	frim %0,%1\n" : "=f" (__res) : "f" (__x));
+
+  return __res;
+}
+
+/* Float variant of the floor helper: frim, then frsp on its result.  */
+static inline float
+__floorf_inline (float __x)
+{
+  float __res;
+  __asm __volatile ("	frim %0,%1\n"
+		    "	frsp %0,%0\n"
+		    : "=f" (__res)
+		    : "f" (__x));
+  return __res;
+}
+
+/* Truncation of __x, obtained from the friz instruction.  */
+static inline double
+__trunc_inline (double __x)
+{
+  double __res;
+
+  __asm __volatile ("	friz %0,%1\n" : "=f" (__res) : "f" (__x));
+
+  return __res;
+}
+
+/* Float variant of the trunc helper: friz, then frsp on its result.  */
+static inline float
+__truncf_inline (float __x)
+{
+  float __res;
+  __asm __volatile ("	friz %0,%1\n"
+		    "	frsp %0,%0\n"
+		    : "=f" (__res)
+		    : "f" (__x));
+  return __res;
+}
+
+# else	/* !_ARCH_PWR5X: alias the inline names to the plain __floor and __trunc family.  */
+#  define __floor_inline  __floor
+#  define __floorf_inline __floorf
+#  define __trunc_inline  __trunc
+#  define __truncf_inline __truncf
+# endif	/* _ARCH_PWR5X */
+
 
 # ifndef __LIBC_INTERNAL_MATH_INLINES
 extern double __slow_ieee754_sqrt (double);
-- 
1.7.1