[PATCH] PowerPC: Trigonometric optimizations for POWER
Adhemerval Zanella
azanella@linux.vnet.ibm.com
Tue Oct 25 19:06:00 GMT 2011
On 09/14/2011 10:13 AM, Andreas Schwab wrote:
> You didn't run the testsuite.
>
> check-localplt.out:
> --- ../scripts/data/localplt-powerpc-linux-gnu.data 2011-09-13 18:01:15.000000000 +0000
> +++ - 2011-09-14 11:55:41.838340244 +0000
> @@ -4,4 +4,8 @@
> libc.so: malloc
> libc.so: memalign
> libc.so: realloc
> +libm.so: floorf
> libm.so: matherr
> +libm.so: sqrt
> +libm.so: sqrtf
> +libm.so: truncf
>
> Andreas.
>
I used these symbols and not the usual GLIBC ones (__ieee754_sqrt, __floorf, __truncf)
so the compiler could use the intrinsic PPC instruction for each function
(fsqrt[f], frim, friz) instead of branches to internal symbols.
At first sight it works correctly for any ISA 2.03+ (POWER5+ and forward); however,
you probably built it using POWER4 optimizations, and then the symbols generated
PLT calls.
I also found another issue related to sqrt(): although it is defined on ISA 2.02 (POWER4),
newer GCCs translate it to:
sqrt(x):
r = fsqrt(x) // PPC instruction
if (x < 0)
r = sqrt(x) // PLT call
Some older GCC versions (like the one on RHEL6, which I was using), however, translate it to just an
'fsqrt' instruction. Since in this algorithm x is always higher than 0, we can use 'fsqrt'
directly.
You corrected it by calling the GLIBC internal symbols (__ieee754_sqrt, __floorf, __truncf).
However, this is not the best solution regarding performance: this patch creates inline
assembly functions that use the PPC instructions when the compilation target allows, relying on the
internal GLIBC functions only when the instructions are not implemented (for instance, on POWER4).
---
[PATCH] PowerPC - Patch for assembly version of truncf/floorf
This patch creates inline assembly functions that use intrinsic PPC
floating point instructions when the platform supports them but rely on
the internal GLIBC functions when the instructions are not implemented
(for instance, on POWER4).
2011-08-01 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
* sysdeps/powerpc/fpu/e_hypot.c: Use PPC intrinsic instruction, when
available, for __ieee754_sqrt.
* sysdeps/powerpc/fpu/e_hypotf.c: Use PPC intrinsic instruction, when
available, for __ieee754_sqrtf.
* sysdeps/powerpc/fpu/e_rem_pio2f.c: Use PPC intrinsic instruction,
when available, for __floorf.
* sysdeps/powerpc/fpu/k_rem_pio2f.c: Use PPC intrinsic instruction,
when available, for __floorf and __truncf.
* sysdeps/powerpc/fpu/math_private.h: Implemented inline function to
use PPC intrinsic instructions for __ieee754_sqrt, __floorf, and
__truncf.
sysdeps/powerpc/fpu/e_hypot.c | 8 ++--
sysdeps/powerpc/fpu/e_hypotf.c | 8 ++--
sysdeps/powerpc/fpu/e_rem_pio2f.c | 8 ++--
sysdeps/powerpc/fpu/k_rem_pio2f.c | 8 ++--
sysdeps/powerpc/fpu/math_private.h | 83 +++++++++++++++++++++++++++++++++++-
5 files changed, 98 insertions(+), 17 deletions(-)
diff --git a/sysdeps/powerpc/fpu/e_hypot.c b/sysdeps/powerpc/fpu/e_hypot.c
index 454a59d..e38bc49 100644
--- a/sysdeps/powerpc/fpu/e_hypot.c
+++ b/sysdeps/powerpc/fpu/e_hypot.c
@@ -96,7 +96,7 @@ __ieee754_hypot (double x, double y)
{
x *= twoM600;
y *= twoM600;
- return __ieee754_sqrt (x * x + y * y) / twoM600;
+ return __ieee754_sqrt_inline (x * x + y * y) / twoM600;
}
if (y < twoM500)
{
@@ -104,14 +104,14 @@ __ieee754_hypot (double x, double y)
{
x *= two1022;
y *= two1022;
- return __ieee754_sqrt (x * x + y * y) / two1022;
+ return __ieee754_sqrt_inline (x * x + y * y) / two1022;
}
else
{
x *= two600;
y *= two600;
- return __ieee754_sqrt (x * x + y * y) / two600;
+ return __ieee754_sqrt_inline (x * x + y * y) / two600;
}
}
- return __ieee754_sqrt (x * x + y * y);
+ return __ieee754_sqrt_inline (x * x + y * y);
}
diff --git a/sysdeps/powerpc/fpu/e_hypotf.c b/sysdeps/powerpc/fpu/e_hypotf.c
index e3757ff..a7d03a3 100644
--- a/sysdeps/powerpc/fpu/e_hypotf.c
+++ b/sysdeps/powerpc/fpu/e_hypotf.c
@@ -97,7 +97,7 @@ __ieee754_hypotf (float x, float y)
{
x *= twoM60;
y *= twoM60;
- return __ieee754_sqrtf (x * x + y * y) / twoM60;
+ return __ieee754_sqrtf_inline (x * x + y * y) / twoM60;
}
if (y < twoM50)
{
@@ -105,14 +105,14 @@ __ieee754_hypotf (float x, float y)
{
x *= two126;
y *= two126;
- return __ieee754_sqrtf (x * x + y * y) / two126;
+ return __ieee754_sqrtf_inline (x * x + y * y) / two126;
}
else
{
x *= two60;
y *= two60;
- return __ieee754_sqrtf (x * x + y * y) / two60;
+ return __ieee754_sqrtf_inline (x * x + y * y) / two60;
}
}
- return __ieee754_sqrtf (x * x + y * y);
+ return __ieee754_sqrtf_inline (x * x + y * y);
}
diff --git a/sysdeps/powerpc/fpu/e_rem_pio2f.c b/sysdeps/powerpc/fpu/e_rem_pio2f.c
index a0c2890..a5dd080 100644
--- a/sysdeps/powerpc/fpu/e_rem_pio2f.c
+++ b/sysdeps/powerpc/fpu/e_rem_pio2f.c
@@ -112,7 +112,7 @@ __ieee754_rem_pio2f (float x, float *y)
}
if (ax <= pio2_2e7)
{
- n = __floorf (ax * invpio2 + half);
+ n = __floorf_inline (ax * invpio2 + half);
i = (int32_t) n;
r = ax - n * pio2_1;
w = n * pio2_1t; /* 1st round good to 40 bit */
@@ -168,11 +168,11 @@ __ieee754_rem_pio2f (float x, float *y)
e0 = __float_and8 (ax / 128.0);
z = ax / e0;
- tx[0] = __floorf (z);
+ tx[0] = __floorf_inline (z);
z = (z - tx[0]) * two8;
- tx[1] = __floorf (z);
+ tx[1] = __floorf_inline (z);
z = (z - tx[1]) * two8;
- tx[2] = __floorf (z);
+ tx[2] = __floorf_inline (z);
nx = 3;
while (tx[nx - 1] == zero)
diff --git a/sysdeps/powerpc/fpu/k_rem_pio2f.c b/sysdeps/powerpc/fpu/k_rem_pio2f.c
index edaef09..1e3cf06 100644
--- a/sysdeps/powerpc/fpu/k_rem_pio2f.c
+++ b/sysdeps/powerpc/fpu/k_rem_pio2f.c
@@ -135,16 +135,16 @@ recompute:
/* distill q[] into iq[] reversingly */
for (i = 0, j = jz, z = q[jz]; j > 0; i++, j--)
{
- fw = __truncf (twon8 * z);
+ fw = __truncf_inline (twon8 * z);
iq[i] = (int32_t) (z - two8 * fw);
z = q[j - 1] + fw;
}
/* compute n */
z = __scalbnf (z, q0); /* actual value of z */
- z -= 8.0 * __floorf (z * 0.125); /* trim off integer >= 8 */
+ z -= 8.0 * __floorf_inline (z * 0.125); /* trim off integer >= 8 */
n = (int32_t) z;
- z -= __truncf (z);
+ z -= __truncf_inline (z);
ih = 0;
if (q0 > 0)
{ /* need iq[jz-1] to determine n */
@@ -234,7 +234,7 @@ recompute:
z = __scalbnf (z, -q0);
if (z >= two8)
{
- fw = __truncf (twon8 * z);
+ fw = __truncf_inline (twon8 * z);
iq[jz] = (int32_t) (z - two8 * fw);
jz += 1;
q0 += 8;
diff --git a/sysdeps/powerpc/fpu/math_private.h b/sysdeps/powerpc/fpu/math_private.h
index 90021c6..881e5eb 100644
--- a/sysdeps/powerpc/fpu/math_private.h
+++ b/sysdeps/powerpc/fpu/math_private.h
@@ -27,9 +27,90 @@
# if __WORDSIZE == 64 || defined _ARCH_PWR4
# define __CPU_HAS_FSQRT 1
+
+static inline double
+__ieee754_sqrt_inline (double __x)
+{
+ double __z;
+ __asm __volatile (
+ " fsqrt %0,%1\n"
+ : "=f" (__z)
+ : "f" (__x));
+ return __z;
+}
+
+static inline float
+__ieee754_sqrtf_inline (float __x)
+{
+ float __z;
+ __asm __volatile (
+ " fsqrts %0,%1\n"
+ : "=f" (__z)
+ : "f" (__x));
+ return __z;
+}
+
# else
# define __CPU_HAS_FSQRT ((GLRO(dl_hwcap) & PPC_FEATURE_64) != 0)
-# endif
+# define __ieee754_sqrt_inline __ieee754_sqrt
+# endif // __WORDSIZE == 64 || defined _ARCH_PWR4
+
+
+# if defined _ARCH_PWR5X
+
+static inline double
+__floor_inline (double __x)
+{
+ double __z;
+ __asm __volatile (
+ " frim %0,%1\n"
+ : "=f" (__z)
+ : "f" (__x));
+ return __z;
+}
+
+static inline float
+__floorf_inline (float __x)
+{
+ float __z;
+ __asm __volatile (
+ " frim %0,%1\n"
+ " frsp %0,%0\n"
+ : "=f" (__z)
+ : "f" (__x));
+ return __z;
+}
+
+static inline double
+__trunc_inline (double __x)
+{
+ double __z;
+ __asm __volatile (
+ " friz %0,%1\n"
+ : "=f" (__z)
+ : "f" (__x));
+ return __z;
+}
+
+static inline float
+__truncf_inline (float __x)
+{
+ float __z;
+ __asm __volatile (
+ " friz %0,%1\n"
+ " frsp %0,%0\n"
+ : "=f" (__z)
+ : "f" (__x));
+ return __z;
+}
+
+# else
+# define __floor_inline __floor
+# define __floorf_inline __floorf
+# define __trunc_inline __trunc
+# define __truncf_inline __truncf
+# endif // _ARCH_PWR5X
+
# ifndef __LIBC_INTERNAL_MATH_INLINES
extern double __slow_ieee754_sqrt (double);
--
1.7.1
More information about the Libc-alpha
mailing list