This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH v3] Don't spin around multiplying zeroes in __mul


Hi,

I managed to improve this patch a bit further.  Attached is an updated
patch, this time also with comments describing the rationale.  The
testsuite continues to run clean o x86_64 and the performance numbers
are as follows:

Without the patch:

Total:41379782042, Fastest:402117, Slowest:795883, Avg:413797.820420

With the patch:

Total:26096919666, Fastest:256379, Slowest:811423, Avg:260969.196660

That makes it a 36.9% improvement on average runtime and 36.2%
improvement on the fastest time.  So that's about 7% better than the
last patch.


Siddhesh

	* sysdeps/ieee754/dbl-64/mpa.c (__mul): Don't bother with zero
	values in the mantissa.


diff --git a/sysdeps/ieee754/dbl-64/mpa.c b/sysdeps/ieee754/dbl-64/mpa.c
index dffa9d8..0555493 100644
--- a/sysdeps/ieee754/dbl-64/mpa.c
+++ b/sysdeps/ieee754/dbl-64/mpa.c
@@ -604,7 +604,7 @@ void
 SECTION
 __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
 {
-  int i, j, k, k2;
+  int i, j, k, ip, ip2;
   double u, zk;
 
   /* Is z=0?  */
@@ -614,11 +614,35 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
       return;
     }
 
+  /* We need not iterate through all X's and Y's since it's pointless to
+     multiply zeroes.  Here, both are zero...  */
+  for (ip2 = p; ip2 > 0; ip2--)
+    if (X[ip2] != ZERO || Y[ip2] != ZERO)
+      break;
+
+  /* ... and here, at least one of them is still zero.  */
+  for (ip = ip2; ip > 0; ip--)
+    if (X[ip] * Y[ip] != ZERO)
+      break;
+
   /* Multiply, add and carry.  */
-  k2 = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
-  zk = Z[k2] = ZERO;
+  k = (__glibc_unlikely (p < 3)) ? p + p : p + 3;
+
+  /* For X with precision IP and Y with precision IP2, the term X[I]*Y[K-I] is
+     non-zero only when the ranges of non-zero values overlap.  This happens
+     only for values of K <= IP + IP2.  Note that we go from 1..K-1, which is
+     why we come down to IP + IP2 + 1 and not just IP + IP2.  */
+  while (k > ip + ip2 + 1)
+    Z[k--] = ZERO;
+
+  zk = Z[k] = ZERO;
 
-  for (k = k2; k > p; k--)
+  /* This gives us additional precision if required.  This is only executed
+     when P < IP1 + IP2 + 1, i.e. at least one of the numbers have precision
+     of greater than or equal to half of what's required (P).  Anything less
+     and we're just wasting our time since we'll be spinning around
+     multiplying zeroes.  */
+  while (k > p)
     {
       for (i = k - p, j = p; i < p + 1; i++, j--)
 	zk += X[i] * Y[j];
@@ -626,10 +650,11 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
       u = (zk + CUTTER) - CUTTER;
       if (u > zk)
 	u -= RADIX;
-      Z[k] = zk - u;
+      Z[k--] = zk - u;
       zk = u * RADIXI;
     }
 
+  /* The real deal.  */
   while (k > 1)
     {
       for (i = 1, j = k - 1; i < k; i++, j--)
@@ -638,9 +663,8 @@ __mul (const mp_no *x, const mp_no *y, mp_no *z, int p)
       u = (zk + CUTTER) - CUTTER;
       if (u > zk)
 	u -= RADIX;
-      Z[k] = zk - u;
+      Z[k--] = zk - u;
       zk = u * RADIXI;
-      k--;
     }
   Z[k] = zk;
 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]