Optimization in cblas_dgemm routine

Trammell Hudson hudson@rotomotion.com
Tue Jan 14 17:24:00 GMT 2003


-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

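Here is a really minor optimization in the *gemm CBLAS routine that nets
about a 10% improvement on my Pentium 4 machine.  The patch computes the
row base pointers (C_i, F_i, F_k, G_k, G_j) once outside the innermost
loops, so the inner loops index with just j or k instead of recomputing
ld*-scaled offsets on every iteration.  Since HPL Linpack spends 95% of
its time in the cblas_dgemm call, this directly shows up in the final
GFLOPS results.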


- --- /tmp/gsl-1.3/cblas/source_gemm_r.h  Tue Jan 14 15:10:23 2003
+++ ./gsl-1.3/cblas/source_gemm_r.h     Tue Jan 14 15:08:16 2003
@@ -70,11 +70,14 @@
~     /* form  C := alpha*A*B + C */

~     for (k = 0; k < K; k++) {
+      const BASE *G_k = &G[ ldg * k ];
~       for (i = 0; i < n1; i++) {
~        const BASE temp = alpha * F[ldf * i + k];
+       BASE *C_i = &C[ ldc * i ];
+
~        if (temp != 0.0) {
~          for (j = 0; j < n2; j++) {
- -           C[ldc * i + j] += temp * G[ldg * k + j];
+           C_i[j] += temp * G_k[j];
~          }
~        }
~       }
@@ -85,23 +88,33 @@
~     /* form  C := alpha*A*B' + C */

~     for (i = 0; i < n1; i++) {
+      BASE *C_i = &C[ ldc * i ];
+      const BASE *F_i = &F[ ldf * i ];
+
~       for (j = 0; j < n2; j++) {
~        BASE temp = 0.0;
+       const BASE *G_j = &G[ ldg * j ];
+
~        for (k = 0; k < K; k++) {
- -         temp += F[ldf * i + k] * G[ldg * j + k];
+         temp += F_i[k] * G_j[k];
~        }
- -       C[ldc * i + j] += alpha * temp;
+       C_i[j] += alpha * temp;
~       }
~     }

~   } else if (TransF == CblasTrans && TransG == CblasNoTrans) {

~     for (k = 0; k < K; k++) {
+      const BASE *G_k = &G[ ldg * k ];
+      const BASE *F_k = &F[ ldf * k ];
+
~       for (i = 0; i < n1; i++) {
- -       const BASE temp = alpha * F[ldf * k + i];
+       const BASE temp = alpha * F_k[i];
+       BASE *C_i = &C[ ldc * i ];
+
~        if (temp != 0.0) {
~          for (j = 0; j < n2; j++) {
- -           C[ldc * i + j] += temp * G[ldg * k + j];
+           C_i[j] += temp * G_k[j];
~          }
~        }
~       }
@@ -110,12 +123,16 @@
~   } else if (TransF == CblasTrans && TransG == CblasTrans) {

~     for (i = 0; i < n1; i++) {
+      BASE *C_i = &C[ ldc * i ];
+
~       for (j = 0; j < n2; j++) {
~        BASE temp = 0.0;
+       const BASE *G_j = &G[ ldg * j ];
+
~        for (k = 0; k < K; k++) {
- -         temp += F[ldf * k + i] * G[ldg * j + k];
+         temp += F[ldf * k + i] * G_j[k];
~        }
- -       C[ldc * i + j] += alpha * temp;
+       C_i[j] += alpha * temp;
~       }
~     }

-----BEGIN PGP SIGNATURE-----
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org

iD8DBQE+JCuRMXvNuse+YRoRAj3YAJ9NxTpJki5CX4HJ3X9dLNr/uXqlcACbBK57
EnzkPKugSsKTeV4/5l1bUFI=
=zHaT
-----END PGP SIGNATURE-----



More information about the Gsl-discuss mailing list