This is the mail archive of the
gsl-discuss@sources.redhat.com
mailing list for the GSL project.
Optimization in cblas_dgemm routine
- From: Trammell Hudson <hudson at rotomotion dot com>
- To: gsl-discuss at sources dot redhat dot com
- Date: Tue, 14 Jan 2003 15:24:02 +0000
- Subject: Optimization in cblas_dgemm routine
- Organization: The Rotomotion Corporation
-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1
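Here is a minor optimization to the *gemm CBLAS routines that yields
about a 10% performance improvement on my Pentium 4 machine. It hoists
the row base-pointer computations (e.g. &C[ldc * i], &G[ldg * k]) out of
the inner loops, so the innermost loop indexes with j or k alone. Since
HPL Linpack spends 95% of its time in the cblas_dgemm call, the gain
shows up directly in the final GFLOPS results.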
- --- /tmp/gsl-1.3/cblas/source_gemm_r.h Tue Jan 14 15:10:23 2003
+++ ./gsl-1.3/cblas/source_gemm_r.h Tue Jan 14 15:08:16 2003
@@ -70,11 +70,14 @@
~ /* form C := alpha*A*B + C */
~ for (k = 0; k < K; k++) {
+ const BASE *G_k = &G[ ldg * k ];
~ for (i = 0; i < n1; i++) {
~ const BASE temp = alpha * F[ldf * i + k];
+ BASE *C_i = &C[ ldc * i ];
+
~ if (temp != 0.0) {
~ for (j = 0; j < n2; j++) {
- - C[ldc * i + j] += temp * G[ldg * k + j];
+ C_i[j] += temp * G_k[j];
~ }
~ }
~ }
@@ -85,23 +88,33 @@
~ /* form C := alpha*A*B' + C */
~ for (i = 0; i < n1; i++) {
+ BASE *C_i = &C[ ldc * i ];
+ const BASE *F_i = &F[ ldf * i ];
+
~ for (j = 0; j < n2; j++) {
~ BASE temp = 0.0;
+ const BASE *G_j = &G[ ldg * j ];
+
~ for (k = 0; k < K; k++) {
- - temp += F[ldf * i + k] * G[ldg * j + k];
+ temp += F_i[k] * G_j[k];
~ }
- - C[ldc * i + j] += alpha * temp;
+ C_i[j] += alpha * temp;
~ }
~ }
~ } else if (TransF == CblasTrans && TransG == CblasNoTrans) {
~ for (k = 0; k < K; k++) {
+ const BASE *G_k = &G[ ldg * k ];
+ const BASE *F_k = &F[ ldf * k ];
+
~ for (i = 0; i < n1; i++) {
- - const BASE temp = alpha * F[ldf * k + i];
+ const BASE temp = alpha * F_k[i];
+ BASE *C_i = &C[ ldc * i ];
+
~ if (temp != 0.0) {
~ for (j = 0; j < n2; j++) {
- - C[ldc * i + j] += temp * G[ldg * k + j];
+ C_i[j] += temp * G_k[j];
~ }
~ }
~ }
@@ -110,12 +123,16 @@
~ } else if (TransF == CblasTrans && TransG == CblasTrans) {
~ for (i = 0; i < n1; i++) {
+ BASE *C_i = &C[ ldc * i ];
+
~ for (j = 0; j < n2; j++) {
~ BASE temp = 0.0;
+ const BASE *G_j = &G[ ldg * j ];
+
~ for (k = 0; k < K; k++) {
- - temp += F[ldf * k + i] * G[ldg * j + k];
+ temp += F[ldf * k + i] * G_j[k];
~ }
- - C[ldc * i + j] += alpha * temp;
+ C_i[j] += alpha * temp;
~ }
~ }
-----BEGIN PGP SIGNATURE-----
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org
iD8DBQE+JCuRMXvNuse+YRoRAj3YAJ9NxTpJki5CX4HJ3X9dLNr/uXqlcACbBK57
EnzkPKugSsKTeV4/5l1bUFI=
=zHaT
-----END PGP SIGNATURE-----