[PATCH v3] malloc: Optimize small memory zeroing for calloc

H.J. Lu hjl.tools@gmail.com
Thu Nov 28 09:56:34 GMT 2024


For memory sizes up to 9 * sizeof (INTERNAL_SIZE_T) bytes, calloc has
special code to clear the memory.  Add calloc-clear-memory.h to allow
architecture-specific optimization.  The default unroll size is 9 words.
Since the size isn't fixed, the compiler can't generate the best inline
code for all targets.  Since x86-64 has fast unaligned 16-byte vector
stores, arrange the code to help the compiler vectorize it using
overlapping unaligned vector stores, with 1 branch instead of up to 3,
and up to 5 stores instead of 9.  The calloc microbenchmark shows:

Test Platform: Xeon-8380
Bench Function: calloc
Ratio: New / Original time_per_iteration (Lower is Better)

Threads#   | Ratio
-----------|------
1 thread   | 0.953
4 threads  | 0.952

NB: Since INTERNAL_SIZE_T can be either 4 bytes or 8 bytes and not all
targets have fast unaligned vector stores, the x86-64 approach doesn't
work for all targets.

Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
---
 malloc/malloc-internal.h                |  1 +
 malloc/malloc.c                         | 30 +++-------------
 sysdeps/generic/calloc-clear-memory.h   | 46 +++++++++++++++++++++++++
 sysdeps/x86_64/64/calloc-clear-memory.h | 44 +++++++++++++++++++++++
 4 files changed, 96 insertions(+), 25 deletions(-)
 create mode 100644 sysdeps/generic/calloc-clear-memory.h
 create mode 100644 sysdeps/x86_64/64/calloc-clear-memory.h

diff --git a/malloc/malloc-internal.h b/malloc/malloc-internal.h
index cba03433fe..3349e2d1fe 100644
--- a/malloc/malloc-internal.h
+++ b/malloc/malloc-internal.h
@@ -23,6 +23,7 @@
 #include <malloc-sysdep.h>
 #include <malloc-size.h>
 #include <malloc-hugepages.h>
+#include <calloc-clear-memory.h>
 
 /* Called in the parent process before a fork.  */
 void __malloc_fork_lock_parent (void) attribute_hidden;
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 81ddd2c3a8..53f5c856ad 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -3853,38 +3853,18 @@ __libc_calloc (size_t n, size_t elem_size)
     }
 #endif
 
-  /* Unroll clear of <= 36 bytes (72 if 8byte sizes).  We know that
-     contents have an odd number of INTERNAL_SIZE_T-sized words;
-     minimally 3.  */
+  /* Unroll clear of <= MAX_CLEAR_SMALL_MEMORY_SIZE * INTERNAL_SIZE_T
+     bytes.  We know that contents have an odd number of INTERNAL_SIZE_T-
+     sized words; minimally 3.  */
   d = (INTERNAL_SIZE_T *) mem;
   clearsize = csz - SIZE_SZ;
   nclears = clearsize / sizeof (INTERNAL_SIZE_T);
   assert (nclears >= 3);
 
-  if (nclears > 9)
+  if (nclears > MAX_CLEAR_SMALL_MEMORY_SIZE)
     return memset (d, 0, clearsize);
-
   else
-    {
-      *(d + 0) = 0;
-      *(d + 1) = 0;
-      *(d + 2) = 0;
-      if (nclears > 4)
-        {
-          *(d + 3) = 0;
-          *(d + 4) = 0;
-          if (nclears > 6)
-            {
-              *(d + 5) = 0;
-              *(d + 6) = 0;
-              if (nclears > 8)
-                {
-                  *(d + 7) = 0;
-                  *(d + 8) = 0;
-                }
-            }
-        }
-    }
+    clear_small_memory (d, nclears);
 
   return mem;
 }
diff --git a/sysdeps/generic/calloc-clear-memory.h b/sysdeps/generic/calloc-clear-memory.h
new file mode 100644
index 0000000000..2b9a2941dc
--- /dev/null
+++ b/sysdeps/generic/calloc-clear-memory.h
@@ -0,0 +1,46 @@
+/* Clear a block of memory for calloc.  Generic version.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define MAX_CLEAR_SMALL_MEMORY_SIZE 9
+
+/* Clear a memory size up to MAX_CLEAR_SMALL_MEMORY_SIZE * INTERNAL_SIZE_T
+   bytes.  We know that contents have an odd number of INTERNAL_SIZE_T-
+   sized words; minimally 3 words.  */
+
+static __always_inline void
+clear_small_memory (INTERNAL_SIZE_T *mem, unsigned long nclears)
+{
+  *(mem + 0) = 0;
+  *(mem + 1) = 0;
+  *(mem + 2) = 0;
+  if (nclears > 4)
+    {
+      *(mem + 3) = 0;
+      *(mem + 4) = 0;
+      if (nclears > 6)
+	{
+	  *(mem + 5) = 0;
+	  *(mem + 6) = 0;
+	  if (nclears > 8)
+	    {
+	      *(mem + 7) = 0;
+	      *(mem + 8) = 0;
+	    }
+	}
+    }
+}
diff --git a/sysdeps/x86_64/64/calloc-clear-memory.h b/sysdeps/x86_64/64/calloc-clear-memory.h
new file mode 100644
index 0000000000..b9fbd896d0
--- /dev/null
+++ b/sysdeps/x86_64/64/calloc-clear-memory.h
@@ -0,0 +1,44 @@
+/* Clear a block of memory for calloc.  X86-64/LP64 version.
+   Copyright (C) 2024 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define MAX_CLEAR_SMALL_MEMORY_SIZE 9
+
+/* Clear a memory size up to MAX_CLEAR_SMALL_MEMORY_SIZE * INTERNAL_SIZE_T
+   bytes.  We know that contents have an odd number of INTERNAL_SIZE_T-
+   sized words; minimally 3 words.  */
+
+static __always_inline void
+clear_small_memory (INTERNAL_SIZE_T *mem, unsigned long nclears)
+{
+  /* Since x86-64 has fast unaligned 16-byte vector stores, arrange the
+     codes to help compiler vectorize stores with overlapping unaligned
+     vector stores with 1 branch, instead of up to 3, and up to 5 stores,
+     instead of 9.  */
+  *(mem + 0) = 0;
+  *(mem + 1) = 0;
+  *(mem + 2) = 0;
+  *(mem + nclears - 2) = 0;
+  *(mem + nclears - 2 + 1) = 0;
+  if (nclears > 6)
+    {
+      *(mem + 3) = 0;
+      *(mem + 3 + 1) = 0;
+      *(mem + nclears - 4) = 0;
+      *(mem + nclears - 4 + 1) = 0;
+    }
+}
-- 
2.47.0



More information about the Libc-alpha mailing list