This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

PATCH] Improve performance of memmem


This patch significantly improves performance of memmem using a novel
modified Horspool algorithm.  Needles up to size 256 use a bad-character
table indexed by hashed pairs of characters to quickly skip past mismatches.
Long needles use a self-adapting filtering step to avoid comparing the whole
needle repeatedly.

By limiting the needle length to 256, the shift table only requires 8 bits
per entry, lowering preprocessing overhead and minimizing cache effects.
This limit also implies worst-case performance is linear.

Small needles up to size 2 use a dedicated linear search.  Very long needles
use the Two-Way algorithm (to avoid increasing stack size inlining is now disabled).

The performance gain is 6.6 times on English text on AArch64 using random
needles with average size 8 (this is even faster than the recently improved strstr
algorithm, so I'll update that in the near future).

The size-optimized memmem has also been rewritten from scratch to get a
2.7x performance gain.

Tested against GLIBC testsuite and randomized tests.

--

diff --git a/newlib/libc/string/memmem.c b/newlib/libc/string/memmem.c
index 5c57eff9cb7998587cb73324e7d79f6944f52888..3516e9c0511935e89941951a1c91ef2b3e916033 100644
--- a/newlib/libc/string/memmem.c
+++ b/newlib/libc/string/memmem.c
@@ -1,8 +1,30 @@
-/* Byte-wise substring search, using the Two-Way algorithm.
- * Copyright (C) 2008 Eric Blake
- * Permission to use, copy, modify, and distribute this software
- * is freely granted, provided that this notice is preserved.
- */
+/* Optimized memmem function.
+   Copyright (c) 2018 Arm Ltd.  All rights reserved.
+
+   SPDX-License-Identifier: BSD-3-Clause
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+   3. The name of the company may not be used to endorse or promote
+      products derived from this software without specific prior written
+      permission.
+
+   THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+   WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+   MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+   IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
 
 /*
 FUNCTION
@@ -13,7 +35,7 @@ INDEX
 
 SYNOPSIS
 	#include <string.h>
-	char *memmem(const void *<[s1]>, size_t <[l1]>, const void *<[s2]>,
+	void *memmem(const void *<[s1]>, size_t <[l1]>, const void *<[s2]>,
 		     size_t <[l2]>);
 
 DESCRIPTION
@@ -21,8 +43,8 @@ DESCRIPTION
 	Locates the first occurrence in the memory region pointed to
 	by <[s1]> with length <[l1]> of the sequence of bytes pointed
 	to by <[s2]> of length <[l2]>.  If you already know the
-	lengths of your haystack and needle, <<memmem>> can be much
-	faster than <<strstr>>.
+	lengths of your haystack and needle, <<memmem>> is much faster
+	than <<strstr>>.
 
 RETURNS
 	Returns a pointer to the located segment, or a null pointer if
@@ -38,64 +60,126 @@ QUICKREF
 */
 
 #include <string.h>
+#include <stdint.h>
+
+#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
+
+/* Small and efficient memmem implementation.  */
+void *
+memmem (const void *haystack, size_t hs_len, const void *needle, size_t ne_len)
+{
+  const char *hs = haystack;
+  const char *ne = needle;
+
+  if (ne_len == 0)
+    return (void *)hs;
+  int i;
+  int c = ne[0];
+  const char *end = hs + hs_len - ne_len;
+
+  for ( ; hs <= end; hs++)
+  {
+    if (hs[0] != c)
+      continue;
+    for (i = ne_len-1; i != 0; i--)
+      if (hs[i] != ne[i])
+	break;
+    if (i == 0)
+      return (void *)hs;
+  }
+
+  return NULL;
+}
+
+#else
 
-#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
 # define RETURN_TYPE void *
 # define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
 # include "str-two-way.h"
-#endif
 
+#define hash2(p) ((size_t)((p)[0]-((p)[-1]<<3)) % sizeof (shift))
+
+
+/* Fast memmem algorithm with guaranteed linear-time performance.
+   Small needles up to size 2 use a dedicated linear search.  Longer needles
+   up to size 256 use a novel modified Horspool algorithm.  It hashes pairs
+   of characters to quickly skip past mismatches.
+   By limiting the needle length to 256, the shift table can be reduced to 8
+   bits per entry, lowering preprocessing overhead and minimizing cache effects.
+   The limit also implies worst-case performance is linear.  On long needles a
+   self-adapting filtering step is used to quickly determine mismatches.
+   Needles larger than 256 characters use the linear-time Two-Way algorithm.  */
 void *
-memmem (const void *haystack_start,
-	size_t haystack_len,
-	const void *needle_start,
-	size_t needle_len)
+memmem (const void *haystack, size_t hs_len, const void *needle, size_t ne_len)
 {
-  /* Abstract memory is considered to be an array of 'unsigned char' values,
-     not an array of 'char' values.  See ISO C 99 section 6.2.6.1.  */
-  const unsigned char *haystack = (const unsigned char *) haystack_start;
-  const unsigned char *needle = (const unsigned char *) needle_start;
+  const unsigned char *hs = haystack;
+  const unsigned char *ne = needle;
 
-  if (needle_len == 0)
-    /* The first occurrence of the empty string is deemed to occur at
-       the beginning of the string.  */
-    return (void *) haystack;
+  if (ne_len == 0)
+    return (void *) hs;
+  if (ne_len == 1)
+    return (void *) memchr (hs, ne[0], hs_len);
 
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
+  /* Ensure haystack length is >= needle length.  */
+  if (hs_len < ne_len)
+    return NULL;
 
-  /* Less code size, but quadratic performance in the worst case.  */
-  while (needle_len <= haystack_len)
+  const unsigned char *end = hs + hs_len - ne_len;
+
+  if (ne_len == 2)
     {
-      if (!memcmp (haystack, needle, needle_len))
-        return (void *) haystack;
-      haystack++;
-      haystack_len--;
+      uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1];
+      for (hs++; hs <= end && hw != nw; hw = hw << 16 | *++hs);
+      return hw == nw ? (void *)hs - 1 : NULL;
     }
-  return NULL;
 
-#else /* compilation for speed */
+  /* Use Two-Way algorithm for very long needles.  */
+  if (__builtin_expect (ne_len > 256, 0))
+    return two_way_long_needle (hs, hs_len, ne, ne_len);
 
-  /* Larger code size, but guaranteed linear performance.  */
+  uint8_t shift[256];
+  size_t tmp, shift1;
+  size_t m1 = ne_len - 1;
+  size_t offset = 0;
 
-  /* Sanity check, otherwise the loop might search through the whole
-     memory.  */
-  if (haystack_len < needle_len)
-    return NULL;
+  /* Initialize bad character shift hash table.  */
+  memset (shift, 0, sizeof (shift));
+  for (int i = 1; i < m1; i++)
+    shift[hash2 (ne + i)] = i;
+  shift1 = m1 - shift[hash2 (ne + m1)];
+  shift[hash2 (ne + m1)] = m1;
 
-  /* Use optimizations in memchr when possible, to reduce the search
-     size of haystack using a linear algorithm with a smaller
-     coefficient.  However, avoid memchr for long needles, since we
-     can often achieve sublinear performance.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
+  for ( ; hs <= end; )
     {
-      haystack = memchr (haystack, *needle, haystack_len);
-      if (!haystack || needle_len == 1)
-	return (void *) haystack;
-      haystack_len -= haystack - (const unsigned char *) haystack_start;
-      if (haystack_len < needle_len)
-	return NULL;
-      return two_way_short_needle (haystack, haystack_len, needle, needle_len);
+      /* Skip past character pairs not in the needle.  */
+      do
+	{
+	  hs += m1;
+	  tmp = shift[hash2 (hs)];
+	}
+      while (tmp == 0 && hs <= end);
+
+      /* If the match is not at the end of the needle, shift to the end
+	 and continue until we match the last 2 characters.  */
+      hs -= tmp;
+      if (tmp < m1)
+	continue;
+
+      /* We've matched the last 2 characters.  If the needle is long, check a
+	 fixed number of characters first to quickly filter out mismatches.  */
+      if (m1 <= 15 || memcmp (hs + offset, ne + offset, sizeof (long)) == 0)
+	{
+	  if (memcmp (hs, ne, m1) == 0)
+	    return (void *) hs;
+
+	  /* Adjust filter position when it doesn't find the mismatch.  */
+	  offset = offset >= sizeof (long) ? offset - sizeof (long)
+		   : m1 - 1 - sizeof (long);
+	}
+
+      /* Shift based on matching the last 2 characters in the needle.  */
+      hs += shift1;
     }
-  return two_way_long_needle (haystack, haystack_len, needle, needle_len);
-#endif /* compilation for speed */
+  return NULL;
 }
+#endif /* Compilation for speed.  */
diff --git a/newlib/libc/string/str-two-way.h b/newlib/libc/string/str-two-way.h
index 416f9d29ff07b79da526b8bd4cee0b7142af36b4..90345a8dec6f140b00886efa74efd5ab12d44af3 100644
--- a/newlib/libc/string/str-two-way.h
+++ b/newlib/libc/string/str-two-way.h
@@ -31,6 +31,7 @@
 
 #include <limits.h>
 #include <stdint.h>
+#include <_ansi.h>
 
 /* We use the Two-Way string matching algorithm, which guarantees
    linear complexity with constant space.  Additionally, for long
@@ -288,7 +289,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
    sublinear performance is not possible.  */
-static RETURN_TYPE
+_NOINLINE_STATIC RETURN_TYPE
 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
 		     const unsigned char *needle, size_t needle_len)
 {


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]