This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Improve performance of strstr


This patch significantly improves performance of strstr by using Sunday's
Quick-Search algorithm.  Due to its simplicity it has the best average
performance of string matching algorithms on almost all inputs.  It uses a
bad-character shift table to skip past mismatches.

The needle length is limited to 254 - this reduces the shift table memory 
4 to 8 times, lowering preprocessing overhead and minimizing cache effects.
The limit also implies its worst-case performance is linear.

Larger needles are processed by the Two-Way algorithm.  The macro AVAILABLE
has been simplified as the haystack length is always precomputed.  This
results in a 2.5 times speedup for large needles, reducing the performance
drop when the Quick-Search algorithm can't be used.

The code for small needles 1-4 bytes has been simplified and now uses unsigned
char.  Since the optimized code relies on 8-bit chars, we defer to the
size-optimized implementation if CHAR_BIT > 8.

The performance gain of finding a set of randomly chosen words of size 8 in 
256 bytes of English text is 14 times on AArch64. For longer haystacks the
gain is well over 20 times.

The size-optimized strstr has also been rewritten from scratch to improve
performance.  On the same test the performance gain is 69%.

Tested against GLIBC testsuite and passes randomized tests.

---

diff --git a/newlib/libc/string/strstr.c b/newlib/libc/string/strstr.c
index e72b4bd9125f928486f52bb3ebd199eacef7cfaf..ccbcc50a1b034d36200b9b4b17e8c4f62fcb7fc0 100644
--- a/newlib/libc/string/strstr.c
+++ b/newlib/libc/string/strstr.c
@@ -1,169 +1,154 @@
-/*
-FUNCTION
-	<<strstr>>---find string segment
-
-INDEX
-	strstr
-
-SYNOPSIS
-	#include <string.h>
-	char *strstr(const char *<[s1]>, const char *<[s2]>);
-
-DESCRIPTION
-	Locates the first occurrence in the string pointed to by <[s1]> of
-	the sequence of characters in the string pointed to by <[s2]>
-	(excluding the terminating null character).
-
-RETURNS
-	Returns a pointer to the located string segment, or a null
-	pointer if the string <[s2]> is not found. If <[s2]> points to
-	a string with zero length, <[s1]> is returned.
-
-PORTABILITY
-<<strstr>> is ANSI C.
-
-<<strstr>> requires no supporting OS subroutines.
-
-QUICKREF
-	strstr ansi pure
-*/
+/* Optimized strstr function.
+   Copyright (c) 2018 Arm Ltd.  All rights reserved.
+
+   SPDX-License-Identifier: BSD-3-Clause
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+   3. The name of the company may not be used to endorse or promote
+      products derived from this software without specific prior written
+      permission.
+
+   THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+   WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+   MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+   IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
 
 #include <string.h>
+#include <limits.h>
+
+#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) \
+    || CHAR_BIT > 8
 
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
+/* Small and efficient strstr implementation.  */
 char *
-strstr (const char *searchee,
-	const char *lookfor)
+strstr (const char *hs, const char *ne)
 {
-  /* Less code size, but quadratic performance in the worst case.  */
-  if (*searchee == 0)
-    {
-      if (*lookfor)
-	return (char *) NULL;
-      return (char *) searchee;
-    }
+  size_t i;
+  int c = ne[0];
 
-  while (*searchee)
+  if (c == 0)
+    return (char*)hs;
+
+  for ( ; hs[0] != '\0'; hs++)
     {
-      size_t i;
-      i = 0;
-
-      while (1)
-	{
-	  if (lookfor[i] == 0)
-	    {
-	      return (char *) searchee;
-	    }
-
-	  if (lookfor[i] != searchee[i])
-	    {
-	      break;
-	    }
-	  i++;
-	}
-      searchee++;
+      if (hs[0] != c)
+	continue;
+      for (i = 1; ne[i] != 0; i++)
+	if (hs[i] != ne[i])
+	  break;
+      if (ne[i] == '\0')
+	return (char*)hs;
     }
 
-  return (char *) NULL;
+  return NULL;
+}
 
 #else /* compilation for speed */
 
 # define RETURN_TYPE char *
-# define AVAILABLE(h, h_l, j, n_l)			\
-  (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l))	\
-   && ((h_l) = (j) + (n_l)))
+/* AVAILABLE assumes haystack length has been precomputed.  */
+# define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
 # include "str-two-way.h"
 
 static inline char *
-strstr2 (const char *hs, const char *ne)
+strstr2 (const unsigned char *hs, const unsigned char *ne)
 {
   uint32_t h1 = (ne[0] << 16) | ne[1];
   uint32_t h2 = 0;
-  int c = hs[0];
-  while (h1 != h2 && c != 0)
-    {
+  for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs)
       h2 = (h2 << 16) | c;
-      c = *++hs;
-    }
   return h1 == h2 ? (char *)hs - 2 : NULL;
 }
 
 static inline char *
-strstr3 (const char *hs, const char *ne)
+strstr3 (const unsigned char *hs, const unsigned char *ne)
 {
   uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8);
   uint32_t h2 = 0;
-  int c = hs[0];
-  while (h1 != h2 && c != 0)
-    {
+  for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs)
       h2 = (h2 | c) << 8;
-      c = *++hs;
-    }
   return h1 == h2 ? (char *)hs - 3 : NULL;
 }
 
 static inline char *
-strstr4 (const char *hs, const char *ne)
+strstr4 (const unsigned char *hs, const unsigned char *ne)
 {
   uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8) | ne[3];
   uint32_t h2 = 0;
-  int c = hs[0];
-  while (h1 != h2 && c != 0)
-    {
-      h2 = (h2 << 8) | c;
-      c = *++hs;
-    }
+  for (int c = hs[0]; c != 0 && h1 != h2; c = *++hs)
+    h2 = (h2 << 8) | c;
   return h1 == h2 ? (char *)hs - 4 : NULL;
 }
 
+/* Extremely fast strstr algorithm with guaranteed linear-time performance.
+   Small needles up to size 4 use a dedicated linear search.  Longer needles
+   up to size 254 use Sunday's Quick-Search algorithm.  Due to its simplicity
+   it has the best average performance of string matching algorithms on almost
+   all inputs.  It uses a bad-character shift table to skip past mismatches.
+   By limiting the needle length to 254, the shift table can be reduced to
+   256 bytes, lowering preprocessing overhead and minimizing cache effects.
+   The limit also implies the worst-case performance is linear.
+   Even larger needles are processed by the linear-time Two-Way algorithm.
+*/
 char *
-strstr (const char *searchee,
-	const char *lookfor)
+strstr (const char *haystack, const char *needle)
 {
-  /* Larger code size, but guaranteed linear performance.  */
-  const char *haystack = searchee;
-  const char *needle = lookfor;
-  size_t needle_len; /* Length of NEEDLE.  */
-  size_t haystack_len; /* Known minimum length of HAYSTACK.  */
-  int ok = 1; /* True if NEEDLE is prefix of HAYSTACK.  */
+  const unsigned char *hs = (const unsigned char *) haystack;
+  const unsigned char *ne = (const unsigned char *) needle;
 
   /* Handle short needle special cases first.  */
-  if (needle[0] == '\0')
-    return (char *) haystack;
-  if (needle[1] == '\0')
-    return strchr (haystack, needle[0]);
-  if (needle[2] == '\0')
-    return strstr2 (haystack, needle);
-  if (needle[3] == '\0')
-    return strstr3 (haystack, needle);
-  if (needle[4] == '\0')
-    return strstr4 (haystack, needle);
-
-  /* Determine length of NEEDLE, and in the process, make sure
-     HAYSTACK is at least as long (no point processing all of a long
-     NEEDLE if HAYSTACK is too short).  */
-  while (*haystack && *needle)
-    ok &= *haystack++ == *needle++;
-  if (*needle)
+  if (ne[0] == '\0')
+    return (char *) hs;
+  hs = strchr (hs, ne[0]);
+  if (hs == NULL || ne[1] == '\0')
+    return (char *) hs;
+  if (ne[2] == '\0')
+    return strstr2 (hs, ne);
+  if (ne[3] == '\0')
+    return strstr3 (hs, ne);
+  if (ne[4] == '\0')
+    return strstr4 (hs, ne);
+
+  size_t ne_len = strlen (ne);
+  size_t hs_len = strlen (hs);
+
+  /* Ensure haystack length is >= needle length.  */
+  if (hs_len < ne_len)
     return NULL;
-  if (ok)
-    return (char *) searchee;
-
-  /* Reduce the size of haystack using strchr, since it has a smaller
-     linear coefficient than the Two-Way algorithm.  */
-  needle_len = needle - lookfor;
-  haystack = strchr (searchee + 1, *lookfor);
-  if (!haystack || needle_len == 1)
-    return (char *) haystack;
-  haystack_len = (haystack > searchee + needle_len ? 1
-		  : needle_len + searchee - haystack);
-
-  /* Perform the search.  */
-  if (needle_len < LONG_NEEDLE_THRESHOLD)
-    return two_way_short_needle ((const unsigned char *) haystack,
-				 haystack_len,
-				 (const unsigned char *) lookfor, needle_len);
-  return two_way_long_needle ((const unsigned char *) haystack, haystack_len,
-			      (const unsigned char *) lookfor, needle_len);
-#endif /* compilation for speed */
+  /* Check for a match at offset zero, avoiding preprocessing overheads.  */
+  if (memcmp (hs, ne, ne_len) == 0)
+    return (char*) hs;
+
+  /* Use the Quick-Search algorithm for needle lengths less than 255.  */
+  if (__builtin_expect (ne_len < 255, 1))
+    {
+      uint8_t shift[256];
+      const unsigned char *end = hs + hs_len - ne_len;
+
+      memset (shift, ne_len + 1, sizeof (shift));
+      for (int i = 0; i < ne_len; i++)
+	shift[ne[i]] = ne_len - i;
+
+      for (hs += shift[hs[ne_len]]; hs <= end; hs += shift[hs[ne_len]])
+	if (memcmp (hs, ne, ne_len) == 0)
+	  return (char*) hs;
+      return NULL;
+    }
+
+  /* Use Two-Way algorithm for very long needles.  */
+  return two_way_long_needle (hs, hs_len, ne, ne_len);
 }
+#endif /* compilation for speed */


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]