From 473f1a3a5dffbcc18160167a75e05bfa0a4ec1e0 Mon Sep 17 00:00:00 2001 From: Wilco Dijkstra Date: Thu, 18 Oct 2018 17:29:36 +0000 Subject: [PATCH] Improve performance of strstr v3: Add support for read ahead using strnlen, giving an additional 25% speedup on large inputs (both short and long needles). This patch significantly improves performance of strstr by using Sunday's Quick-Search algorithm. Due to its simplicity it has the best average performance of string matching algorithms on almost all inputs. It uses a bad-character shift table to skip past mismatches. The needle length is limited to 254 - this reduces the shift table memory 4 to 8 times, lowering preprocessing overhead and minimizing cache effects. The limit also implies its worst-case performance is linear. Larger needles are processed by the Two-Way algorithm. The macro AVAILABLE has been improved to use strnlen to read the input in chunks. This results in a 2.5 times speedup for large needles, reducing the performance drop when the Quick-Search algorithm can't be used. The code for 1-4 byte needles has been simplified and now uses unsigned char. Since the optimized code relies on 8-bit chars, we defer to the size-optimized implementation if CHAR_BIT > 8. The performance gain of finding a set of randomly chosen words of size 8 in 256 bytes of English text is 14 times on AArch64. For longer haystacks the gain is well over 20 times. The size-optimized strstr has also been rewritten from scratch to improve performance. On the same test the performance gain is 69%. Tested against GLIBC testsuite, randomized tests and the GNULIB strstr test (https://git.savannah.gnu.org/cgit/gnulib.git/tree/tests/test-strstr.c). -- --- newlib/libc/string/strstr.c | 223 ++++++++++++++++++++---------------- 1 file changed, 127 insertions(+), 96 deletions(-) diff --git a/newlib/libc/string/strstr.c b/newlib/libc/string/strstr.c index e72b4bd91..0256c1aa6 100644 --- a/newlib/libc/string/strstr.c +++ b/newlib/libc/string/strstr.c @@ -1,3 +1,31 @@ +/* Optimized strstr function. + Copyright (c) 2018 Arm Ltd. All rights reserved. + + SPDX-License-Identifier: BSD-3-Clause + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + 3. The name of the company may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + /* FUNCTION <>---find string segment @@ -29,141 +57,144 @@ QUICKREF */ #include +#include + +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) \ + || CHAR_BIT > 8 -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) +/* Small and efficient strstr implementation. */ char * -strstr (const char *searchee, - const char *lookfor) +strstr (const char *hs, const char *ne) { - /* Less code size, but quadratic performance in the worst case. */ - if (*searchee == 0) - { - if (*lookfor) - return (char *) NULL; - return (char *) searchee; - } - - while (*searchee) - { - size_t i; - i = 0; + size_t i; + int c = ne[0]; - while (1) - { - if (lookfor[i] == 0) - { - return (char *) searchee; - } + if (c == 0) + return (char*)hs; - if (lookfor[i] != searchee[i]) - { - break; - } - i++; - } - searchee++; + for ( ; hs[0] != '\0'; hs++) + { + if (hs[0] != c) + continue; + for (i = 1; ne[i] != 0; i++) + if (hs[i] != ne[i]) + break; + if (ne[i] == '\0') + return (char*)hs; } - return (char *) NULL; + return NULL; +} #else /* compilation for speed */ # define RETURN_TYPE char * -# define AVAILABLE(h, h_l, j, n_l) \ - (!memchr ((h) + (h_l), '\0', (j) + (n_l) - (h_l)) \ - && ((h_l) = (j) + (n_l))) +# define AVAILABLE(h, h_l, j, n_l) (((j) <= (h_l) - (n_l)) \ + || ((h_l) += strnlen ((h) + (h_l), (n_l) | 2048), ((j) <= (h_l) - (n_l)))) + # include "str-two-way.h" +/* Number of bits used to index shift table. */ +#define SHIFT_TABLE_BITS 6 + static inline char * -strstr2 (const char *hs, const char *ne) +strstr2 (const unsigned char *hs, const unsigned char *ne) { uint32_t h1 = (ne[0] << 16) | ne[1]; uint32_t h2 = 0; - int c = hs[0]; - while (h1 != h2 && c != 0) - { + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) h2 = (h2 << 16) | c; - c = *++hs; - } return h1 == h2 ? (char *)hs - 2 : NULL; } static inline char * -strstr3 (const char *hs, const char *ne) +strstr3 (const unsigned char *hs, const unsigned char *ne) { uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8); uint32_t h2 = 0; - int c = hs[0]; - while (h1 != h2 && c != 0) - { + for (int c = hs[0]; h1 != h2 && c != 0; c = *++hs) h2 = (h2 | c) << 8; - c = *++hs; - } return h1 == h2 ? (char *)hs - 3 : NULL; } static inline char * -strstr4 (const char *hs, const char *ne) +strstr4 (const unsigned char *hs, const unsigned char *ne) { uint32_t h1 = (ne[0] << 24) | (ne[1] << 16) | (ne[2] << 8) | ne[3]; uint32_t h2 = 0; - int c = hs[0]; - while (h1 != h2 && c != 0) - { - h2 = (h2 << 8) | c; - c = *++hs; - } + for (int c = hs[0]; c != 0 && h1 != h2; c = *++hs) + h2 = (h2 << 8) | c; return h1 == h2 ? (char *)hs - 4 : NULL; } +/* Extremely fast strstr algorithm with guaranteed linear-time performance. + Small needles up to size 4 use a dedicated linear search. Longer needles + up to size 254 use Sunday's Quick-Search algorithm. Due to its simplicity + it has the best average performance of string matching algorithms on almost + all inputs. It uses a bad-character shift table to skip past mismatches. + By limiting the needle length to 254, the shift table can be reduced to 8 + bits per entry, lowering preprocessing overhead and minimizing cache effects. + The limit also implies the worst-case performance is linear. + Even larger needles are processed by the linear-time Two-Way algorithm. +*/ char * -strstr (const char *searchee, - const char *lookfor) +strstr (const char *haystack, const char *needle) { - /* Larger code size, but guaranteed linear performance. */ - const char *haystack = searchee; - const char *needle = lookfor; - size_t needle_len; /* Length of NEEDLE. */ - size_t haystack_len; /* Known minimum length of HAYSTACK. */ - int ok = 1; /* True if NEEDLE is prefix of HAYSTACK. */ + const unsigned char *hs = (const unsigned char *) haystack; + const unsigned char *ne = (const unsigned char *) needle; /* Handle short needle special cases first. */ - if (needle[0] == '\0') - return (char *) haystack; - if (needle[1] == '\0') - return strchr (haystack, needle[0]); - if (needle[2] == '\0') - return strstr2 (haystack, needle); - if (needle[3] == '\0') - return strstr3 (haystack, needle); - if (needle[4] == '\0') - return strstr4 (haystack, needle); - - /* Determine length of NEEDLE, and in the process, make sure - HAYSTACK is at least as long (no point processing all of a long - NEEDLE if HAYSTACK is too short). */ - while (*haystack && *needle) - ok &= *haystack++ == *needle++; - if (*needle) + if (ne[0] == '\0') + return (char *) hs; + if (ne[1] == '\0') + return (char*)strchr (hs, ne[0]); + if (ne[2] == '\0') + return strstr2 (hs, ne); + if (ne[3] == '\0') + return strstr3 (hs, ne); + if (ne[4] == '\0') + return strstr4 (hs, ne); + + size_t ne_len = strlen (ne); + size_t hs_len = strnlen (hs, ne_len | 512); + + /* Ensure haystack length is >= needle length. */ + if (hs_len < ne_len) return NULL; - if (ok) - return (char *) searchee; - - /* Reduce the size of haystack using strchr, since it has a smaller - linear coefficient than the Two-Way algorithm. */ - needle_len = needle - lookfor; - haystack = strchr (searchee + 1, *lookfor); - if (!haystack || needle_len == 1) - return (char *) haystack; - haystack_len = (haystack > searchee + needle_len ? 1 - : needle_len + searchee - haystack); - - /* Perform the search. */ - if (needle_len < LONG_NEEDLE_THRESHOLD) - return two_way_short_needle ((const unsigned char *) haystack, - haystack_len, - (const unsigned char *) lookfor, needle_len); - return two_way_long_needle ((const unsigned char *) haystack, haystack_len, - (const unsigned char *) lookfor, needle_len); -#endif /* compilation for speed */ + + /* Use the Quick-Search algorithm for needle lengths less than 255. */ + if (__builtin_expect (ne_len < 255, 1)) + { + uint8_t shift[1 << SHIFT_TABLE_BITS]; + const unsigned char *end = hs + hs_len - ne_len; + + /* Initialize bad character shift hash table. */ + memset (shift, ne_len + 1, sizeof (shift)); + for (int i = 0; i < ne_len; i++) + shift[ne[i] % sizeof (shift)] = ne_len - i; + + do + { + hs--; + + /* Search by skipping past bad characters. */ + size_t tmp = shift[hs[ne_len] % sizeof (shift)]; + for (hs += tmp; hs <= end; hs += tmp) + { + tmp = shift[hs[ne_len] % sizeof (shift)]; + if (memcmp (hs, ne, ne_len) == 0) + return (char*) hs; + } + if (end[ne_len] == 0) + return NULL; + end += strnlen (end + ne_len, 2048); + } + while (hs <= end); + + return NULL; + } + + /* Use Two-Way algorithm for very long needles. */ + return two_way_long_needle (hs, hs_len, ne, ne_len); } +#endif /* compilation for speed */ -- 2.43.5