This is the mail archive of the
newlib@sourceware.org
mailing list for the newlib project.
Re: PATCH] Improve performance of memmem
Hi Jeff,
> Can you please resend with the patch as an attachment?
Of course, it's attached.
Wilco
On Wed, Dec 19, 2018 at 10:26 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
Hi,
Here is version 2 with improved comments and spaces:
This patch significantly improves performance of memmem using a novel
modified Horspool algorithm. Needles up to size 256 use a bad-character
table indexed by hashed pairs of characters to quickly skip past mismatches.
Long needles use a self-adapting filtering step to avoid comparing the whole
needle repeatedly.
By limiting the needle length to 256, the shift table only requires 8 bits
per entry, lowering preprocessing overhead and minimizing cache effects.
This limit also implies worst-case performance is linear.
Small needles up to size 2 use a dedicated linear search. Very long needles
use the Two-Way algorithm (to avoid increasing stack size inlining is now disabled).
The performance gain is 6.6 times on English text on AArch64 using random
needles with average size 8 (this is even faster than the recently improved strstr
algorithm, so I'll update that in the near future).
The size-optimized memmem has also been rewritten from scratch to get a
2.7x performance gain.
Tested against GLIBC testsuite and randomized tests.
diff --git a/newlib/libc/string/memmem.c b/newlib/libc/string/memmem.c
index 5c57eff9cb7998587cb73324e7d79f6944f52888..55d2459aa53a2861d89cab5c4f91092bba2e280b 100644
--- a/newlib/libc/string/memmem.c
+++ b/newlib/libc/string/memmem.c
@@ -1,8 +1,30 @@
-/* Byte-wise substring search, using the Two-Way algorithm.
- * Copyright (C) 2008 Eric Blake
- * Permission to use, copy, modify, and distribute this software
- * is freely granted, provided that this notice is preserved.
- */
+/* Optimized memmem function.
+ Copyright (c) 2018 Arm Ltd. All rights reserved.
+
+ SPDX-License-Identifier: BSD-3-Clause
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ 3. The name of the company may not be used to endorse or promote
+ products derived from this software without specific prior written
+ permission.
+
+ THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
/*
FUNCTION
@@ -13,7 +35,7 @@ INDEX
SYNOPSIS
#include <string.h>
- char *memmem(const void *<[s1]>, size_t <[l1]>, const void *<[s2]>,
+ void *memmem(const void *<[s1]>, size_t <[l1]>, const void *<[s2]>,
size_t <[l2]>);
DESCRIPTION
@@ -21,8 +43,8 @@ DESCRIPTION
Locates the first occurrence in the memory region pointed to
by <[s1]> with length <[l1]> of the sequence of bytes pointed
to by <[s2]> of length <[l2]>. If you already know the
- lengths of your haystack and needle, <<memmem>> can be much
- faster than <<strstr>>.
+ lengths of your haystack and needle, <<memmem>> is much faster
+ than <<strstr>>.
RETURNS
Returns a pointer to the located segment, or a null pointer if
@@ -38,64 +60,127 @@ QUICKREF
*/
#include <string.h>
+#include <stdint.h>
+
+#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
+
+/* Small and efficient memmem implementation (quadratic worst-case). */
+void *
+memmem (const void *haystack, size_t hs_len, const void *needle, size_t ne_len)
+{
+ const char *hs = haystack;
+ const char *ne = needle;
+
+ if (ne_len == 0)
+ return (void *)hs;
+ int i;
+ int c = ne[0];
+ const char *end = hs + hs_len - ne_len;
+
+ for ( ; hs <= end; hs++)
+ {
+ if (hs[0] != c)
+ continue;
+ for (i = ne_len - 1; i != 0; i--)
+ if (hs[i] != ne[i])
+ break;
+ if (i == 0)
+ return (void *)hs;
+ }
+
+ return NULL;
+}
+
+#else
-#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
# define RETURN_TYPE void *
# define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
# include "str-two-way.h"
-#endif
+#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof (shift))
+
+/* Fast memmem algorithm with guaranteed linear-time performance.
+ Small needles up to size 2 use a dedicated linear search. Longer needles
+ up to size 256 use a novel modified Horspool algorithm. It hashes pairs
+ of characters to quickly skip past mismatches. The main search loop only
+ exits if the last 2 characters match, avoiding unnecessary calls to memcmp
+ and allowing for a larger skip if there is no match. A self-adapting
+ filtering check is used to quickly detect mismatches in long needles.
+ By limiting the needle length to 256, the shift table can be reduced to 8
+ bits per entry, lowering preprocessing overhead and minimizing cache effects.
+ The limit also implies worst-case performance is linear.
+ Needles larger than 256 characters use the linear-time Two-Way algorithm. */
void *
-memmem (const void *haystack_start,
- size_t haystack_len,
- const void *needle_start,
- size_t needle_len)
+memmem (const void *haystack, size_t hs_len, const void *needle, size_t ne_len)
{
- /* Abstract memory is considered to be an array of 'unsigned char' values,
- not an array of 'char' values. See ISO C 99 section 6.2.6.1. */
- const unsigned char *haystack = (const unsigned char *) haystack_start;
- const unsigned char *needle = (const unsigned char *) needle_start;
+ const unsigned char *hs = haystack;
+ const unsigned char *ne = needle;
- if (needle_len == 0)
- /* The first occurrence of the empty string is deemed to occur at
- the beginning of the string. */
- return (void *) haystack;
+ if (ne_len == 0)
+ return (void *) hs;
+ if (ne_len == 1)
+ return (void *) memchr (hs, ne[0], hs_len);
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
+ /* Ensure haystack length is >= needle length. */
+ if (hs_len < ne_len)
+ return NULL;
+
+ const unsigned char *end = hs + hs_len - ne_len;
- /* Less code size, but quadratic performance in the worst case. */
- while (needle_len <= haystack_len)
+ if (ne_len == 2)
{
- if (!memcmp (haystack, needle, needle_len))
- return (void *) haystack;
- haystack++;
- haystack_len--;
+ uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1];
+ for (hs++; hs <= end && hw != nw; )
+ hw = hw << 16 | *++hs;
+ return hw == nw ? (void *)(hs - 1) : NULL;
}
- return NULL;
-#else /* compilation for speed */
+ /* Use Two-Way algorithm for very long needles. */
+ if (__builtin_expect (ne_len > 256, 0))
+ return two_way_long_needle (hs, hs_len, ne, ne_len);
- /* Larger code size, but guaranteed linear performance. */
+ uint8_t shift[256];
+ size_t tmp, shift1;
+ size_t m1 = ne_len - 1;
+ size_t offset = 0;
- /* Sanity check, otherwise the loop might search through the whole
- memory. */
- if (haystack_len < needle_len)
- return NULL;
+ /* Initialize bad character shift hash table. */
+ memset (shift, 0, sizeof (shift));
+ for (int i = 1; i < m1; i++)
+ shift[hash2 (ne + i)] = i;
+ shift1 = m1 - shift[hash2 (ne + m1)];
+ shift[hash2 (ne + m1)] = m1;
- /* Use optimizations in memchr when possible, to reduce the search
- size of haystack using a linear algorithm with a smaller
- coefficient. However, avoid memchr for long needles, since we
- can often achieve sublinear performance. */
- if (needle_len < LONG_NEEDLE_THRESHOLD)
+ for ( ; hs <= end; )
{
- haystack = memchr (haystack, *needle, haystack_len);
- if (!haystack || needle_len == 1)
- return (void *) haystack;
- haystack_len -= haystack - (const unsigned char *) haystack_start;
- if (haystack_len < needle_len)
- return NULL;
- return two_way_short_needle (haystack, haystack_len, needle, needle_len);
+ /* Skip past character pairs not in the needle. */
+ do
+ {
+ hs += m1;
+ tmp = shift[hash2 (hs)];
+ }
+ while (hs <= end && tmp == 0);
+
+ /* If the match is not at the end of the needle, shift to the end
+ and continue until we match the last 2 characters. */
+ hs -= tmp;
+ if (tmp < m1)
+ continue;
+
+ /* The last 2 characters match. If the needle is long, check a
+ fixed number of characters first to quickly filter out mismatches. */
+ if (m1 <= 15 || memcmp (hs + offset, ne + offset, sizeof (long)) == 0)
+ {
+ if (memcmp (hs, ne, m1) == 0)
+ return (void *) hs;
+
+ /* Adjust filter offset when it doesn't find the mismatch. */
+ offset = (offset >= sizeof (long) ? offset : m1) - sizeof (long);
+ }
+
+ /* Skip based on matching the last 2 characters. */
+ hs += shift1;
}
- return two_way_long_needle (haystack, haystack_len, needle, needle_len);
-#endif /* compilation for speed */
+ return NULL;
}
+#endif /* Compilation for speed. */
diff --git a/newlib/libc/string/str-two-way.h b/newlib/libc/string/str-two-way.h
index 416f9d29ff07b79da526b8bd4cee0b7142af36b4..90345a8dec6f140b00886efa74efd5ab12d44af3 100644
--- a/newlib/libc/string/str-two-way.h
+++ b/newlib/libc/string/str-two-way.h
@@ -31,6 +31,7 @@
#include <limits.h>
#include <stdint.h>
+#include <_ansi.h>
/* We use the Two-Way string matching algorithm, which guarantees
linear complexity with constant space. Additionally, for long
@@ -288,7 +289,7 @@ two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
sublinear performance is not possible. */
-static RETURN_TYPE
+_NOINLINE_STATIC RETURN_TYPE
two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
const unsigned char *needle, size_t needle_len)
{