This is the mail archive of the
newlib@sourceware.org
mailing list for the newlib project.
Re: PATCH] Improve performance of memmem
Hi Wilco,
Can you please resend with the patch as an attachment?
Thanks,
-- Jeff J.
On Wed, Dec 19, 2018 at 10:26 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com>
wrote:
> Hi,
>
> Here is version 2 with improved comments and spaces:
>
> This patch significantly improves performance of memmem using a novel
> modified Horspool algorithm. Needles up to size 256 use a bad-character
> table indexed by hashed pairs of characters to quickly skip past
> mismatches.
> Long needles use a self-adapting filtering step to avoid comparing the
> whole
> needle repeatedly.
>
> By limiting the needle length to 256, the shift table only requires 8 bits
> per entry, lowering preprocessing overhead and minimizing cache effects.
> This limit also implies worst-case performance is linear.
>
> Small needles up to size 2 use a dedicated linear search. Very long
> needles
> use the Two-Way algorithm (to avoid increasing stack size inlining is now
> disabled).
>
> The performance gain is 6.6 times on English text on AArch64 using random
> needles with average size 8 (this is even faster than the recently
> improved strstr
> algorithm, so I'll update that in the near future).
>
> The size-optimized memmem has also been rewritten from scratch to get a
> 2.7x performance gain.
>
> Tested against GLIBC testsuite and randomized tests.
>
> --
> diff --git a/newlib/libc/string/memmem.c b/newlib/libc/string/memmem.c
> index
> 5c57eff9cb7998587cb73324e7d79f6944f52888..55d2459aa53a2861d89cab5c4f91092bba2e280b
> 100644
> --- a/newlib/libc/string/memmem.c
> +++ b/newlib/libc/string/memmem.c
> @@ -1,8 +1,30 @@
> -/* Byte-wise substring search, using the Two-Way algorithm.
> - * Copyright (C) 2008 Eric Blake
> - * Permission to use, copy, modify, and distribute this software
> - * is freely granted, provided that this notice is preserved.
> - */
> +/* Optimized memmem function.
> + Copyright (c) 2018 Arm Ltd. All rights reserved.
> +
> + SPDX-License-Identifier: BSD-3-Clause
> +
> + Redistribution and use in source and binary forms, with or without
> + modification, are permitted provided that the following conditions
> + are met:
> + 1. Redistributions of source code must retain the above copyright
> + notice, this list of conditions and the following disclaimer.
> + 2. Redistributions in binary form must reproduce the above copyright
> + notice, this list of conditions and the following disclaimer in the
> + documentation and/or other materials provided with the distribution.
> + 3. The name of the company may not be used to endorse or promote
> + products derived from this software without specific prior written
> + permission.
> +
> + THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR
> IMPLIED
> + WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
> + MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
> + IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> LIMITED
> + TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
>
> /*
> FUNCTION
> @@ -13,7 +35,7 @@ INDEX
>
> SYNOPSIS
> #include <string.h>
> - char *memmem(const void *<[s1]>, size_t <[l1]>, const void *<[s2]>,
> + void *memmem(const void *<[s1]>, size_t <[l1]>, const void *<[s2]>,
> size_t <[l2]>);
>
> DESCRIPTION
> @@ -21,8 +43,8 @@ DESCRIPTION
> Locates the first occurrence in the memory region pointed to
> by <[s1]> with length <[l1]> of the sequence of bytes pointed
> to by <[s2]> of length <[l2]>. If you already know the
> - lengths of your haystack and needle, <<memmem>> can be much
> - faster than <<strstr>>.
> + lengths of your haystack and needle, <<memmem>> is much faster
> + than <<strstr>>.
>
> RETURNS
> Returns a pointer to the located segment, or a null pointer if
> @@ -38,64 +60,127 @@ QUICKREF
> */
>
> #include <string.h>
> +#include <stdint.h>
> +
> +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
> +
> +/* Small and efficient memmem implementation (quadratic worst-case). */
> +void *
> +memmem (const void *haystack, size_t hs_len, const void *needle, size_t
> ne_len)
> +{
> + const char *hs = haystack;
> + const char *ne = needle;
> +
> + if (ne_len == 0)
> + return (void *)hs;
> + int i;
> + int c = ne[0];
> + const char *end = hs + hs_len - ne_len;
> +
> + for ( ; hs <= end; hs++)
> + {
> + if (hs[0] != c)
> + continue;
> + for (i = ne_len - 1; i != 0; i--)
> + if (hs[i] != ne[i])
> + break;
> + if (i == 0)
> + return (void *)hs;
> + }
> +
> + return NULL;
> +}
> +
> +#else
>
> -#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
> # define RETURN_TYPE void *
> # define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
> # include "str-two-way.h"
> -#endif
>
> +#define hash2(p) (((size_t)(p)[0] - ((size_t)(p)[-1] << 3)) % sizeof
> (shift))
> +
> +/* Fast memmem algorithm with guaranteed linear-time performance.
> + Small needles up to size 2 use a dedicated linear search. Longer
> needles
> + up to size 256 use a novel modified Horspool algorithm. It hashes
> pairs
> + of characters to quickly skip past mismatches. The main search loop
> only
> + exits if the last 2 characters match, avoiding unnecessary calls to
> memcmp
> + and allowing for a larger skip if there is no match. A self-adapting
> + filtering check is used to quickly detect mismatches in long needles.
> + By limiting the needle length to 256, the shift table can be reduced
> to 8
> + bits per entry, lowering preprocessing overhead and minimizing cache
> effects.
> + The limit also implies worst-case performance is linear.
> + Needles larger than 256 characters use the linear-time Two-Way
> algorithm. */
> void *
> -memmem (const void *haystack_start,
> - size_t haystack_len,
> - const void *needle_start,
> - size_t needle_len)
> +memmem (const void *haystack, size_t hs_len, const void *needle, size_t
> ne_len)
> {
> - /* Abstract memory is considered to be an array of 'unsigned char'
> values,
> - not an array of 'char' values. See ISO C 99 section 6.2.6.1. */
> - const unsigned char *haystack = (const unsigned char *) haystack_start;
> - const unsigned char *needle = (const unsigned char *) needle_start;
> + const unsigned char *hs = haystack;
> + const unsigned char *ne = needle;
>
> - if (needle_len == 0)
> - /* The first occurrence of the empty string is deemed to occur at
> - the beginning of the string. */
> - return (void *) haystack;
> + if (ne_len == 0)
> + return (void *) hs;
> + if (ne_len == 1)
> + return (void *) memchr (hs, ne[0], hs_len);
>
> -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
> + /* Ensure haystack length is >= needle length. */
> + if (hs_len < ne_len)
> + return NULL;
> +
> + const unsigned char *end = hs + hs_len - ne_len;
>
> - /* Less code size, but quadratic performance in the worst case. */
> - while (needle_len <= haystack_len)
> + if (ne_len == 2)
> {
> - if (!memcmp (haystack, needle, needle_len))
> - return (void *) haystack;
> - haystack++;
> - haystack_len--;
> + uint32_t nw = ne[0] << 16 | ne[1], hw = hs[0] << 16 | hs[1];
> + for (hs++; hs <= end && hw != nw; )
> + hw = hw << 16 | *++hs;
> + return hw == nw ? (void *)(hs - 1) : NULL;
> }
> - return NULL;
>
> -#else /* compilation for speed */
> + /* Use Two-Way algorithm for very long needles. */
> + if (__builtin_expect (ne_len > 256, 0))
> + return two_way_long_needle (hs, hs_len, ne, ne_len);
>
> - /* Larger code size, but guaranteed linear performance. */
> + uint8_t shift[256];
> + size_t tmp, shift1;
> + size_t m1 = ne_len - 1;
> + size_t offset = 0;
>
> - /* Sanity check, otherwise the loop might search through the whole
> - memory. */
> - if (haystack_len < needle_len)
> - return NULL;
> + /* Initialize bad character shift hash table. */
> + memset (shift, 0, sizeof (shift));
> + for (int i = 1; i < m1; i++)
> + shift[hash2 (ne + i)] = i;
> + shift1 = m1 - shift[hash2 (ne + m1)];
> + shift[hash2 (ne + m1)] = m1;
>
> - /* Use optimizations in memchr when possible, to reduce the search
> - size of haystack using a linear algorithm with a smaller
> - coefficient. However, avoid memchr for long needles, since we
> - can often achieve sublinear performance. */
> - if (needle_len < LONG_NEEDLE_THRESHOLD)
> + for ( ; hs <= end; )
> {
> - haystack = memchr (haystack, *needle, haystack_len);
> - if (!haystack || needle_len == 1)
> - return (void *) haystack;
> - haystack_len -= haystack - (const unsigned char *) haystack_start;
> - if (haystack_len < needle_len)
> - return NULL;
> - return two_way_short_needle (haystack, haystack_len, needle,
> needle_len);
> + /* Skip past character pairs not in the needle. */
> + do
> + {
> + hs += m1;
> + tmp = shift[hash2 (hs)];
> + }
> + while (hs <= end && tmp == 0);
> +
> + /* If the match is not at the end of the needle, shift to the end
> + and continue until we match the last 2 characters. */
> + hs -= tmp;
> + if (tmp < m1)
> + continue;
> +
> + /* The last 2 characters match. If the needle is long, check a
> + fixed number of characters first to quickly filter out
> mismatches. */
> + if (m1 <= 15 || memcmp (hs + offset, ne + offset, sizeof (long)) ==
> 0)
> + {
> + if (memcmp (hs, ne, m1) == 0)
> + return (void *) hs;
> +
> + /* Adjust filter offset when it doesn't find the mismatch. */
> + offset = (offset >= sizeof (long) ? offset : m1) - sizeof (long);
> + }
> +
> + /* Skip based on matching the last 2 characters. */
> + hs += shift1;
> }
> - return two_way_long_needle (haystack, haystack_len, needle, needle_len);
> -#endif /* compilation for speed */
> + return NULL;
> }
> +#endif /* Compilation for speed. */
> diff --git a/newlib/libc/string/str-two-way.h
> b/newlib/libc/string/str-two-way.h
> index
> 416f9d29ff07b79da526b8bd4cee0b7142af36b4..90345a8dec6f140b00886efa74efd5ab12d44af3
> 100644
> --- a/newlib/libc/string/str-two-way.h
> +++ b/newlib/libc/string/str-two-way.h
> @@ -31,6 +31,7 @@
>
> #include <limits.h>
> #include <stdint.h>
> +#include <_ansi.h>
>
> /* We use the Two-Way string matching algorithm, which guarantees
> linear complexity with constant space. Additionally, for long
> @@ -288,7 +289,7 @@ two_way_short_needle (const unsigned char *haystack,
> size_t haystack_len,
> If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
> HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
> sublinear performance is not possible. */
> -static RETURN_TYPE
> +_NOINLINE_STATIC RETURN_TYPE
> two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
> const unsigned char *needle, size_t needle_len)
> {
>
>