[PATCH v7] sysdeps/x86_64/multiarch/memmem-avx2.c: add memmem-avx2.c
Noah Goldstein
goldstein.w.n@gmail.com
Wed Feb 21 17:17:28 GMT 2024
On Wed, Feb 21, 2024 at 12:58 AM James Tirta Halim
<tirtajames45@gmail.com> wrote:
>
> Find the rarest byte in NE. Find the parts of HS that match the rare byte
> and the byte after it. If found, shift back to the start of NE in HS and
> vector compare the first VEC_SIZE with NE. If matches, compare the rest
> with MEMCMPEQ.
>
> Timings (Core i3-1115G4):
> basic_memmem twoway_memmem __memmem_avx512 __memmem_avx2
> __memmem_generic
> Total:
> 6.80124e+06 1.06087e+06 219483 345385 768041
> Average:
> 25958.9 4049.11 837.721 1318.26 2931.45
>
> Passes make check.
>
> Changes in v1:
> 1. Add memmem-avx2.c
>
> Changes in v2:
> 1. Add avx512 support with a generic header file
> 2. Use __memcmpeq instead of memcmp
> 3. Remove scalar loop
> 4. Fix unsafe unaligned load
>
> Changes in v3:
> 1. Avoid checking for alignment to the start of the page since that will be rare
> 2. Use __memcmpeq instead of __memcmpeq_avx2 (it generates undefined
> reference errors)
> 3. Add memmem.c (needs review)
> 4. Add __memcmpeq_avx2 and __memcmpeq_avx512 to ifunc-impl-list.c (needs
> review)
> 5. Add libc_hidden_builtin_def and MEMMEM to memmem.c (needs review)
>
> Changes in v4:
> 1. Correct the cpu feature checks in ifunc-impl-list.c and memmem.c to
> use AVX512BW and BMI1 for AVX512 and AVX2 and BMI1 for AVX2
> 2. Correct the Makefile to use the appropriate flags
> 3. Rename memmem-vectorized-avx.h to memmem-avx-base.h
> 4. Remove unused vector macros (POPCNT and LZCNT)
>
> Changes in v5:
> 1. Rename SHIFT to RARE, OFF to OFF_S, OFF2 to OFF_E
> 2. Remove conditional for VEC_SIZE and ONES, and remove unused MASK_SIZE
> 3. Add comments
> 4. Limit needle length to VEC_SIZE when finding the rare byte
>
> Changes in v6:
> 1. Fix patch apply error in memmem.c
> 2. Correctly use MIN(ne_len, VEC_SIZE) when checking if RARE is found at the end
> of needle
> 3. Always do unaligned load at the tail code
> 4. Rename rarebyte_table to ___rarebyte_table
> 5. Add memmem-avx-base.c in which ___rarebyte_table is defined
> 6. Add memmem-avx-base to the Makefile
> 7. Add always_inline to find_rarest_byte
> 8. Change ((m << off) >> off) to (m & (ONES >> off))
> 9. Change void * to unsigned char * in find_rarest_byte
>
> Changes in v7:
> 1. Fallback to generic memmem for long needles for guaranteed
> linear-time worst-case performance
> 2. Use memmem instead of MEMMEM for libc_hidden_builtin_def in
> memmem.c (string/memmem.c and sysdeps/x86_64/multiarch/memmem.c may
> still need to be fixed for non-x86_64 builds to work. The changes were
> made following string/strstr.c and sysdeps/x86_64/multiarch/strstr.c)
> 3. Change some (VEC *) casts to (const VEC *)
>
> ---
> string/memmem.c | 7 +-
> sysdeps/x86_64/multiarch/Makefile | 6 +
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 ++
> sysdeps/x86_64/multiarch/memmem-avx-base.c | 20 +++
> sysdeps/x86_64/multiarch/memmem-avx-base.h | 191 +++++++++++++++++++++
> sysdeps/x86_64/multiarch/memmem-avx2.c | 3 +
> sysdeps/x86_64/multiarch/memmem-avx512.c | 12 ++
> sysdeps/x86_64/multiarch/memmem.c | 67 ++++++++
> 8 files changed, 317 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/x86_64/multiarch/memmem-avx-base.c
> create mode 100644 sysdeps/x86_64/multiarch/memmem-avx-base.h
> create mode 100644 sysdeps/x86_64/multiarch/memmem-avx2.c
> create mode 100644 sysdeps/x86_64/multiarch/memmem-avx512.c
> create mode 100644 sysdeps/x86_64/multiarch/memmem.c
>
> diff --git a/string/memmem.c b/string/memmem.c
> index a4117f8e1e..0a89bd5f7c 100644
> --- a/string/memmem.c
> +++ b/string/memmem.c
> @@ -25,6 +25,10 @@
> # define __memmem memmem
> #endif
>
> +#ifndef MEMMEM
> +# define MEMMEM __memmem
> +#endif
> +
> #define RETURN_TYPE void *
> #define AVAILABLE(h, h_l, j, n_l) ((j) <= (h_l) - (n_l))
> #define FASTSEARCH(S,C,N) (void*) memchr ((void *)(S), (C), (N))
> @@ -50,7 +54,7 @@
> The limit also implies worst-case performance is linear.
> Needles larger than 256 characters use the linear-time Two-Way algorithm. */
> void *
> -__memmem (const void *haystack, size_t hs_len,
> +MEMMEM (const void *haystack, size_t hs_len,
> const void *needle, size_t ne_len)
> {
> const unsigned char *hs = (const unsigned char *) haystack;
> @@ -127,3 +131,4 @@ __memmem (const void *haystack, size_t hs_len,
> libc_hidden_def (__memmem)
> weak_alias (__memmem, memmem)
> libc_hidden_weak (memmem)
> +libc_hidden_builtin_def (memmem)
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index d3d2270394..0b46d5f341 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -15,6 +15,9 @@ sysdep_routines += \
> memcmpeq-avx2-rtm \
> memcmpeq-evex \
> memcmpeq-sse2 \
> + memmem-avx-base \
> + memmem-avx2 \
> + memmem-avx512 \
> memmove-avx-unaligned-erms \
> memmove-avx-unaligned-erms-rtm \
> memmove-avx512-no-vzeroupper \
> @@ -122,6 +125,9 @@ sysdep_routines += \
> varshift \
> # sysdep_routines
>
> +CFLAGS-memmem-avx2.c += -mavx2 -mbmi -O3
> +CFLAGS-memmem-avx512.c += -mavx512f -mavx512bw -mbmi -O3
> +
> CFLAGS-strcspn-sse4.c += -msse4
> CFLAGS-strpbrk-sse4.c += -msse4
> CFLAGS-strspn-sse4.c += -msse4
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index c4a21d4b7c..20a8b85da9 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -799,6 +799,18 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_generic))
>
> + /* Support sysdeps/x86_64/multiarch/memmem.c. */
> + IFUNC_IMPL (i, name, memmem,
> + IFUNC_IMPL_ADD (array, i, memmem,
> + (CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI1)),
> + __memmem_avx512)
> + IFUNC_IMPL_ADD (array, i, memmem,
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (BMI1)),
> + __memmem_avx2)
> + IFUNC_IMPL_ADD (array, i, memmem, 1, __memmem_generic))
> +
> /* Support sysdeps/x86_64/multiarch/wcschr.c. */
> IFUNC_IMPL (i, name, wcschr,
> X86_IFUNC_IMPL_ADD_V4 (array, i, wcschr,
> diff --git a/sysdeps/x86_64/multiarch/memmem-avx-base.c b/sysdeps/x86_64/multiarch/memmem-avx-base.c
> new file mode 100644
> index 0000000000..212d75c96f
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memmem-avx-base.c
> @@ -0,0 +1,20 @@
> +const unsigned char ___rarebyte_table[256] attribute_hidden
> + = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4,
> + 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53,
> + 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219,
> + 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223,
> + 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216,
> + 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204,
> + 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246,
> + 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243,
> + 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190,
> + 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87,
> + 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115,
> + 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137,
> + 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73,
> + 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149,
> + 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123,
> + 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67,
> + 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34,
> + 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48,
> + 85, 49, 50, 51 };
> diff --git a/sysdeps/x86_64/multiarch/memmem-avx-base.h b/sysdeps/x86_64/multiarch/memmem-avx-base.h
> new file mode 100644
> index 0000000000..08941798ff
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memmem-avx-base.h
> @@ -0,0 +1,191 @@
> +#include <immintrin.h>
> +#include <inttypes.h>
> +#include <string.h>
> +#include <libc-pointer-arith.h>
> +
> +#ifndef FUNC_NAME
> +# define FUNC_NAME __memmem_avx2
> +#endif
> +#ifndef VEC
> +# define VEC __m256i
> +#endif
> +#ifndef MASK
> +# define MASK uint32_t
> +#endif
> +#ifndef LOAD
> +# define LOAD(x) _mm256_load_si256 (x)
> +#endif
> +#ifndef LOADU
> +# define LOADU(x) _mm256_loadu_si256 (x)
> +#endif
> +#ifndef CMPEQ8_MASK
> +# define CMPEQ8_MASK(x, y) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (x, y))
> +#endif
> +#ifndef SETONE8
> +# define SETONE8(x) _mm256_set1_epi8 (x)
> +#endif
> +#ifndef TZCNT
> +# define TZCNT(x) _tzcnt_u32 (x)
> +#endif
Use `__builtin_ctz`
> +#ifndef BLSR
> +# define BLSR(x) _blsr_u32 (x)
> +#endif
Think you can drop the `BLSR` define (here and in the avx512)
and just replace with `((x) & ((x) - 1))`
any reasonable compiler will optimize that correctly.
> +#define VEC_SIZE sizeof (VEC)
> +#define ONES ((MASK) -1)
> +
> +#ifndef MEMCMPEQ
> +# define MEMCMPEQ __memcmpeq
> +#endif
> +#ifndef MEMCPY
> +# define MEMCPY memcpy
> +#endif
> +#ifndef MEMCHR
> +# define MEMCHR memchr
> +#endif
> +#ifndef PAGE_SIZE
> +# define PAGE_SIZE 4096
> +#endif
> +#define MIN(x, y) (((x) < (y)) ? (x) : (y))
> +
> +extern void *__memmem_generic (const void *, size_t, const void *,
> + size_t) attribute_hidden;
> +
> +/* Lower is rarer. The table is based on the *.c and *.h files in glibc. */
> +extern const unsigned char ___rarebyte_table[256] attribute_hidden;
> +
> +static inline void *__attribute__ ((always_inline))
> +find_rarest_byte (const unsigned char *rare, size_t n)
> +{
> + const unsigned char *p = (const unsigned char *) rare;
> + int c_rare = ___rarebyte_table[*rare];
> + int c;
> + for (; n--; ++p)
> + {
> + c = ___rarebyte_table[*p];
> + if (c < c_rare)
> + {
> + rare = p;
> + c_rare = c;
> + }
> + }
> + return (void *) rare;
> +}
> +
> +void *
> +FUNC_NAME (const void *hs, size_t hs_len, const void *ne, size_t ne_len)
> +{
> + if (ne_len == 1)
> + return (void *) MEMCHR (hs, *(unsigned char *) ne, hs_len);
> + if (__glibc_unlikely (ne_len == 0))
> + return (void *) hs;
> + if (__glibc_unlikely (hs_len < ne_len))
> + return NULL;
> + /* Linear-time worst-case performance is guaranteed by the generic
> + * implementation using the Two-Way algorithm. */
> + if (__glibc_unlikely (ne_len > 256))
> + return __memmem_generic (hs, hs_len, ne, ne_len);
Think this impl makes sense up to VEC_SIZE * 1 + 1, but after that
it doesn't seem to have that much advantage.
> + VEC hv0, hv1, hv, nv;
> + MASK i, hm0, hm1, m, cmpm;
> + const unsigned int matchsh = ne_len < VEC_SIZE ? VEC_SIZE - ne_len : 0;
> + const MASK matchm = ONES << matchsh;
> + const unsigned char *h = (const unsigned char *) hs;
> + const unsigned char *const end = h + hs_len - ne_len;
> + const unsigned char *hp;
> + size_t rare = PTR_DIFF (
> + find_rarest_byte ((const unsigned char *) ne, MIN (ne_len, VEC_SIZE)),
> + ne);
> + /* RARE will always be the first byte to find.
> + If RARE is at the end of the needle, use the byte before it. */
> + if (rare == MIN (ne_len, VEC_SIZE) - 1)
> + --rare;
> + const VEC nv0 = SETONE8 (*((char *) ne + rare));
> + const VEC nv1 = SETONE8 (*((char *) ne + rare + 1));
> + unsigned int off_e = (PTR_DIFF (end, h) < VEC_SIZE)
> + ? VEC_SIZE - (unsigned int) (end - h) - 1
> + : 0;
> + /* Start from the position of RARE. */
> + h += rare;
> + /* Load the needle vector. */
> + if (((uintptr_t) ne & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE)
> + || ne_len >= VEC_SIZE)
the `ne_len >= VEC_SIZE` should probably be the first check here.
> + nv = LOADU ((const VEC *) ne);
> + else
> + MEMCPY (&nv, ne, MIN (VEC_SIZE, ne_len));
> + const unsigned int off_s = PTR_DIFF (h, PTR_ALIGN_DOWN (h, VEC_SIZE));
> + /* Align down to VEC_SIZE. */
> + h -= off_s;
> + hv0 = LOAD ((const VEC *) h);
> + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0);
> + hm1 = (MASK) CMPEQ8_MASK (hv0, nv1) >> 1;
> + /* Clear the irrelevant bits from aligning down (OFF_S) and ones that are out
> + * of bounds (OFF_E). */
> + m = ((hm0 & hm1) >> off_s) & (ONES >> off_e);
> + while (m)
> + {
> + i = TZCNT (m);
> + m = BLSR (m);
> + hp = h + off_s + i - rare;
> + if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE)
> + {
> + /* Do a vector compare if we are not crossing a page. */
> + hv = LOADU ((const VEC *) hp);
> + cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh;
> + /* Compare only the relevant bits of the needle vector. */
> + if (cmpm == matchm)
> + /* Compare the rest of the needle. */
> + if (ne_len <= VEC_SIZE
> + || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + VEC_SIZE,
> + ne_len - VEC_SIZE))
> + return (void *) hp;
> + }
> + else
> + {
> + if (!MEMCMPEQ (hp, ne, ne_len))
> + return (void *) hp;
think (assuming you bound ne_len <= ~VEC_SIZE * 2), you can
just make a little inline impl of this that will be much faster
than a call to __memcmpeq.
> + }
> + }
> + h += VEC_SIZE - 1;
> + for (; h - rare + VEC_SIZE <= end; h += VEC_SIZE)
> + {
> + hv0 = LOADU ((const VEC *) h);
> + hv1 = LOAD ((const VEC *) (h + 1));
> + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1);
> + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0);
> + m = hm0 & hm1;
> + while (m)
> + {
> + match:
> + i = TZCNT (m);
> + m = BLSR (m);
> + hp = h + i - rare;
> + if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE)
> + {
> + hv = LOADU ((const VEC *) hp);
> + cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh;
> + if (cmpm == matchm)
> + if (ne_len <= VEC_SIZE
> + || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + VEC_SIZE,
> + ne_len - VEC_SIZE))
> + return (void *) hp;
> + }
> + else
> + {
> + if (!MEMCMPEQ (hp, ne, ne_len))
> + return (void *) hp;
> + }
> + }
> + }
> + if (h - rare <= end)
> + {
> + off_e = VEC_SIZE - (unsigned int) (end - (h - rare)) - 1;
> + hv0 = LOADU ((const VEC *) h);
> + hv1 = LOAD ((const VEC *) (h + 1));
> + hm1 = (MASK) CMPEQ8_MASK (hv1, nv1);
> + hm0 = (MASK) CMPEQ8_MASK (hv0, nv0);
> + /* Clear the irrelevant bits that are out of bounds. */
> + m = hm0 & hm1 & (ONES >> off_e);
> + if (m)
> + goto match;
> + }
> + return NULL;
> +}
> diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c
> new file mode 100644
> index 0000000000..91f5d5d331
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memmem-avx2.c
> @@ -0,0 +1,3 @@
> +#define FUNC_NAME __memmem_avx2
> +
> +#include "memmem-avx-base.h"
> diff --git a/sysdeps/x86_64/multiarch/memmem-avx512.c b/sysdeps/x86_64/multiarch/memmem-avx512.c
> new file mode 100644
> index 0000000000..76016c1cfe
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memmem-avx512.c
> @@ -0,0 +1,12 @@
> +#define VEC __m512i
> +#define MASK uint64_t
> +#define LOAD(x) _mm512_load_si512 (x)
> +#define LOADU(x) _mm512_loadu_si512 (x)
> +#define CMPEQ8_MASK(x, y) _mm512_cmpeq_epi8_mask (x, y)
> +#define SETONE8(x) _mm512_set1_epi8 (x)
> +#define TZCNT(x) _tzcnt_u64 (x)
> +#define BLSR(x) _blsr_u64 (x)
> +
> +#define FUNC_NAME __memmem_avx512
> +
> +#include "memmem-avx-base.h"
> diff --git a/sysdeps/x86_64/multiarch/memmem.c b/sysdeps/x86_64/multiarch/memmem.c
> new file mode 100644
> index 0000000000..8fe7b77d33
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memmem.c
> @@ -0,0 +1,67 @@
> +/* Multiple versions of memmem.
> + All versions must be listed in ifunc-impl-list.c.
> + Copyright (C) 2012-2023 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +/* Redefine memmem so that the compiler won't complain about the type
> + mismatch with the IFUNC selector in strong_alias, below. */
> +#undef memmem
> +#define memmem __redirect_memmem
> +#include <string.h>
> +#undef memmem
> +
> +#define MEMMEM __memmem_generic
> +#ifdef SHARED
> +# undef libc_hidden_builtin_def
> +# define libc_hidden_builtin_def(name) \
> + __hidden_ver1 (__memmem_generic, __GI_memmem, __memmem_generic);
> +#endif
> +
> +#include "string/memmem.c"
> +
> +extern __typeof (__redirect_memmem) __memmem_avx2 attribute_hidden;
> +extern __typeof (__redirect_memmem) __memmem_generic attribute_hidden;
> +extern __typeof (__redirect_memmem) __memmem_avx512 attribute_hidden;
> +
> +#define SYMBOL_NAME memmem
> +
> +#include "init-arch.h"
> +
> +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> + ifunc symbol properly. */
> +extern __typeof (__redirect_memmem) __libc_memmem;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> + const struct cpu_features *cpu_features = __get_cpu_features ();
> +
> + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> + && CPU_FEATURE_USABLE_P (cpu_features, BMI1))
> + return __memmem_avx512;
> +
> + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> + && CPU_FEATURE_USABLE_P (cpu_features, BMI1))
> + return __memmem_avx2;
> +
> + return __memmem_generic;
> +}
> +
> +libc_ifunc_redirected (__redirect_memmem, __libc_memmem, IFUNC_SELECTOR ());
> +#undef memmem
> +strong_alias (__libc_memmem, __memmem)
> --
> 2.43.2
>
More information about the Libc-alpha
mailing list