[PATCH v2] sysdeps/x86_64/multiarch/memmem-avx2.c: add memmem-avx2.c
James Tirta Halim <tirtajames45@gmail.com>
Fri Dec 22 02:29:10 GMT 2023
Changes in v2:
1. Add avx512 support with a generic header file
2. Use __memcmpeq instead of memcmp
3. Remove scalar loop
4. Fix unsafe unaligned load (see the page-cross check sketched below)
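
For reference, the safety condition behind item 4 can be sketched in
plain C as follows (an illustrative model only: vec_load_is_safe is a
hypothetical name, uintptr_t comes from <stdint.h>, and the patch
itself expresses the check with PTR_DIFF/PTR_ALIGN_UP/PTR_IS_ALIGNED
from libc-pointer-arith.h):

  /* A VEC-sized unaligned load from p cannot fault as long as it
     does not cross into the following page, because memory
     protection has page granularity.  */
  static inline int
  vec_load_is_safe (const void *p)
  {
    return ((uintptr_t) p % PAGE_SIZE) <= PAGE_SIZE - VEC_SIZE;
  }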
Average timings (Core i3-1115G4):

  __memmem_avx512  __memmem_avx2  basic_memmem  twoway_memmem   memmem
           842.43        1284.78         25569        4124.97  2927.43
Passes test-memmem (the AVX2 version was tested with plain __memcmpeq;
presumably __memcmpeq_avx2 works as well).
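
The search strategy is easiest to follow against a scalar model of the
same idea (an illustrative sketch, not code from the patch; it assumes
ne_len >= 2, since shorter needles take the early-return paths, and it
reuses the table-driven find_rarest_byte helper from
memmem-vectorized-avx.h):

  /* Scalar model: scan for the rarest needle byte together with the
     byte that follows it, then verify the whole needle at each
     candidate position.  */
  static void *
  memmem_model (const unsigned char *hs, size_t hs_len,
                const unsigned char *ne, size_t ne_len)
  {
    size_t shift = (unsigned char *) find_rarest_byte (ne, ne_len) - ne;
    /* The byte after the rare byte is also compared, so the rare byte
       must not be the last needle byte.  */
    if (shift == ne_len - 1)
      --shift;
    for (size_t i = 0; i + ne_len <= hs_len; ++i)
      if (hs[i + shift] == ne[shift]
          && hs[i + shift + 1] == ne[shift + 1]
          && memcmp (hs + i, ne, ne_len) == 0)
        return (void *) (hs + i);
    return NULL;
  }

The vector kernel performs the same two byte tests for VEC_SIZE
positions at once and only runs the full comparison (a vector compare
plus MEMCMPEQ for long needles) at the surviving positions.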
---
sysdeps/x86_64/multiarch/memmem-avx2.c | 4 +
sysdeps/x86_64/multiarch/memmem-avx512.c | 18 ++
.../x86_64/multiarch/memmem-vectorized-avx.h | 237 ++++++++++++++++++
3 files changed, 259 insertions(+)
create mode 100644 sysdeps/x86_64/multiarch/memmem-avx2.c
create mode 100644 sysdeps/x86_64/multiarch/memmem-avx512.c
create mode 100644 sysdeps/x86_64/multiarch/memmem-vectorized-avx.h
diff --git a/sysdeps/x86_64/multiarch/memmem-avx2.c b/sysdeps/x86_64/multiarch/memmem-avx2.c
new file mode 100644
index 0000000000..ee78546f90
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmem-avx2.c
@@ -0,0 +1,4 @@
+#define MEMCMPEQ __memcmpeq_avx2
+#define FUNC_NAME __memmem_avx2
+
+#include "memmem-vectorized-avx.h"
diff --git a/sysdeps/x86_64/multiarch/memmem-avx512.c b/sysdeps/x86_64/multiarch/memmem-avx512.c
new file mode 100644
index 0000000000..6a6da9e69c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmem-avx512.c
@@ -0,0 +1,18 @@
+#define VEC __m512i
+#define MASK uint64_t
+#define LOAD(x) _mm512_load_si512 (x)
+#define LOADU(x) _mm512_loadu_si512 (x)
+#define STORE(dst, src) _mm512_store_si512 (dst, src)
+#define STOREU(dst, src) _mm512_storeu_si512 (dst, src)
+#define CMPEQ8_MASK(x, y) _mm512_cmpeq_epi8_mask (x, y)
+#define SETZERO(x) _mm512_setzero_si512 (x)
+#define SETONE8(x) _mm512_set1_epi8 (x)
+#define POPCNT(x) _mm_popcnt_u64 (x)
+#define TZCNT(x) _tzcnt_u64 (x)
+#define BLSR(x) _blsr_u64 (x)
+#define LZCNT(x) _lzcnt_u64 (x)
+#define ONES ((MASK) -1)
+
+#define FUNC_NAME __memmem_avx512
+
+#include "memmem-vectorized-avx.h"
diff --git a/sysdeps/x86_64/multiarch/memmem-vectorized-avx.h b/sysdeps/x86_64/multiarch/memmem-vectorized-avx.h
new file mode 100644
index 0000000000..8810b3c118
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmem-vectorized-avx.h
@@ -0,0 +1,237 @@
+#include <immintrin.h>
+#include <inttypes.h>
+#include <string.h>
+#include <libc-pointer-arith.h>
+
+#ifndef FUNC_NAME
+# define FUNC_NAME __memmem_avx2
+#endif
+#ifndef VEC
+# define VEC __m256i
+#endif
+#ifndef VEC_SIZE
+# define VEC_SIZE sizeof (VEC)
+#endif
+#ifndef MASK
+# define MASK uint32_t
+#endif
+#ifndef MASK_SIZE
+# define MASK_SIZE sizeof (MASK)
+#endif
+#ifndef LOAD
+# define LOAD(x) _mm256_load_si256 (x)
+#endif
+#ifndef LOADU
+# define LOADU(x) _mm256_loadu_si256 (x)
+#endif
+#ifndef STORE
+# define STORE(dst, src) _mm256_store_si256 (dst, src)
+#endif
+#ifndef STOREU
+# define STOREU(dst, src) _mm256_storeu_si256 (dst, src)
+#endif
+#ifndef CMPEQ8_MASK
+# define CMPEQ8_MASK(x, y) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (x, y))
+#endif
+#ifndef SETZERO
+# define SETZERO(x) _mm256_setzero_si256 (x)
+#endif
+#ifndef SETONE8
+# define SETONE8(x) _mm256_set1_epi8 (x)
+#endif
+#ifndef POPCNT
+# define POPCNT(x) _mm_popcnt_u32 (x)
+#endif
+#ifndef TZCNT
+# define TZCNT(x) _tzcnt_u32 (x)
+#endif
+#ifndef BLSR
+# define BLSR(x) _blsr_u32 (x)
+#endif
+#ifndef LZCNT
+# define LZCNT(x) _lzcnt_u32 (x)
+#endif
+#ifndef ONES
+# define ONES ((MASK) -1)
+#endif
+
+#ifndef MEMCMPEQ
+# define MEMCMPEQ __memcmpeq
+#endif
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+#ifndef MEMCHR
+# define MEMCHR memchr
+#endif
+#ifndef PAGE_SIZE
+# define PAGE_SIZE 4096
+#endif
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+static inline void *
+find_rarest_byte (const void *ne, size_t n)
+{
+ /* Lower is rarer. The table is based on the
+ *.c and *.h files in glibc. */
+ static const unsigned char rarebyte_table[256]
+ = { 0, 1, 13, 56, 59, 60, 61, 62, 63, 232, 248, 2, 158, 4,
+ 5, 6, 7, 8, 9, 10, 14, 20, 26, 29, 37, 46, 52, 53,
+ 54, 55, 57, 58, 255, 172, 242, 193, 162, 174, 178, 182, 218, 219,
+ 212, 180, 249, 197, 221, 210, 253, 231, 230, 224, 225, 226, 227, 223,
+ 222, 220, 176, 213, 184, 229, 188, 164, 159, 209, 181, 203, 189, 216,
+ 196, 192, 185, 205, 161, 168, 215, 187, 211, 194, 195, 165, 206, 204,
+ 214, 198, 173, 179, 175, 183, 167, 202, 239, 201, 160, 241, 163, 246,
+ 233, 238, 240, 254, 237, 208, 234, 250, 169, 186, 236, 217, 245, 243,
+ 228, 170, 247, 244, 251, 235, 199, 200, 252, 207, 177, 191, 171, 190,
+ 166, 3, 140, 134, 124, 126, 86, 128, 95, 117, 114, 93, 81, 87,
+ 132, 96, 112, 97, 103, 82, 139, 89, 98, 88, 119, 74, 156, 115,
+ 104, 75, 120, 106, 76, 155, 90, 122, 107, 125, 152, 145, 136, 137,
+ 101, 116, 102, 108, 99, 141, 77, 78, 118, 79, 109, 100, 150, 73,
+ 94, 72, 121, 151, 113, 135, 110, 105, 83, 91, 11, 12, 64, 149,
+ 146, 111, 65, 69, 66, 15, 16, 17, 18, 19, 130, 92, 144, 123,
+ 21, 22, 23, 24, 131, 133, 127, 142, 25, 70, 129, 27, 28, 67,
+ 153, 84, 143, 138, 147, 157, 148, 68, 71, 30, 31, 32, 33, 34,
+ 35, 36, 154, 38, 39, 40, 41, 42, 80, 43, 44, 45, 47, 48,
+ 85, 49, 50, 51 };
+ const unsigned char *rare = (const unsigned char *) ne;
+ const unsigned char *p = (const unsigned char *) ne;
+ int c_rare = rarebyte_table[*rare];
+ int c;
+ for (; n--; ++p)
+ {
+ c = rarebyte_table[*p];
+ if (c < c_rare)
+ {
+ rare = p;
+ c_rare = c;
+ }
+ }
+ return (void *) rare;
+}
+
+void *
+FUNC_NAME (const void *hs, size_t hs_len, const void *ne, size_t ne_len)
+{
+ if (ne_len == 1)
+ return (void *) MEMCHR (hs, *(unsigned char *) ne, hs_len);
+ if (__glibc_unlikely (ne_len == 0))
+ return (void *) hs;
+ if (__glibc_unlikely (hs_len < ne_len))
+ return NULL;
+ VEC hv0, hv1, hv, nv;
+ MASK i, hm0, hm1, m, cmpm;
+ const unsigned int matchsh = ne_len < VEC_SIZE ? VEC_SIZE - ne_len : 0;
+ const MASK matchm = ONES << matchsh;
+ const unsigned char *h = (const unsigned char *) hs;
+ const unsigned char *const end = h + hs_len - ne_len;
+ const unsigned char *hp;
+ size_t shift = PTR_DIFF (find_rarest_byte (ne, ne_len), ne);
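+  /* Both the rare byte and the byte after it are compared, so the
+     rare byte must not be the last byte of the needle.  */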
+ if (shift == ne_len - 1)
+ --shift;
+ const VEC nv0 = SETONE8 (*((char *) ne + shift));
+ const VEC nv1 = SETONE8 (*((char *) ne + shift + 1));
+ h += shift;
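+  /* A full VEC load of the needle is safe if it cannot cross into an
+     unmapped page; otherwise copy the needle into nv with MEMCPY.  */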
+ if (PTR_DIFF (PTR_ALIGN_UP (ne, PAGE_SIZE), ne) >= VEC_SIZE
+ || PTR_IS_ALIGNED (ne, PAGE_SIZE) || ne_len >= VEC_SIZE)
+ nv = LOADU ((VEC *) ne);
+ else
+ MEMCPY (&nv, ne, MIN (VEC_SIZE, ne_len));
+ const unsigned int off = PTR_DIFF (h, PTR_ALIGN_DOWN (h, VEC_SIZE));
+ unsigned int off2 = (PTR_DIFF (end, (h - shift)) < VEC_SIZE)
+ ? VEC_SIZE - (unsigned int) (end - (h - shift)) - 1
+ : 0;
+ h -= off;
+ hv0 = LOAD ((const VEC *) h);
+ hm0 = (MASK) CMPEQ8_MASK (hv0, nv0);
+ hm1 = (MASK) CMPEQ8_MASK (hv0, nv1) >> 1;
+ /* Clear matched bits that are out of bounds. */
+ m = (((hm0 & hm1) >> off) << off2) >> off2;
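+  /* Verify the full needle at each candidate position.  */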
+ while (m)
+ {
+ i = TZCNT (m);
+ m = BLSR (m);
+ hp = h + off + i - shift;
+ if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE
+ || PTR_IS_ALIGNED (hp, PAGE_SIZE))
+ {
+ hv = LOADU ((VEC *) hp);
+ cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh;
+ if (cmpm == matchm)
+ if (ne_len <= VEC_SIZE
+ || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + VEC_SIZE,
+ ne_len - VEC_SIZE))
+ return (void *) hp;
+ }
+ else
+ {
+ if (!MEMCMPEQ (hp, ne, ne_len))
+ return (void *) hp;
+ }
+ }
+ h += VEC_SIZE - 1;
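+  /* Main loop: test the rare needle byte and its successor against
+     VEC_SIZE haystack positions per iteration.  */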
+ for (; h - shift + VEC_SIZE <= end; h += VEC_SIZE)
+ {
+ hv0 = LOADU ((const VEC *) h);
+ hv1 = LOAD ((const VEC *) (h + 1));
+ hm1 = (MASK) CMPEQ8_MASK (hv1, nv1);
+ hm0 = (MASK) CMPEQ8_MASK (hv0, nv0);
+ m = hm0 & hm1;
+ while (m)
+ {
+ match:
+ i = TZCNT (m);
+ m = BLSR (m);
+ hp = h + i - shift;
+ if (PTR_DIFF (PTR_ALIGN_UP (hp, PAGE_SIZE), hp) >= VEC_SIZE
+ || PTR_IS_ALIGNED (hp, PAGE_SIZE))
+ {
+ hv = LOADU ((VEC *) hp);
+ cmpm = (MASK) CMPEQ8_MASK (hv, nv) << matchsh;
+ if (cmpm == matchm)
+ if (ne_len <= VEC_SIZE
+ || !MEMCMPEQ (hp + VEC_SIZE, (const char *) ne + VEC_SIZE,
+ ne_len - VEC_SIZE))
+ return (void *) hp;
+ }
+ else
+ {
+ if (!MEMCMPEQ (hp, ne, ne_len))
+ return (void *) hp;
+ }
+ }
+ }
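+  /* Handle the last, possibly partial, vector of the haystack.  */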
+ if (h - shift <= end)
+ {
+ off2 = VEC_SIZE - (unsigned int) (end - (h - shift)) - 1;
+ hv1 = LOAD ((const VEC *) (h + 1));
+ if (PTR_DIFF (PTR_ALIGN_UP (h, PAGE_SIZE), h) >= VEC_SIZE
+ || PTR_IS_ALIGNED (h, PAGE_SIZE))
+ {
+ hv0 = LOADU ((const VEC *) h);
+ hm1 = (MASK) CMPEQ8_MASK (hv1, nv1);
+ hm0 = (MASK) CMPEQ8_MASK (hv0, nv0);
+ }
+ else
+ {
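+      /* hv0 cannot be loaded safely here, so derive hm0 from hv1 and
+         set bit 0 optimistically; false positives are filtered by the
+         full-match check at the match label.  */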
+ hm1 = (MASK) CMPEQ8_MASK (hv1, nv1);
+ hm0 = 1 | (MASK) CMPEQ8_MASK (hv1, nv0) << 1;
+ }
+ /* Clear matched bits that are out of bounds. */
+ m = ((hm0 & hm1) << off2) >> off2;
+ if (m)
+ goto match;
+ }
+ return NULL;
+}
--
2.43.0