This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[Patch] aarch64: ThunderX2 specific memcpy and memmove
- From: "Saharoy, Saikat" <Saikat dot Saharoy at cavium dot com>
- To: "libc-alpha at sourceware dot org" <libc-alpha at sourceware dot org>
- Date: Fri, 26 May 2017 21:49:06 +0000
- Subject: [Patch] aarch64: ThunderX2 specific memcpy and memmove
With the IFUNC infrastructure for aarch64 in place, here is a patch
to add ThunderX2-specific versions of memcpy and memmove.
The ThunderX2 versions of memcpy and memmove use SIMD instructions
and paired loads/stores to improve performance for suitable copy
sizes, especially for large (> 128 KB) copies.
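To illustrate, the kernel of the large-copy loop is a paired 128-bit
load followed by a paired 128-bit store (simplified here from the
_ldpq/_stpq macros in the patch below):

    int32x4_t u, v;
    /* Copy one 32-byte block: ldp into two SIMD registers, stp out.  */
    asm ("ldp %q0, %q1, [%2]" : "=w" (u), "=w" (v) : "r" (src));
    asm ("stp %q0, %q1, [%2]" :: "w" (u), "w" (v), "r" (dst) : "memory");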
If people think we should keep a single memcpy (and memmove) for all
aarch64 systems rather than adding a ThunderX2-specific version, I
will be happy to drop this patch.
The primary change is the addition of the new file
memcpy_thunderx2.c.  The other, minor changes wire up the
ThunderX2-specific memcpy and memmove functions.
Thanks,
Saikat Saharoy
saikat.saharoy@cavium.com
ChangeLog entry:
* sysdeps/aarch64/multiarch/memcpy_thunderx2.c: New file.
* sysdeps/aarch64/multiarch/Makefile (sysdep_routines): Add
memcpy_thunderx2.
* sysdeps/aarch64/multiarch/memcpy.c (libc_ifunc): Select
__memcpy_thunderx2 on ThunderX2.
* sysdeps/aarch64/multiarch/memmove.c: Likewise for memmove.
* sysdeps/aarch64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add the ThunderX2 memcpy and memmove
implementations.
* sysdeps/unix/sysv/linux/aarch64/cpu-features.h (IS_THUNDERX2)
(IS_THUNDERX2PA): New macros to detect ThunderX2 and ThunderX2
Pass A platforms.
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 78d52c7..25b37de 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,3 +1,4 @@
ifeq ($(subdir),string)
-sysdep_routines += memcpy_generic memcpy_thunderx
+sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2
+CFLAGS-memcpy_thunderx2.c += -O3 -funroll-loops -fPIC
endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index c4f23df..ee6f3ea 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -25,7 +25,7 @@
#include <stdio.h>
/* Maximum number of IFUNC implementations. */
-#define MAX_IFUNC 2
+#define MAX_IFUNC 3
size_t
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@@ -39,10 +39,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/aarch64/multiarch/memcpy.c and memmove.c. */
IFUNC_IMPL (i, name, memcpy,
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr),
+			      __memcpy_thunderx2)
IFUNC_IMPL_ADD (array, i, memcpy, IS_THUNDERX (midr),
__memcpy_thunderx)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
IFUNC_IMPL (i, name, memmove,
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr),
+			      __memmove_thunderx2)
IFUNC_IMPL_ADD (array, i, memmove, IS_THUNDERX (midr),
__memmove_thunderx)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
index 9f73efb..2cdcde0 100644
--- a/sysdeps/aarch64/multiarch/memcpy.c
+++ b/sysdeps/aarch64/multiarch/memcpy.c
@@ -29,10 +29,15 @@
extern __typeof (__redirect_memcpy) __libc_memcpy;
extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
+extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
libc_ifunc (__libc_memcpy,
- IS_THUNDERX (midr) ? __memcpy_thunderx : __memcpy_generic);
+	    IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+	    ? __memcpy_thunderx2
+	    : IS_THUNDERX (midr)
+	      ? __memcpy_thunderx
+	      : __memcpy_generic);
# undef memcpy
strong_alias (__libc_memcpy, memcpy);
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.c b/sysdeps/aarch64/multiarch/memcpy_thunderx2.c
index e69de29..cc0fb9e 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.c
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.c
@@ -0,0 +1,764 @@
+/* ThunderX2 optimized memcpy and memmove.  AArch64 version.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <arm_neon.h>
+#include <string.h>
+
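+/* Inline-asm wrappers that force paired and single load/store
+   instructions: %q operands use 128-bit SIMD registers ("w"
+   constraint), %x and %w use 64- and 32-bit general registers.  */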
+#define _ldpq(x, y, p) \
+ asm ("ldp %q0, %q1, [%2]" : "=w" ((x)), "=w" ((y)) : "r" ((p)))
+
+#define _stpq(x, y, p) \
+ asm ("stp %q0, %q1, [%2]" :: "w" ((x)), "w" ((y)), "r" ((p)) : "memory")
+
+#define _ldpr(x, y, p) \
+ asm ("ldp %x0, %x1, [%2]" : "=r" ((x)), "=r" ((y)) : "r" ((p)))
+
+#define _stpr(x, y, p) \
+ asm ("stp %x0, %x1, [%2]" :: "r" ((x)), "r" ((y)), "r" ((p)) : "memory")
+
+#define _ldp(x, y, p) \
+ asm ("ldp %w0, %w1, [%2]" : "=r" ((x)), "=r" ((y)) : "r" ((p)))
+
+#define _stp(x, y, p) \
+ asm ("stp %w0, %w1, [%2]" :: "r" ((x)), "r" ((y)), "r" ((p)) : "memory")
+
+#define _ld(x, p) \
+ asm ("ldr %w0, [%1]" : "=r" ((x)) : "r" ((p)))
+
+#define _st(x, p) \
+ asm ("str %w0, [%1]" :: "r" ((x)), "r" ((p)) : "memory")
+
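+/* Copy N_16bytes consecutive 16-byte chunks using NEON
+   vld1q/vst1q.  */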
+#define _memcpy_fixed_simd(dst, src, N_16bytes) \
+ do { \
+ int _i; \
+ size_t _offset = 0; \
+ for (_i=0; _i < (N_16bytes); _i++, _offset += 16) \
+ { \
+ vst1q_s32 ((dst) + _offset, vld1q_s32 ((src) + _offset)); \
+ } \
+ } while (0)
+
+/* Copy 32 bytes using ldp/stp. */
+static inline void
+memcpy_32bytes (void * dst, void * src)
+{
+ int32x4_t u, v;
+ _ldpq (u, v, src);
+ _stpq (u, v, dst);
+}
+
+/* Copy 16 bytes using ldr/str.  */
+static inline void
+memcpy_16bytes (void * dst, void * src)
+{
+ vst1q_s32 (dst, vld1q_s32(src));
+}
+
+
+/* Copy 8 bytes using ldp/stp.  */
+static inline void
+memcpy_8bytes (void * dst, void * src)
+{
+ int32_t u, v;
+ _ldp (u, v, src);
+ _stp (u, v, dst);
+}
+
+/* Copy 4 bytes using ldr/str.  */
+static inline void
+memcpy_4bytes (void * dst, void * src)
+{
+ int32_t u;
+ _ld (u, src);
+ _st (u, dst);
+}
+
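+/* Copy 128 bytes using four ldp/stp pairs of SIMD registers.  */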
+static inline void
+memcpy_128bytes (void * dst, void * src)
+{
+ int32x4_t u8, v8;
+ int32x4_t u16, v16;
+ int32x4_t u24, v24;
+ int32x4_t u32, v32;
+
+ _ldpq (u8, v8, src);
+ _stpq (u8, v8, dst);
+
+ _ldpq (u16, v16, src + 32);
+ _stpq (u16, v16, dst + 32);
+
+ _ldpq (u24, v24, src + 64);
+ _stpq (u24, v24, dst + 64);
+
+ _ldpq (u32, v32, src + 96);
+ _stpq (u32, v32, dst + 96);
+}
+
+/* Copy in 128-byte strides.  */
+static inline void
+memcpy_128byte_multiple (void * dst, void * src, size_t bytes)
+{
+ size_t i = 0;
+ int32x4_t u8, v8;
+ int32x4_t u16, v16;
+ int32x4_t u24, v24;
+ int32x4_t u32, v32;
+
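+  /* Prefetch 256 and 512 bytes ahead of the read stream; locality
+     hint 3 keeps the data in all cache levels.  */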
+ __builtin_prefetch (src + 256, 0, 3);
+ __builtin_prefetch (src + 512, 0, 3);
+
+ do {
+ _ldpq (u8, v8, src + i);
+ _stpq (u8, v8, dst + i);
+
+ _ldpq (u16, v16, src + i + 32);
+ _stpq (u16, v16, dst + i + 32);
+
+ _ldpq (u24, v24, src + i + 64);
+ _stpq (u24, v24, dst + i + 64);
+
+ _ldpq (u32, v32, src + i + 96);
+ _stpq (u32, v32, dst + i + 96);
+ } while ((i += 128) < bytes);
+}
+
+/* Copy in 32-byte strides.  */
+static inline void
+memcpy_32byte_multiple (void * dst, void * src, size_t bytes)
+{
+ size_t i = 0;
+ int32x4_t u, v;
+
+  __builtin_prefetch (src + 256, 0, 3);
+ __builtin_prefetch (src + 512, 0, 3);
+
+ do {
+ _ldpq (u, v, src + i);
+ _stpq (u, v, dst + i);
+ } while ((i += 32) < bytes);
+}
+
+
+/* Copy any size over 128 bytes.  */
+static inline void
+memcpy_any_over_128bytes (void * dst, void * src, size_t bytes)
+{
+ memcpy_128byte_multiple (dst, src, bytes & ~127);
+
+ size_t rem = bytes & 127;
+
+ if (rem > 32)
+ {
+ memcpy_128bytes (dst + (bytes - 128), src + (bytes - 128));
+ return;
+ }
+
+ if (rem > 16)
+ {
+ memcpy_32bytes (dst + (bytes - 32), src + (bytes - 32));
+ return;
+ }
+
+  /* The maximum remainder here is 16 bytes.  */
+ memcpy_16bytes (dst + (bytes - 16), src + (bytes - 16));
+}
+
+/* Copy any size over 32 bytes.  */
+static inline void
+memcpy_any_over_32bytes (void * dst, void * src, size_t bytes)
+{
+ memcpy_32byte_multiple (dst, src, bytes & ~31);
+
+ memcpy_32bytes (dst + (bytes - 32), src + (bytes - 32));
+}
+
+
+/* Copy up to 16 bytes.  */
+static inline void
+memcpy_upto_16bytes (void * dst, void * src, size_t bytes)
+{
+ if (bytes == 16)
+ {
+ memcpy_16bytes (dst, src);
+ return;
+ }
+
+ size_t rem = bytes;
+ size_t index;
+
+ if (bytes >= 8)
+ {
+ goto L_mc_8;
+ }
+ if (bytes == 4)
+ {
+ memcpy_4bytes (dst, src);
+ return;
+ }
+
+ if (bytes == 1)
+ {
+ ((char *)dst)[0] = ((char *)src)[0];
+ return;
+ }
+ if (bytes == 2)
+ {
+ ((char *)dst)[1] = ((char *)src)[1];
+ ((char *)dst)[0] = ((char *)src)[0];
+ return;
+ }
+ if (bytes == 3)
+ {
+ ((char *)dst)[2] = ((char *)src)[2];
+ ((char *)dst)[1] = ((char *)src)[1];
+ ((char *)dst)[0] = ((char *)src)[0];
+ return;
+ }
+
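+  /* Only sizes 0 and 5..7 remain to be handled here.  */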
+  if (__glibc_unlikely (bytes == 0))
+ return;
+
+ goto L_mc_4;
+
+ L_mc_8:
+ index = bytes - 8;
+ memcpy_8bytes (dst + index, src + index);
+
+ rem -= 8;
+ if (rem == 0)
+ return;
+
+ if ((bytes - 8) > 4)
+ memcpy_8bytes(dst, src);
+ else
+ memcpy_4bytes(dst, src);
+
+ return;
+
+ L_mc_4:
+ index = bytes - 4;
+ memcpy_4bytes (dst + index, src + index);
+
+ rem -= 4;
+ if (rem == 0)
+ return;
+
+ memcpy_4bytes(dst, src);
+}
+
+
+static inline bool
+match_fixed_sizes (void * dst, void * src, size_t bytes)
+{
+ if (bytes & 15)
+ return false;
+
+ if (!(bytes & 127))
+ {
+ memcpy_128byte_multiple (dst, src, bytes);
+ return true;
+ }
+
+  /* Check the dedicated fixed sizes before the multiple-of-32 test,
+     so that the 192-byte case below is reachable.  */
+  switch (bytes)
+    {
+    case 48:
+      memcpy_32bytes (dst, src);
+      memcpy_16bytes (dst + 32, src + 32);
+      return true;
+    case 80:
+      _memcpy_fixed_simd (dst, src, 5);
+      return true;
+    case 192:
+      _memcpy_fixed_simd (dst, src, 12);
+      return true;
+    }
+
+  if (!(bytes & 31))
+    {
+      memcpy_32byte_multiple (dst, src, bytes);
+      return true;
+    }
+
+  return false;
+}
+
+
+static inline void
+memcpy_small_sizes (void * dst, void * src, size_t bytes)
+{
+ if (bytes > 128)
+ {
+ memcpy_any_over_128bytes (dst, src, bytes);
+ return;
+ }
+ if (bytes > 32)
+ {
+ memcpy_any_over_32bytes (dst, src, bytes);
+ return;
+ }
+ memcpy_16bytes (dst, src);
+ memcpy_16bytes (dst + (bytes - 16), src + (bytes - 16));
+}
+
+
+/* Align the source address to a 16-byte boundary.  */
+static inline void
+memalign_simd (char ** dst, char ** src, size_t * bytes)
+{
+ size_t align_bytes = 16 - ((uint64_t)(*src) & 0xf);
+
+  /* The size is greater than 16 bytes here, so an unconditional
+     16-byte SIMD copy is safe.  */
+ vst1q_s32 ((int32_t *)(*dst), vld1q_s32((int32_t *)(*src)));
+
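+  /* Advance by 1..15 bytes so that *src becomes 16-byte aligned; the
+     tail of the block just copied is copied again, which is harmless
+     for a forward copy.  */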
+ *src += align_bytes;
+ *dst += align_bytes;
+ *bytes -= align_bytes;
+}
+
+/* The glibc memcpy entry point follows.  */
+void *
+__memcpy_thunderx2 (void * dst, void * src, size_t bytes)
+{
+  void * ret = dst;	/* memcpy must return the original dst */
+
+  __builtin_prefetch (src, 0, 3);
+  __builtin_prefetch (dst, 1, 3);
+
+  if (bytes <= 16)
+    {
+      memcpy_upto_16bytes (dst, src, bytes);
+      return ret;
+    }
+
+  if (bytes < 512)
+    {
+      if (!match_fixed_sizes (dst, src, bytes))
+	memcpy_small_sizes (dst, src, bytes);
+
+      return ret;
+    }
+
+  if (((uint64_t)(src) & 0xf))
+    memalign_simd ((char **)&dst, (char **)&src, &bytes);
+
+  if (!(bytes & 0x7f))
+    {
+      /* Copy a multiple of 128 bytes.  */
+      memcpy_128byte_multiple (dst, src, bytes);
+      return ret;
+    }
+
+  /* Handle the remaining sizes over 128 bytes.  */
+  memcpy_any_over_128bytes (dst, src, bytes);
+  return ret;
+}
+
+/* Inline version of the SIMD memcpy above, used by
+   __memmove_thunderx2.  */
+static inline void
+int_simd_memcpy (void * dst, void * src, size_t bytes)
+{
+ if (bytes <= 16)
+ {
+ memcpy_upto_16bytes (dst, src, bytes);
+ return;
+ }
+
+ if (bytes < 512)
+ {
+ if (!match_fixed_sizes (dst, src, bytes))
+ memcpy_small_sizes (dst, src, bytes);
+
+ return;
+ }
+
+ if (((uint64_t)(src) & 0xf))
+ {
+ memalign_simd ((char **)&dst, (char **)&src, &bytes);
+ }
+
+ if (!(bytes & 0x7f))
+ { /* multiple of 128 bytes */
+ memcpy_128byte_multiple (dst, src, bytes);
+ return;
+ }
+
+ /* handle odd sizes over 128 bytes */
+ memcpy_any_over_128bytes (dst, src, bytes);
+}
+
+/* Memmove functions.  */
+
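+/* Fixed-size block moves.  Each ldp completes before the matching
+   stp is issued, and the 32-byte blocks are stored from the highest
+   address downwards.  */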
+static inline void
+memmove_32byte (void * dst, void * src, size_t bytes)
+{
+ int32x4_t u, v;
+
+ _ldpq (u, v, src);
+ _stpq (u, v, dst);
+}
+
+
+static inline void
+memmove_64byte (void * dst, void * src, size_t bytes)
+{
+ int32x4_t u, v;
+ int32x4_t u1, v1;
+
+ _ldpq (u1, v1, src + 32);
+ _stpq (u1, v1, dst + 32);
+
+ _ldpq (u, v, src);
+ _stpq (u, v, dst);
+}
+
+
+static inline void
+memmove_96byte (void * dst, void * src, size_t bytes)
+{
+ int32x4_t u, v;
+ int32x4_t u1, v1;
+ int32x4_t u2, v2;
+
+ _ldpq (u2, v2, src + 64);
+ _stpq (u2, v2, dst + 64);
+
+ _ldpq (u1, v1, src + 32);
+ _stpq (u1, v1, dst + 32);
+
+ _ldpq (u, v, src);
+ _stpq (u, v, dst);
+}
+
+
+static inline void
+memmove_128byte (void * dst, void * src, size_t bytes)
+{
+ int32x4_t u, v;
+ int32x4_t u1, v1;
+ int32x4_t u2, v2;
+ int32x4_t u3, v3;
+
+ _ldpq (u3, v3, src + 96);
+ _stpq (u3, v3, dst + 96);
+
+ _ldpq (u2, v2, src + 64);
+ _stpq (u2, v2, dst + 64);
+
+ _ldpq (u1, v1, src + 32);
+ _stpq (u1, v1, dst + 32);
+
+ _ldpq (u, v, src);
+ _stpq (u, v, dst);
+}
+
+static inline void
+memmove_192byte (void * dst, void * src, size_t bytes)
+{
+ int32x4_t u, v;
+ int32x4_t u1, v1;
+ int32x4_t u2, v2;
+ int32x4_t u3, v3;
+ int32x4_t u4, v4;
+ int32x4_t u5, v5;
+
+ _ldpq (u5, v5, src + 160);
+ _stpq (u5, v5, dst + 160);
+
+ _ldpq (u4, v4, src + 128);
+ _stpq (u4, v4, dst + 128);
+
+ _ldpq (u3, v3, src + 96);
+ _stpq (u3, v3, dst + 96);
+
+ _ldpq (u2, v2, src + 64);
+ _stpq (u2, v2, dst + 64);
+
+ _ldpq (u1, v1, src + 32);
+ _stpq (u1, v1, dst + 32);
+
+ _ldpq (u, v, src);
+ _stpq (u, v, dst);
+}
+
+
+static inline void
+memmove_256byte (void * dst, void * src, size_t bytes)
+{
+ int32x4_t u, v;
+ int32x4_t u1, v1;
+ int32x4_t u2, v2;
+ int32x4_t u3, v3;
+ int32x4_t u4, v4;
+ int32x4_t u5, v5;
+ int32x4_t u6, v6;
+ int32x4_t u7, v7;
+
+ __builtin_prefetch (src + 256, 0, 3);
+
+ _ldpq (u7, v7, src + 224);
+ _stpq (u7, v7, dst + 224);
+
+ _ldpq (u6, v6, src + 192);
+ _stpq (u6, v6, dst + 192);
+
+ _ldpq (u5, v5, src + 160);
+ _stpq (u5, v5, dst + 160);
+
+ _ldpq (u4, v4, src + 128);
+ _stpq (u4, v4, dst + 128);
+
+ _ldpq (u3, v3, src + 96);
+ _stpq (u3, v3, dst + 96);
+
+ _ldpq (u2, v2, src + 64);
+ _stpq (u2, v2, dst + 64);
+
+ _ldpq (u1, v1, src + 32);
+ _stpq (u1, v1, dst + 32);
+
+ _ldpq (u, v, src);
+ _stpq (u, v, dst);
+}
+
+
+static inline void
+memmove_512byte (void * dst, void * src, size_t bytes)
+{
+ int32x4_t u, v;
+ int32x4_t u1, v1;
+ int32x4_t u2, v2;
+ int32x4_t u3, v3;
+ int32x4_t u4, v4;
+ int32x4_t u5, v5;
+ int32x4_t u6, v6;
+ int32x4_t u7, v7;
+
+ int32x4_t u8, v8;
+ int32x4_t u9, v9;
+ int32x4_t u10, v10;
+ int32x4_t u11, v11;
+ int32x4_t u12, v12;
+ int32x4_t u13, v13;
+ int32x4_t u14, v14;
+ int32x4_t u15, v15;
+
+ __builtin_prefetch (src + 256, 0, 3);
+
+ _ldpq (u15, v15, src + 480);
+ _stpq (u15, v15, dst + 480);
+
+ _ldpq (u14, v14, src + 448);
+ _stpq (u14, v14, dst + 448);
+
+ _ldpq (u13, v13, src + 416);
+ _stpq (u13, v13, dst + 416);
+
+ _ldpq (u12, v12, src + 384);
+ _stpq (u12, v12, dst + 384);
+
+ _ldpq (u11, v11, src + 352);
+ _stpq (u11, v11, dst + 352);
+
+ _ldpq (u10, v10, src + 320);
+ _stpq (u10, v10, dst + 320);
+
+ _ldpq (u9, v9, src + 288);
+ _stpq (u9, v9, dst + 288);
+
+  _ldpq (u8, v8, src + 256);
+ _stpq (u8, v8, dst + 256);
+
+ _ldpq (u7, v7, src + 224);
+ _stpq (u7, v7, dst + 224);
+
+ _ldpq (u6, v6, src + 192);
+ _stpq (u6, v6, dst + 192);
+
+ _ldpq (u5, v5, src + 160);
+ _stpq (u5, v5, dst + 160);
+
+ _ldpq (u4, v4, src + 128);
+ _stpq (u4, v4, dst + 128);
+
+ _ldpq (u3, v3, src + 96);
+ _stpq (u3, v3, dst + 96);
+
+ _ldpq (u2, v2, src + 64);
+ _stpq (u2, v2, dst + 64);
+
+ _ldpq (u1, v1, src + 32);
+ _stpq (u1, v1, dst + 32);
+
+ _ldpq (u, v, src);
+ _stpq (u, v, dst);
+}
+
+static inline void
+memmove_upto_16bytes (void * dst, void * src, size_t bytes)
+{
+ if (bytes == 16)
+ {
+ memcpy_16bytes (dst, src);
+ return;
+ }
+
+ size_t rem = bytes;
+ size_t bc = 0;
+ size_t index;
+
+ if (bytes >= 8)
+ goto L_mm_8;
+
+ if (bytes == 4)
+ {
+ memcpy_4bytes (dst, src);
+ return;
+ }
+
+  if (bytes == 1)
+ {
+ ((char *)dst)[0] = ((char *)src)[0];
+ return;
+ }
+
+  if (bytes == 2)
+ {
+ ((char *)dst)[1] = ((char *)src)[1];
+ ((char *)dst)[0] = ((char *)src)[0];
+ return;
+ }
+
+  if (bytes == 3)
+ {
+ ((char *)dst)[2] = ((char *)src)[2];
+ ((char *)dst)[1] = ((char *)src)[1];
+ ((char *)dst)[0] = ((char *)src)[0];
+ return;
+ }
+
+ goto L_mm_4;
+
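+  /* Copy the trailing 8 bytes, then 4 more below them if needed,
+     then the last 1..3 bytes, always storing from high addresses to
+     low.  */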
+ L_mm_8:
+ index = bytes - (bc + 8);
+ memcpy_8bytes (dst + index, src + index);
+
+ rem -= 8;
+
+ if (rem == 0)
+ return;
+
+ bc += 8;
+
+ L_mm_4:
+ index = bytes - (bc + 4);
+ memcpy_4bytes (dst + index, src + index);
+
+ rem -= 4;
+
+ if (rem == 0)
+ return;
+
+ bc += 4;
+
+ /* there are now less than 4 bytes left to copy */
+ switch (rem)
+ {
+ case 3:
+ ((char *)dst)[2] = ((char *)src)[2];
+ case 2:
+ ((char *)dst)[1] = ((char *)src)[1];
+ case 1:
+ ((char *)dst)[0] = ((char *)src)[0];
+ }
+}
+
+
+static inline bool
+memmove_small (void * dst, void * src, size_t bytes)
+{
+  /* Each 32-byte block below is loaded before it is stored, and
+     blocks are written from the highest address downwards, so these
+     paths are safe with overlap as long as dst does not overlap src
+     from below; the caller guarantees that.  */
+ if (bytes <= 16)
+ {
+ memmove_upto_16bytes (dst, src, bytes);
+ return true;
+ }
+
+  if (bytes & 0x1f)
+    /* No entry in the switch below matches a size that is not a
+       multiple of 32.  */
+    return false;
+
+ switch (bytes)
+ {
+ case 32:
+ memmove_32byte (dst, src, bytes);
+ return true;
+ case 64:
+ memmove_64byte (dst, src, bytes);
+ return true;
+ case 96:
+ memmove_96byte (dst, src, bytes);
+ return true;
+ case 128:
+ memmove_128byte (dst, src, bytes);
+ return true;
+ case 256:
+ memmove_256byte (dst, src, bytes);
+ return true;
+ case 512:
+ memmove_512byte (dst, src, bytes);
+ return true;
+ default:
+ return false;
+ }
+}
+
+
+void *
+__memmove_thunderx2 (void * dst, void * src, size_t bytes)
+{
+  void * ret = dst;	/* memmove must return the original dst */
+
+  if (dst == src || bytes == 0)
+    return ret;
+
+  __builtin_prefetch (src, 0, 3);
+
+  size_t offset = (dst > src) ? (dst - src) : (src - dst);
+
+  /* The small-size paths store from the highest address downwards,
+     which is only safe when dst does not overlap src from below.  */
+  if ((dst > src || bytes <= offset)
+      && memmove_small (dst, src, bytes))
+    return ret;
+
+  if (bytes <= offset)
+    {
+      /* The regions do not overlap.  */
+      int_simd_memcpy (dst, src, bytes);
+      return ret;
+    }
+
+  /* Overlapping regions: copy in chunks of at most OFFSET bytes so
+     that every chunk is read before any of its source bytes can be
+     overwritten.  Copy from the top down when dst > src and from the
+     bottom up otherwise.  */
+  if (dst > src)
+    {
+      size_t n = bytes;
+      while (n > offset)
+	{
+	  n -= offset;
+	  int_simd_memcpy (dst + n, src + n, offset);
+	}
+      int_simd_memcpy (dst, src, n);
+    }
+  else
+    {
+      size_t i;
+      for (i = 0; i + offset <= bytes; i += offset)
+	int_simd_memcpy (dst + i, src + i, offset);
+      if (i < bytes)
+	int_simd_memcpy (dst + i, src + i, bytes - i);
+    }
+
+  return ret;
+}
diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
index 34c6b29..5692230 100644
--- a/sysdeps/aarch64/multiarch/memmove.c
+++ b/sysdeps/aarch64/multiarch/memmove.c
@@ -29,10 +29,15 @@
extern __typeof (__redirect_memmove) __libc_memmove;
extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
libc_ifunc (__libc_memmove,
- IS_THUNDERX (midr) ? __memmove_thunderx : __memmove_generic);
+	    IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
+	    ? __memmove_thunderx2
+	    : IS_THUNDERX (midr)
+	      ? __memmove_thunderx
+	      : __memmove_generic);
# undef memmove
strong_alias (__libc_memmove, memmove);
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
index c92b650..45ddce6 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
@@ -40,6 +40,10 @@
#define IS_THUNDERX(midr) (MIDR_IMPLEMENTOR(midr) == 'C' \
&& MIDR_PARTNUM(midr) == 0x0a1)
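+/* ThunderX2 Pass A (Broadcom Vulcan): implementer 'B', part 0x516.  */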
+#define IS_THUNDERX2PA(midr) (MIDR_IMPLEMENTOR(midr) == 'B' \
+ && MIDR_PARTNUM(midr) == 0x516)
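+/* Cavium ThunderX2: implementer 'C', part 0xaf.  */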
+#define IS_THUNDERX2(midr) (MIDR_IMPLEMENTOR(midr) == 'C' \
+ && MIDR_PARTNUM(midr) == 0xaf)