[PATCH] AArch64: Add optimized Q-register memcpy
Carlos O'Donell
carlos@redhat.com
Tue Jul 14 20:17:07 GMT 2020
On 7/14/20 12:33 PM, Wilco Dijkstra wrote:
> Add a new memcpy using 128-bit Q registers - this is faster on modern
> cores and reduces codesize. Similar to the generic memcpy, small cases
> include copies up to 32 bytes. 64-128 byte copies are split into two
> cases to improve performance of 64-96 byte copies. Large copies align
> the source rather than the destination.
>
> bench-memcpy-random is ~9% faster than memcpy_falkor on Neoverse N1,
> so make this memcpy the default on N1 (on Centriq it is 15% faster than
> memcpy_falkor).
>
> Passes GLIBC regression tests. OK for commit?
As release manager this is OK for 2.32 if Szabolcs says it's OK.
> ---
>
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index 4377df0735287c210efd661188f9e6e3923c8003..e93c21e764a8d02b9f07f5030c31836a3f03f3e1 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -1,5 +1,5 @@
> ifeq ($(subdir),string)
> -sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
> +sysdep_routines += memcpy_generic memcpy_advsimd memcpy_thunderx memcpy_thunderx2 \
> memcpy_falkor \
> memcpy_new \
> memset_generic memset_falkor memset_emag memset_kunpeng \
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 0ccaf53e555e410569eb2be76ec7d5b4d7bc64a5..09feea97ea37ab923cf4a8557197d46adcd49204 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -42,11 +42,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx)
> IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_thunderx2)
> IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_falkor)
> + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_simd)
> IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_generic))
> IFUNC_IMPL (i, name, memmove,
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx2)
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
> + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_simd)
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
> IFUNC_IMPL (i, name, memset,
> /* Enable this on non-falkor processors too so that other cores
> diff --git a/sysdeps/aarch64/multiarch/memcpy.c b/sysdeps/aarch64/multiarch/memcpy.c
> index 2fafefd5d23fc1528031b5fe52098218ed603b89..e6f3ae116701097d71a02e2a1f6bfdadc1eec34a 100644
> --- a/sysdeps/aarch64/multiarch/memcpy.c
> +++ b/sysdeps/aarch64/multiarch/memcpy.c
> @@ -29,6 +29,7 @@
> extern __typeof (__redirect_memcpy) __libc_memcpy;
>
> extern __typeof (__redirect_memcpy) __memcpy_generic attribute_hidden;
> +extern __typeof (__redirect_memcpy) __memcpy_simd attribute_hidden;
> extern __typeof (__redirect_memcpy) __memcpy_thunderx attribute_hidden;
> extern __typeof (__redirect_memcpy) __memcpy_thunderx2 attribute_hidden;
> extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
> @@ -36,11 +37,11 @@ extern __typeof (__redirect_memcpy) __memcpy_falkor attribute_hidden;
> libc_ifunc (__libc_memcpy,
> (IS_THUNDERX (midr)
> ? __memcpy_thunderx
> - : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_ARES (midr) || IS_KUNPENG920 (midr)
> + : (IS_FALKOR (midr) || IS_PHECDA (midr) || IS_KUNPENG920 (midr)
> ? __memcpy_falkor
> : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
> ? __memcpy_thunderx2
> - : __memcpy_generic))));
> + : (IS_ARES (midr) ? __memcpy_simd : __memcpy_generic)))));
>
> # undef memcpy
> strong_alias (__libc_memcpy, memcpy);
> diff --git a/sysdeps/aarch64/multiarch/memcpy_advsimd.S b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
> new file mode 100644
> index 0000000000000000000000000000000000000000..d4ba74777744c8bb5a83e43ab2d63ad8dab35203
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memcpy_advsimd.S
> @@ -0,0 +1,247 @@
> +/* Generic optimized memcpy using SIMD.
> + Copyright (C) 2020 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
> + *
> + */
> +
> +#define dstin x0
> +#define src x1
> +#define count x2
> +#define dst x3
> +#define srcend x4
> +#define dstend x5
> +#define A_l x6
> +#define A_lw w6
> +#define A_h x7
> +#define B_l x8
> +#define B_lw w8
> +#define B_h x9
> +#define C_lw w10
> +#define tmp1 x14
> +
> +#define A_q q0
> +#define B_q q1
> +#define C_q q2
> +#define D_q q3
> +#define E_q q4
> +#define F_q q5
> +#define G_q q6
> +#define H_q q7
> +
> +
> +/* This implementation supports both memcpy and memmove and shares most code.
> + It uses unaligned accesses and branchless sequences to keep the code small,
> + simple and improve performance.
> +
> + Copies are split into 3 main cases: small copies of up to 32 bytes, medium
> + copies of up to 128 bytes, and large copies. The overhead of the overlap
> + check in memmove is negligible since it is only required for large copies.
> +
> + Large copies use a software pipelined loop processing 64 bytes per
> + iteration. The source pointer is 16-byte aligned to minimize
> + unaligned accesses. The loop tail is handled by always copying 64 bytes
> + from the end. */
> +
> +ENTRY (__memcpy_simd)
> + DELOUSE (0)
> + DELOUSE (1)
> + DELOUSE (2)
> +
> + add srcend, src, count
> + add dstend, dstin, count
> + cmp count, 128
> + b.hi L(copy_long)
> + cmp count, 32
> + b.hi L(copy32_128)
> +
> + /* Small copies: 0..32 bytes. */
> + cmp count, 16
> + b.lo L(copy16)
> + ldr A_q, [src]
> + ldr B_q, [srcend, -16]
> + str A_q, [dstin]
> + str B_q, [dstend, -16]
> + ret
> +
> + /* Copy 8-15 bytes. */
> +L(copy16):
> + tbz count, 3, L(copy8)
> + ldr A_l, [src]
> + ldr A_h, [srcend, -8]
> + str A_l, [dstin]
> + str A_h, [dstend, -8]
> + ret
> +
> + /* Copy 4-7 bytes. */
> +L(copy8):
> + tbz count, 2, L(copy4)
> + ldr A_lw, [src]
> + ldr B_lw, [srcend, -4]
> + str A_lw, [dstin]
> + str B_lw, [dstend, -4]
> + ret
> +
> + /* Copy 0..3 bytes using a branchless sequence. */
> +L(copy4):
> + cbz count, L(copy0)
> + lsr tmp1, count, 1
> + ldrb A_lw, [src]
> + ldrb C_lw, [srcend, -1]
> + ldrb B_lw, [src, tmp1]
> + strb A_lw, [dstin]
> + strb B_lw, [dstin, tmp1]
> + strb C_lw, [dstend, -1]
> +L(copy0):
> + ret
> +
> + .p2align 4
> + /* Medium copies: 33..128 bytes. */
> +L(copy32_128):
> + ldp A_q, B_q, [src]
> + ldp C_q, D_q, [srcend, -32]
> + cmp count, 64
> + b.hi L(copy128)
> + stp A_q, B_q, [dstin]
> + stp C_q, D_q, [dstend, -32]
> + ret
> +
> + .p2align 4
> + /* Copy 65..128 bytes. */
> +L(copy128):
> + ldp E_q, F_q, [src, 32]
> + cmp count, 96
> + b.ls L(copy96)
> + ldp G_q, H_q, [srcend, -64]
> + stp G_q, H_q, [dstend, -64]
> +L(copy96):
> + stp A_q, B_q, [dstin]
> + stp E_q, F_q, [dstin, 32]
> + stp C_q, D_q, [dstend, -32]
> + ret
> +
> + /* Align loop64 below to 16 bytes. */
> + nop
> +
> + /* Copy more than 128 bytes. */
> +L(copy_long):
> + /* Copy 16 bytes and then align src to 16-byte alignment. */
> + ldr D_q, [src]
> + and tmp1, src, 15
> + bic src, src, 15
> + sub dst, dstin, tmp1
> + add count, count, tmp1 /* Count is now 16 too large. */
> + ldp A_q, B_q, [src, 16]
> + str D_q, [dstin]
> + ldp C_q, D_q, [src, 48]
> + subs count, count, 128 + 16 /* Test and readjust count. */
> + b.ls L(copy64_from_end)
> +L(loop64):
> + stp A_q, B_q, [dst, 16]
> + ldp A_q, B_q, [src, 80]
> + stp C_q, D_q, [dst, 48]
> + ldp C_q, D_q, [src, 112]
> + add src, src, 64
> + add dst, dst, 64
> + subs count, count, 64
> + b.hi L(loop64)
> +
> + /* Write the last iteration and copy 64 bytes from the end. */
> +L(copy64_from_end):
> + ldp E_q, F_q, [srcend, -64]
> + stp A_q, B_q, [dst, 16]
> + ldp A_q, B_q, [srcend, -32]
> + stp C_q, D_q, [dst, 48]
> + stp E_q, F_q, [dstend, -64]
> + stp A_q, B_q, [dstend, -32]
> + ret
> +
> +END (__memcpy_simd)
> +libc_hidden_builtin_def (__memcpy_simd)
> +
> +
> +ENTRY (__memmove_simd)
> + DELOUSE (0)
> + DELOUSE (1)
> + DELOUSE (2)
> +
> + add srcend, src, count
> + add dstend, dstin, count
> + cmp count, 128
> + b.hi L(move_long)
> + cmp count, 32
> + b.hi L(copy32_128)
> +
> + /* Small moves: 0..32 bytes. */
> + cmp count, 16
> + b.lo L(copy16)
> + ldr A_q, [src]
> + ldr B_q, [srcend, -16]
> + str A_q, [dstin]
> + str B_q, [dstend, -16]
> + ret
> +
> +L(move_long):
> + /* Only use backward copy if there is an overlap. */
> + sub tmp1, dstin, src
> + cbz tmp1, L(move0)
> + cmp tmp1, count
> + b.hs L(copy_long)
> +
> + /* Large backwards copy for overlapping copies.
> + Copy 16 bytes and then align srcend to 16-byte alignment. */
> +L(copy_long_backwards):
> + ldr D_q, [srcend, -16]
> + and tmp1, srcend, 15
> + bic srcend, srcend, 15
> + sub count, count, tmp1
> + ldp A_q, B_q, [srcend, -32]
> + str D_q, [dstend, -16]
> + ldp C_q, D_q, [srcend, -64]
> + sub dstend, dstend, tmp1
> + subs count, count, 128
> + b.ls L(copy64_from_start)
> +
> +L(loop64_backwards):
> + stp A_q, B_q, [dstend, -32]
> + ldp A_q, B_q, [srcend, -96]
> + stp C_q, D_q, [dstend, -64]
> + ldp C_q, D_q, [srcend, -128]
> + sub srcend, srcend, 64
> + sub dstend, dstend, 64
> + subs count, count, 64
> + b.hi L(loop64_backwards)
> +
> + /* Write the last iteration and copy 64 bytes from the start. */
> +L(copy64_from_start):
> + ldp E_q, F_q, [src, 32]
> + stp A_q, B_q, [dstend, -32]
> + ldp A_q, B_q, [src]
> + stp C_q, D_q, [dstend, -64]
> + stp E_q, F_q, [dstin, 32]
> + stp A_q, B_q, [dstin]
> +L(move0):
> + ret
> +
> +END (__memmove_simd)
> +libc_hidden_builtin_def (__memmove_simd)
> diff --git a/sysdeps/aarch64/multiarch/memmove.c b/sysdeps/aarch64/multiarch/memmove.c
> index ed5a47f6f83e7b0afcec60cb9fa0f09999eaacae..1229f8b89296eddd2e711490bb7fc0b35726b6f5 100644
> --- a/sysdeps/aarch64/multiarch/memmove.c
> +++ b/sysdeps/aarch64/multiarch/memmove.c
> @@ -29,6 +29,7 @@
> extern __typeof (__redirect_memmove) __libc_memmove;
>
> extern __typeof (__redirect_memmove) __memmove_generic attribute_hidden;
> +extern __typeof (__redirect_memmove) __memmove_simd attribute_hidden;
> extern __typeof (__redirect_memmove) __memmove_thunderx attribute_hidden;
> extern __typeof (__redirect_memmove) __memmove_thunderx2 attribute_hidden;
> extern __typeof (__redirect_memmove) __memmove_falkor attribute_hidden;
> @@ -40,7 +41,7 @@ libc_ifunc (__libc_memmove,
> ? __memmove_falkor
> : (IS_THUNDERX2 (midr) || IS_THUNDERX2PA (midr)
> ? __memmove_thunderx2
> - : __memmove_generic))));
> + : (IS_ARES (midr) ? __memmove_simd : __memmove_generic)))));
>
> # undef memmove
> strong_alias (__libc_memmove, memmove);
>
>
--
Cheers,
Carlos.
More information about the Libc-alpha
mailing list