Re: [PATCH 2/3] aarch64: Optimized memset specific to AmpereComputing skylark
- From: Szabolcs Nagy <szabolcs.nagy@arm.com>
- To: Xue Feng <innat_xue@hotmail.com>, libc-alpha@sourceware.org
- Cc: nd@arm.com, marcus.shawcroft@linaro.org, Feng Xue <feng.xue@amperecomputing.com>
- Date: Mon, 15 Oct 2018 12:53:58 +0100
- Subject: Re: [PATCH 2/3] aarch64: Optimized memset specific to AmpereComputing skylark
- References: <PS1PR0201MB17716AE205DA496C7E476A7A84E20@PS1PR0201MB1771.apcprd02.prod.outlook.com>
On 12/10/18 12:41, Xue Feng wrote:
> This version uses general-purpose register based memory stores
> instead of vector register based ones, since the former are faster
> than the latter on skylark.
>
if that's the only difference compared to memset_falkor.S
then it would be nice to know how much it matters for
performance on falkor and skylark to use x vs q regs.
if it's < 10% then don't add another memset variant.
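
(for reference, the two store forms being compared are roughly:

	stp	x1, x1, [x0]	/* 16 bytes via a pair of GPRs, as in this patch */
	str	q0, [x0]	/* 16 bytes via one SIMD register, as in the falkor/generic variants */

the register numbers here are just illustrative; a bench run of both
forms on falkor and skylark would settle the question.)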
> The fact that the DC ZVA size on skylark is 64 bytes is used by the
> IFUNC dispatch to select this memset, so that the cost of a runtime
> check on the DC ZVA size can be saved.
>
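(as a sketch of what that runtime check would otherwise involve: the
zva_size glibc caches at startup is derived from DCZID_EL0, along the
lines of

	mrs	x1, dczid_el0	/* bits [3:0]: log2(block size in words), bit 4: DZP */
	tbnz	x1, 4, 1f	/* DZP set: DC ZVA prohibited, zva_size stays 0 */
	and	x1, x1, 15
	mov	x2, 4
	lsl	x2, x2, x1	/* block size in bytes, i.e. 4 << BS; 64 on skylark */
1:

so dispatching on zva_size == 64 lets the memset itself hard-code the
64-byte block.)
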
> * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
> Add memset_skylark.
> * sysdeps/aarch64/multiarch/ifunc-impl-list.c
> (__libc_ifunc_impl_list): Add __memset_skylark to memset ifunc.
> * sysdeps/aarch64/multiarch/memset.c (libc_ifunc):
> Add IS_SKYLARK check for ifunc dispatch.
> * sysdeps/aarch64/multiarch/memset_skylark.S: New file.
> ---
> ChangeLog | 10 ++
> sysdeps/aarch64/multiarch/Makefile | 3 +-
> sysdeps/aarch64/multiarch/ifunc-impl-list.c | 1 +
> sysdeps/aarch64/multiarch/memset.c | 5 +-
> sysdeps/aarch64/multiarch/memset_skylark.S | 176 ++++++++++++++++++++++++++++
> 5 files changed, 193 insertions(+), 2 deletions(-)
> create mode 100644 sysdeps/aarch64/multiarch/memset_skylark.S
>
> diff --git a/ChangeLog b/ChangeLog
> index 2533c9d..28370f9 100644
> --- a/ChangeLog
> +++ b/ChangeLog
> @@ -1,3 +1,13 @@
> +2018-10-12 Feng Xue <feng.xue@amperecomputing.com>
> +
> + * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
> + Add memset_skylark.
> + * sysdeps/aarch64/multiarch/ifunc-impl-list.c
> + (__libc_ifunc_impl_list): Add __memset_skylark to memset ifunc.
> + * sysdeps/aarch64/multiarch/memset.c (libc_ifunc):
> + Add IS_SKYLARK check for ifunc dispatch.
> + * sysdeps/aarch64/multiarch/memset_skylark.S: New file.
> +
> 2018-10-11 Feng Xue <feng.xue@amperecomputing.com>
>
> * manual/tunables.texi (Tunable glibc.cpu.name): Add skylark.
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index b1a5f59..828ce4f 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -1,5 +1,6 @@
> ifeq ($(subdir),string)
> sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
> - memcpy_falkor memmove_falkor memset_generic memset_falkor \
> + memcpy_falkor memmove_falkor \
> + memset_generic memset_falkor memset_skylark \
> strlen_generic strlen_asimd
> endif
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index af44665..baf01a0 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -51,6 +51,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> /* Enable this on non-falkor processors too so that other cores
> can do a comparative analysis with __memset_generic. */
> IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
> + IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_skylark)
> IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
>
> IFUNC_IMPL (i, name, strlen,
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> index d74ed3a..b732a7e 100644
> --- a/sysdeps/aarch64/multiarch/memset.c
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -29,12 +29,15 @@
> extern __typeof (__redirect_memset) __libc_memset;
>
> extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_skylark attribute_hidden;
> extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
>
> libc_ifunc (__libc_memset,
> ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
> ? __memset_falkor
> - : __memset_generic));
> + : (IS_SKYLARK (midr) && zva_size == 64
> + ? __memset_skylark
> + : __memset_generic)));
>
> # undef memset
> strong_alias (__libc_memset, memset);
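
(side note: since patch 1/3 adds skylark to the glibc.cpu.name tunable,
the new variant can presumably also be forced on other cores for
testing, e.g. GLIBC_TUNABLES=glibc.cpu.name=skylark, provided their
zva_size is 64.)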
> diff --git a/sysdeps/aarch64/multiarch/memset_skylark.S b/sysdeps/aarch64/multiarch/memset_skylark.S
> new file mode 100644
> index 0000000..22bf576
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_skylark.S
> @@ -0,0 +1,176 @@
> +/* Optimized memset for AmpereComputing skylark processor.
> + Copyright (C) 2018 Free Software Foundation, Inc.
> +
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library. If not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +#include "memset-reg.h"
> +
> +#if IS_IN (libc)
> +# define MEMSET __memset_skylark
> +
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64, unaligned accesses
> + *
> + */
> +
> +ENTRY_ALIGN (MEMSET, 6)
> +
> + DELOUSE (0)
> + DELOUSE (2)
> +
> + bfi valw, valw, 8, 8
> + bfi valw, valw, 16, 16
> + bfi val, val, 32, 32
> +
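(the three bfi steps just replicate the fill byte across val: e.g.
valw = 0x2a becomes 0x2a2a, then 0x2a2a2a2a, and finally val =
0x2a2a2a2a2a2a2a2a.)
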
> + add dstend, dstin, count
> +
> + cmp count, 96
> + b.hi L(set_long)
> + cmp count, 16
> + b.hs L(set_medium)
> +
> + /* Set 0..15 bytes. */
> + tbz count, 3, 1f
> + str val, [dstin]
> + str val, [dstend, -8]
> + ret
> +
> + .p2align 3
> +1: tbz count, 2, 2f
> + str valw, [dstin]
> + str valw, [dstend, -4]
> + ret
> +2: cbz count, 3f
> + strb valw, [dstin]
> + tbz count, 1, 3f
> + strh valw, [dstend, -2]
> +3: ret
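
(the 0..15 byte path works by overlapping stores keyed off individual
count bits: e.g. for count == 11, bit 3 is set, so the two 8-byte
stores at [dstin] and [dstend, -8] cover bytes 0-7 and 3-10, with the
overlap doing no harm.)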
> +
> + .p2align 3
> + /* Set 16..96 bytes. */
> +L(set_medium):
> + stp val, val, [dstin]
> + tbnz count, 6, L(set96)
> + stp val, val, [dstend, -16]
> + tbz count, 5, 1f
> + stp val, val, [dstin, 16]
> + stp val, val, [dstend, -32]
> +1: ret
> +
> + .p2align 4
> + /* Set 64..96 bytes. Write 64 bytes from the start and
> + 32 bytes from the end. */
> +L(set96):
> + stp val, val, [dstin, 16]
> + stp val, val, [dstin, 32]
> + stp val, val, [dstin, 48]
> + stp val, val, [dstend, -32]
> + stp val, val, [dstend, -16]
> + ret
> +
> + .p2align 4
> +L(set_long):
> + stp val, val, [dstin]
> + cmp count, 512
> + ccmp val, 0, 0, cs
> + bic dst, dstin, 15
> + b.eq L(zva_64)
> +
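(neat use of ccmp: when count >= 512 (cs) it compares val against 0,
otherwise it forces the flags to ne via the #0 nzcv, so the b.eq
reaches L(zva_64) only for large zeroing memsets.)
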
> + /* Small-size or non-zero memset does not use DC ZVA. */
> + sub count, dstend, dst
> +
> +	/*
> +	 * Adjust count and bias for the loop. By subtracting an extra 1
> +	 * from count, it is easy to use the tbz instruction to check
> +	 * whether the loop tail count is less than 33 bytes, so as to
> +	 * bypass 2 unnecessary stps.
> +	 */
> + sub count, count, 64+16+1
> + nop
> +
> +1: stp val, val, [dst, 16]
> + stp val, val, [dst, 32]
> + stp val, val, [dst, 48]
> + stp val, val, [dst, 64]!
> + subs count, count, 64
> + b.hs 1b
> +
> + tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
> + stp val, val, [dst, 16]
> + stp val, val, [dst, 32]
> +1: stp val, val, [dstend, -32]
> + stp val, val, [dstend, -16]
> + ret
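
(spelling out the bias: at loop exit count is negative and the bytes
not yet set are count + 65. e.g. count == -40 leaves 25 bytes, bit 5
of -40 is clear, and the tbz skips both stps, leaving the tail to the
two overlapping stores at dstend; count == -30 leaves 35 bytes, bit 5
is set, and the extra pair runs.)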
> +
> + .p2align 3
> +L(zva_64):
> + stp val, val, [dst, 16]
> + stp val, val, [dst, 32]
> + stp val, val, [dst, 48]
> + bic dst, dst, 63
> +
> +	/*
> +	 * Previous memory writes might cross a cache line boundary and
> +	 * leave a cache line partially dirty. Zeroing such a cache line
> +	 * using DC ZVA will incur extra cost, for it requires loading the
> +	 * untouched part of the line from memory before zeroing.
> +	 *
> +	 * So, write the first 64-byte aligned block using stp to force a
> +	 * fully dirty cache line.
> +	 */
> + stp val, val, [dst, 64]
> + stp val, val, [dst, 80]
> + stp val, val, [dst, 96]
> + stp val, val, [dst, 112]
> +
> + sub count, dstend, dst
> +	/*
> +	 * Adjust count and bias for the loop. By subtracting an extra 1
> +	 * from count, it is easy to use the tbz instruction to check
> +	 * whether the loop tail count is less than 33 bytes, so as to
> +	 * bypass 2 unnecessary stps.
> +	 */
> + sub count, count, 128+64+64+1
> + add dst, dst, 128
> + nop
> +
> + /* DC ZVA sets 64 bytes each time. */
> +1: dc zva, dst
> + add dst, dst, 64
> + subs count, count, 64
> + b.hs 1b
> +
> +	/*
> +	 * Write the last 64-byte aligned block using stp to force a
> +	 * fully dirty cache line.
> +	 */
> + stp val, val, [dst, 0]
> + stp val, val, [dst, 16]
> + stp val, val, [dst, 32]
> + stp val, val, [dst, 48]
> +
> + tbz count, 5, 1f /* Remaining count is less than 33 bytes? */
> + stp val, val, [dst, 64]
> + stp val, val, [dst, 80]
> +1: stp val, val, [dstend, -32]
> + stp val, val, [dstend, -16]
> + ret
> +
> +END (MEMSET)
> +libc_hidden_builtin_def (MEMSET)
> +
> +#endif
>