This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [PATCH 2/3] aarch64: Optimized memset specific to AmpereComputing skylark


On 12/10/18 12:41, Xue Feng wrote:
> This version uses general-register based memory stores instead of
> vector-register based ones, since the former are faster than the
> latter on skylark.
> 

If that's the only difference compared to memset_falkor.S, then it
would be nice to know how much using x vs q regs matters for
performance on falkor and skylark.

If it's < 10%, then don't add another memset variant.
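
For reference, the two store styles under discussion look roughly
like this.  This is a minimal sketch of the inner loops only, with
illustrative register assignments, not the exact code of either
file:

	/* 64 bytes per iteration using general (x) registers,
	   in the style of memset_skylark.S.  */
1:	stp	x1, x1, [x0, 16]
	stp	x1, x1, [x0, 32]
	stp	x1, x1, [x0, 48]
	stp	x1, x1, [x0, 64]!
	subs	x2, x2, 64
	b.hs	1b

	/* 64 bytes per iteration using SIMD (q) registers,
	   in the style of memset_falkor.S.  */
1:	stp	q0, q0, [x0, 32]
	stp	q0, q0, [x0, 64]!
	subs	x2, x2, 64
	b.hs	1b

A bench-memset run comparing the two loops on both cores would
answer that.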

> The DC ZVA size on skylark is 64 bytes, which the IFUNC dispatch
> uses to select this memset, so that the cost of a runtime check on
> the DC ZVA size can be saved.
> 
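
(The runtime check being saved is the DC ZVA block-size discovery:
a memset that cannot assume the size must read DCZID_EL0 itself,
roughly as in the hypothetical sketch below.  glibc instead reads
DCZID_EL0 once when CPU features are initialized, which is where
the zva_size used by the ifunc resolver comes from.)

	/* DCZID_EL0: bit 4 (DZP) set means DC ZVA is prohibited;
	   bits 3:0 hold log2 of the block size in 4-byte words,
	   so the size in bytes is 4 << (dczid & 15).  */
	mrs	x3, dczid_el0
	tbnz	x3, 4, L(no_zva)	/* DC ZVA not usable.  */
	and	x3, x3, 15
	mov	x4, 4
	lsl	x4, x4, x3		/* x4 = DC ZVA block size.  */
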
>     * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
>     Add memset_skylark.
>     * sysdeps/aarch64/multiarch/ifunc-impl-list.c
>     (__libc_ifunc_impl_list): Add __memset_skylark to memset ifunc.
>     * sysdeps/aarch64/multiarch/memset.c (libc_ifunc):
>     Add IS_SKYLARK check for ifunc dispatch.
>     * sysdeps/aarch64/multiarch/memset_skylark.S: New file.
> ---
>  ChangeLog                                   |  10 ++
>  sysdeps/aarch64/multiarch/Makefile          |   3 +-
>  sysdeps/aarch64/multiarch/ifunc-impl-list.c |   1 +
>  sysdeps/aarch64/multiarch/memset.c          |   5 +-
>  sysdeps/aarch64/multiarch/memset_skylark.S  | 176 ++++++++++++++++++++++++++++
>  5 files changed, 193 insertions(+), 2 deletions(-)
>  create mode 100644 sysdeps/aarch64/multiarch/memset_skylark.S
> 
> diff --git a/ChangeLog b/ChangeLog
> index 2533c9d..28370f9 100644
> --- a/ChangeLog
> +++ b/ChangeLog
> @@ -1,3 +1,13 @@
> +2018-10-12  Feng Xue  <feng.xue@amperecomputing.com>
> +
> +	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
> +	Add memset_skylark.
> +	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +	(__libc_ifunc_impl_list): Add __memset_skylark to memset ifunc.
> +	* sysdeps/aarch64/multiarch/memset.c (libc_ifunc):
> +	Add IS_SKYLARK check for ifunc dispatch.
> +	* sysdeps/aarch64/multiarch/memset_skylark.S: New file.
> +
>  2018-10-11  Feng Xue  <feng.xue@amperecomputing.com>
>  
>  	* manual/tunables.texi (Tunable glibc.cpu.name): Add skylark.
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index b1a5f59..828ce4f 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -1,5 +1,6 @@
>  ifeq ($(subdir),string)
>  sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
> -		   memcpy_falkor memmove_falkor memset_generic memset_falkor \
> +		   memcpy_falkor memmove_falkor \
> +		   memset_generic memset_falkor memset_skylark \
>  		   strlen_generic strlen_asimd
>  endif
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index af44665..baf01a0 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -51,6 +51,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  	      /* Enable this on non-falkor processors too so that other cores
>  		 can do a comparative analysis with __memset_generic.  */
>  	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
> +	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_skylark)
>  	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
>  
>    IFUNC_IMPL (i, name, strlen,
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> index d74ed3a..b732a7e 100644
> --- a/sysdeps/aarch64/multiarch/memset.c
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -29,12 +29,15 @@
>  extern __typeof (__redirect_memset) __libc_memset;
>  
>  extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_skylark attribute_hidden;
>  extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
>  
>  libc_ifunc (__libc_memset,
>  	    ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64
>  	     ? __memset_falkor
> -	     : __memset_generic));
> +	     : (IS_SKYLARK (midr) && zva_size == 64
> +	       ? __memset_skylark
> +	       : __memset_generic)));
>  
>  # undef memset
>  strong_alias (__libc_memset, memset);
> diff --git a/sysdeps/aarch64/multiarch/memset_skylark.S b/sysdeps/aarch64/multiarch/memset_skylark.S
> new file mode 100644
> index 0000000..22bf576
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_skylark.S
> @@ -0,0 +1,176 @@
> +/* Optimized memset for AmpereComputing skylark processor.
> +   Copyright (C) 2018 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +#include "memset-reg.h"
> +
> +#if IS_IN (libc)
> +# define MEMSET __memset_skylark
> +
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64, unaligned accesses
> + *
> + */
> +
> +ENTRY_ALIGN (MEMSET, 6)
> +
> +	DELOUSE (0)
> +	DELOUSE (2)
> +
> +	bfi	valw, valw, 8, 8
> +	bfi	valw, valw, 16, 16
> +	bfi	val, val, 32, 32
> +
> +	add	dstend, dstin, count
> +
> +	cmp	count, 96
> +	b.hi	L(set_long)
> +	cmp	count, 16
> +	b.hs	L(set_medium)
> +
> +	/* Set 0..15 bytes.  */
> +	tbz	count, 3, 1f
> +	str	val, [dstin]
> +	str	val, [dstend, -8]
> +	ret
> +
> +	.p2align 3
> +1:	tbz	count, 2, 2f
> +	str	valw, [dstin]
> +	str	valw, [dstend, -4]
> +	ret
> +2:	cbz	count, 3f
> +	strb	valw, [dstin]
> +	tbz	count, 1, 3f
> +	strh	valw, [dstend, -2]
> +3:	ret
> +
> +	.p2align 3
> +	/* Set 16..96 bytes.  */
> +L(set_medium):
> +	stp	val, val, [dstin]
> +	tbnz	count, 6, L(set96)
> +	stp	val, val, [dstend, -16]
> +	tbz	count, 5, 1f
> +	stp	val, val, [dstin, 16]
> +	stp	val, val, [dstend, -32]
> +1:	ret
> +
> +	.p2align 4
> +	/* Set 64..96 bytes.  Write 64 bytes from the start and
> +	   32 bytes from the end.  */
> +L(set96):
> +	stp	val, val, [dstin, 16]
> +	stp	val, val, [dstin, 32]
> +	stp	val, val, [dstin, 48]
> +	stp	val, val, [dstend, -32]
> +	stp	val, val, [dstend, -16]
> +	ret
> +
> +	.p2align 4
> +L(set_long):
> +	stp	val, val, [dstin]
> +	cmp	count, 512
> +	ccmp	val, 0, 0, cs
> +	bic	dst, dstin, 15
> +	b.eq	L(zva_64)
> +
> +	/* Small-size or non-zero memset does not use DC ZVA. */
> +	sub	count, dstend, dst
> +
> +	/*
> +	 * Adjust count and bias it for the loop. Subtracting an extra 1
> +	 * from count lets a single tbz check whether the tail left after
> +	 * the loop is less than 33 bytes, so that 2 unnecessary stps can
> +	 * be bypassed.
> +	 */
> +	sub	count, count, 64+16+1
> +	nop
> +
> +1:	stp	val, val, [dst, 16]
> +	stp	val, val, [dst, 32]
> +	stp	val, val, [dst, 48]
> +	stp	val, val, [dst, 64]!
> +	subs	count, count, 64
> +	b.hs	1b
> +
> +	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes? */
> +	stp	val, val, [dst, 16]
> +	stp	val, val, [dst, 32]
> +1:	stp	val, val, [dstend, -32]
> +	stp	val, val, [dstend, -16]
> +	ret
> +
> +	.p2align 3
> +L(zva_64):
> +	stp	val, val, [dst, 16]
> +	stp	val, val, [dst, 32]
> +	stp	val, val, [dst, 48]
> +	bic	dst, dst, 63
> +
> +	/*
> +	 * The previous memory writes might cross a cache line boundary
> +	 * and leave a cache line partially dirty. Zeroing such a line with
> +	 * DC ZVA would incur extra cost, since the untouched part of the
> +	 * line must be loaded from memory before being zeroed.
> +	 *
> +	 * So, write the first 64-byte aligned block using stp to force a
> +	 * fully dirty cache line.
> +	 */
> +	stp	val, val, [dst, 64]
> +	stp	val, val, [dst, 80]
> +	stp	val, val, [dst, 96]
> +	stp	val, val, [dst, 112]
> +
> +	sub	count, dstend, dst
> +	/*
> +	 * Adjust count and bias it for the loop. Subtracting an extra 1
> +	 * from count lets a single tbz check whether the tail left after
> +	 * the loop is less than 33 bytes, so that 2 unnecessary stps can
> +	 * be bypassed.
> +	 */
> +	sub	count, count, 128+64+64+1
> +	add	dst, dst, 128
> +	nop
> +
> +	/* DC ZVA zeroes 64 bytes at a time. */
> +1:	dc	zva, dst
> +	add	dst, dst, 64
> +	subs	count, count, 64
> +	b.hs	1b
> +
> +	/*
> +	 * Write the last 64-byte aligned block using stp to force a
> +	 * fully dirty cache line.
> +	 */
> +	stp	val, val, [dst, 0]
> +	stp	val, val, [dst, 16]
> +	stp	val, val, [dst, 32]
> +	stp	val, val, [dst, 48]
> +
> +	tbz	count, 5, 1f	/* Remaining count is less than 33 bytes? */
> +	stp	val, val, [dst, 64]
> +	stp	val, val, [dst, 80]
> +1:	stp	val, val, [dstend, -32]
> +	stp	val, val, [dstend, -16]
> +	ret
> +
> +END (MEMSET)
> +libc_hidden_builtin_def (MEMSET)
> +
> +#endif
> 
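
A note on the 64+16+1 and 128+64+64+1 biases, since the comments
are terse (this is my reading of the loops, worth double-checking):
in the non-ZVA loop, count is kept equal to dstend - dst - 81, so
the loop exits with count in [-64, -1], leaving count + 65 bytes
(between 1 and 64) unset after dst + 16; the ZVA path maintains the
same count + 65 relation for the bytes left after its trailing stp
block.  Bit 5 of the now-negative count is set exactly for count in
[-32, -1], i.e. when more than 32 bytes remain, which is precisely
what the tbz tests.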

