This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH v2] aarch64: Hoist ZVA check out of the memset function


On 03/10/17 12:52, Siddhesh Poyarekar wrote:
> The DZP bit in the dczid_el0 register does not change dynamically, so
> it is safe to read once during program startup.  Hoist the zva check
> into an ifunc resolver and store the result into a static variable,
> which can be read in case of non-standard zva sizes.  This effectively
> adds 3 ifunc variants for memset - one for cases where zva is
> disabled, one for 64 byte zva and another for 128 byte zva.  I have
> retained the older memset as __memset_generic for internal libc.so use
> so that the change impact is minimal.  We should eventually have a
> discussion on what is more expensive, reading dczid_el0 on every
> memset invocation or the indirection due to PLT.
> 
> The gains due to this are significant for falkor, with gains as high
> as 80% in some cases.  Likewise for mustang, although the numbers are
> slightly lower.  Here's a sample from the falkor tests:
> 

the principle is ok, but the code is still not clear to me.

> +/* Memset entry point.  */
> +ENTRY_ALIGN (MEMSET, 6)
>  
>  	DELOUSE (0)
>  	DELOUSE (2)
> @@ -46,9 +147,9 @@ ENTRY_ALIGN (__memset, 6)
>  	add	dstend, dstin, count
>  
>  	cmp	count, 96
> -	b.hi	L(set_long)
> +	b.hi	MEMSET_L(set_long)
>  	cmp	count, 16
> -	b.hs	L(set_medium)
> +	b.hs	MEMSET_L(set_medium)
>  	mov	val, v0.D[0]
>  
>  	/* Set 0..15 bytes.  */
> @@ -68,9 +169,9 @@ ENTRY_ALIGN (__memset, 6)
>  3:	ret
>  
>  	/* Set 17..96 bytes.  */
> -L(set_medium):
> +MEMSET_L(set_medium):

why do we need different label names for the different memsets?

i'd expect a localized change where there are 4 variants of a
piece of code under different ifdefs.

now it's hard to see what's going on, e.g. how code alignment of
various parts of memset is affected.

> +#else
> +	/* Memset called through PLT, so we need only one of the ZVA
> +	   variants.  */
> +# ifdef MEMSET_ZVA
> +	and	valw, valw, 255
> +# endif
> +	bic	dst, dstin, 15
> +	str	q0, [dstin]
> +# ifdef MEMSET_ZVA
> +	cmp	count, 256
> +	ccmp	valw, 0, 0, cs
> +	b.eq	MEMSET_L(try_zva)
> +# endif
> +MEMSET_L(no_zva):
> +	do_no_zva
> +# if defined MEMSET_ZVA
> +MEMSET_L(try_zva):
> +#  if MEMSET_ZVA == 64
> +	do_zva_64
> +#  elif MEMSET_ZVA == 128
> +	do_zva_128
> +#  else
> +	adrp	zva_len, __aarch64_zva_size
> +	ldr	zva_len, [zva_len, #:lo12:__aarch64_zva_size]
> +	do_zva_default

i don't think we need this global,
i'd just do mrs	dczid_el0 here.
(this code should be unreachable on current cpus)

> +#  endif
> +# endif
> +#endif
>  

> +++ b/sysdeps/aarch64/multiarch/memset_zva.S
> @@ -0,0 +1,41 @@
> +/* Memset for aarch64, ZVA enabled.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)
> +# define MEMSET __memset_zva_64
> +# define MEMSET_ZVA 64
> +# define MEMSET_L(label) L(label ## _zva64)
> +# include <sysdeps/aarch64/memset.S>
> +
> +# undef MEMSET
> +# undef MEMSET_ZVA
> +# undef MEMSET_L
> +# define MEMSET __memset_zva_128
> +# define MEMSET_ZVA 128
> +# define MEMSET_L(label) L(label ## _zva128)
> +# include <sysdeps/aarch64/memset.S>
> +
> +# undef MEMSET
> +# undef MEMSET_ZVA
> +# undef MEMSET_L
> +# define MEMSET __memset_zva_default
> +# define MEMSET_ZVA 1
> +# define MEMSET_L(label) L(label ## _zvadef)
> +# include <sysdeps/aarch64/memset.S>
> +#endif

i think this is not ok, these should be in different files.

you put code that is never used together in the same object file.

(ideally the ifunc code for various cpus should be grouped
together by cpu and on different pages for each cpu so you
dont even load the code at runtime that is not for your cpu)

> diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S
> new file mode 100644
> index 0000000..969cf14
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/rtld-memset.S
> @@ -0,0 +1,25 @@
> +/* Memset for aarch64 for use within the dynamic linker.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (rtld)
> +# define MEMSET memset
> +# define INTERNAL_MEMSET
> +# define MEMSET_L(label) L(label)
> +# include <sysdeps/aarch64/memset.S>
> +#endif

is this performance critical?
(i don't know if it's only used for early dynlinker startup code
or all libc internal memsets)

if not then we don't need it to be different variant, it could be
just a copy of the _nozva/_defaultzva case

> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> index e769eeb..092ee81 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> @@ -20,6 +20,9 @@
>  #include <sys/auxv.h>
>  #include <elf/dl-hwcaps.h>
>  
> +#define DCZID_DZP_MASK (1 << 4)
> +#define DCZID_BS_MASK (0xf)
> +
>  #if HAVE_TUNABLES
>  struct cpu_list
>  {
> @@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>      }
>  
>    cpu_features->midr_el1 = midr;
> +
> +  /* Check if ZVA is enabled.  */
> +  unsigned dczid;
> +  asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
> +
> +  if ((dczid & DCZID_DZP_MASK) == 0)
> +    cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
>  }

from this code it's not clear to me that cpu_features->zva_size
is always initialized.

> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> index 73cb53d..f2b6afd 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> @@ -47,6 +47,7 @@
>  struct cpu_features
>  {
>    uint64_t midr_el1;
> +  unsigned zva_size;
>  };
>  
>  #endif /* _CPU_FEATURES_AARCH64_H  */
> 


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]