[PATCH] powerpc: Optimized strcpy for POWER9

Adhemerval Zanella adhemerval.zanella@linaro.org
Mon May 4 12:58:34 GMT 2020



On 01/05/2020 01:52, Anton Blanchard via Libc-alpha wrote:
> This version is significantly faster on small strings and relatively
> unaligned large strings.

So it seems that it uses the ISA 3.0 partial stores to optimize the use of
vector instructions.  Could you mention that in the commit message?

The power8 version also adds an stpcpy version (which I am not sure is
really a gain for short strings) based on its strcpy.  It might be worth
checking whether this implementation could be adapted to be built as
stpcpy as well.

Usually for such optimizations we try to get baseline benchmark results
using the glibc benchtests.  Could you post the results from before and after?

(Some of the glibc benchtests do have some shortcomings, but at least the
results document some of the expected gains and performance differences with
the new implementation.)

> ---
>  sysdeps/powerpc/powerpc64/le/power9/strcpy.S  | 144 ++++++++++++++++++
>  sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
>  .../powerpc64/multiarch/ifunc-impl-list.c     |   4 +
>  .../powerpc64/multiarch/strcpy-power9.S       |  26 ++++
>  sysdeps/powerpc/powerpc64/multiarch/strcpy.c  |   7 +
>  5 files changed, 182 insertions(+), 1 deletion(-)
>  create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strcpy.S
>  create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
> 
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> new file mode 100644
> index 0000000000..5749228054
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> @@ -0,0 +1,144 @@
> +/* Optimized strcpy implementation for PowerPC64/POWER9.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +#ifndef STRCPY
> +# define STRCPY strcpy
> +#endif
> +
> +/* Implements the function
> +
> +   char * [r3] strcpy (char *dest [r3], const char *src [r4])
> +
> +   The implementation can load bytes past a null terminator, but only
> +   up to the next 16B boundary, so it never crosses a page.  */
> +
> +.machine power9
> +ENTRY_TOCLESS (STRCPY, 4)
> +	CALL_MCOUNT 2
> +
> +	/* NULL string optimisation  */
> +	lbz	r0,0(r4)
> +	stb	r0,0(r3)
> +	cmpwi	r0,0
> +	beqlr

Is this a worthwhile optimization? None of the other arch-specific strcpy
optimizations does it.

> +
> +	addi	r4,r4,1
> +	addi	r11,r3,1
> +
> +	vspltisb v18,0		/* Zeroes in v18  */
> +
> +	neg	r5,r4
> +	rldicl	r9,r5,0,60	/* How many bytes to get source 16B aligned?  */
> +
> +	/* Get source 16B aligned  */
> +	lvx	v0,0,r4
> +	lvsr	v1,0,r4
> +	vperm	v0,v18,v0,v1
> +
> +	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	addi	r8,r8,1		/* Add null terminator  */
> +
> +	/* r8 = bytes including null
> +	   r9 = bytes to get source 16B aligned
> +	   if r8 > r9
> +	      no null, copy r9 bytes
> +	   else
> +	      there is a null, copy r8 bytes and return.  */
> +	cmpd	r8,r9
> +	bgt	L(no_null)
> +
> +	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	blr
> +
> +L(no_null):
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	add	r4,r4,r9
> +	add	r11,r11,r9
> +
> +L(loop):

Should we enforce alignment here?

> +	lxv	32+v0,0(r4)
> +	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
> +	bne	cr6,L(tail1)
> +
> +	lxv	32+v1,16(r4)
> +	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
> +	bne	cr6,L(tail2)
> +
> +	lxv	32+v2,32(r4)
> +	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
> +	bne	cr6,L(tail3)
> +
> +	lxv	32+v3,48(r4)
> +	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
> +	bne	cr6,L(tail4)
> +
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	stxv	32+v3,48(r11)

Out of curiosity, why unroll 4x here?

> +
> +	addi	r4,r4,64
> +	addi	r11,r11,64
> +
> +	b	L(loop)
> +
> +L(tail1):
> +	vctzlsbb r8,v6
> +	addi	r8,r8,1
> +	sldi	r9,r8,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r9
> +	blr
> +
> +L(tail2):
> +	stxv	32+v0,0(r11)
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	addi	r8,r8,1		/* Add null terminator  */
> +	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,16
> +	stxvl	32+v1,r11,r10	/* Partial store  */
> +	blr
> +
> +L(tail3):
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	addi	r8,r8,1		/* Add null terminator  */
> +	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,32
> +	stxvl	32+v2,r11,r10	/* Partial store  */
> +	blr
> +
> +L(tail4):
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	addi	r8,r8,1		/* Add null terminator  */
> +	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,48
> +	stxvl	32+v3,r11,r10	/* Partial store  */
> +	blr
> +END (STRCPY)
> +libc_hidden_builtin_def (strcpy)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index ea936bf9ed..db11345053 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
>  		   strncase-power8
>  
>  ifneq (,$(filter %le,$(config-machine)))
> -sysdep_routines += strcmp-power9 strncmp-power9
> +sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9
>  endif
>  CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
>  CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops

Ok.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index b9fef3f43c..ad11ede20e 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -85,6 +85,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  
>    /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c.  */
>    IFUNC_IMPL (i, name, strcpy,
> +#ifdef __LITTLE_ENDIAN__
> +	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
> +			      __strcpy_power9)
> +#endif
>  	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
>  			      __strcpy_power8)
>  	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX,

Ok.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
> new file mode 100644
> index 0000000000..d22aa0a8d6
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
> @@ -0,0 +1,26 @@
> +/* Optimized strcpy implementation for POWER9/PPC64.
> +   Copyright (C) 2016-2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
> +#define STRCPY __strcpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/le/power9/strcpy.S>
> +#endif

Ok.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
> index 66ea93b8f4..dcdee5181f 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
> @@ -25,9 +25,16 @@
>  extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
>  extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
>  extern __typeof (strcpy) __strcpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (strcpy) __strcpy_power9 attribute_hidden;
> +# endif
>  #undef strcpy
>  
>  libc_ifunc_redirected (__redirect_strcpy, strcpy,
> +# ifdef __LITTLE_ENDIAN__
> +			(hwcap2 & PPC_FEATURE2_ARCH_3_00)
> +			? __strcpy_power9 :
> +# endif
>  		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
>  		       ? __strcpy_power8
>  		       : (hwcap & PPC_FEATURE_HAS_VSX)
> 

Ok.


More information about the Libc-alpha mailing list