[PATCH] powerpc: Optimized strcpy for POWER9
Adhemerval Zanella
adhemerval.zanella@linaro.org
Mon May 4 12:58:34 GMT 2020
On 01/05/2020 01:52, Anton Blanchard via Libc-alpha wrote:
> This version is significantly faster on small strings and relatively
> unaligned large strings.
So it seems that it uses the ISA 3.0 partial stores to optimize vector
instruction usage; could you add that to the commit message?
The power8 version also adds an stpcpy version (which I am not sure is
really a gain for short strings) based on its strcpy. Maybe it would
be a good thing to check whether this could be adapted to be built as
stpcpy as well.
Usually for such optimizations we try to get baseline benchmark results
using the glibc benchtests. Could you post the results for before and after?
(Some of the glibc benchtests do have some shortcomings, but at least this
documents some of the expected gains and performance differences with the
new implementation.)
> ---
> sysdeps/powerpc/powerpc64/le/power9/strcpy.S | 144 ++++++++++++++++++
> sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
> .../powerpc64/multiarch/ifunc-impl-list.c | 4 +
> .../powerpc64/multiarch/strcpy-power9.S | 26 ++++
> sysdeps/powerpc/powerpc64/multiarch/strcpy.c | 7 +
> 5 files changed, 182 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
>
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> new file mode 100644
> index 0000000000..5749228054
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
> @@ -0,0 +1,144 @@
> +/* Optimized strcpy implementation for PowerPC64/POWER9.
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +#ifndef STRCPY
> +# define STRCPY strcpy
> +#endif
> +
> +/* Implements the function
> +
> + char * [r3] strcpy (char *dest [r3], const char *src [r4])
> +
> + The implementation can load bytes past a null terminator, but only
> + up to the next 16B boundary, so it never crosses a page. */
> +
> +.machine power9
> +ENTRY_TOCLESS (STRCPY, 4)
> + CALL_MCOUNT 2
> +
> + /* NULL string optimisation */
> + lbz r0,0(r4)
> + stb r0,0(r3)
> + cmpwi r0,0
> + beqlr
Is it a worthwhile optimization? None of the other strcpy arch optimizations
do it.
> +
> + addi r4,r4,1
> + addi r11,r3,1
> +
> + vspltisb v18,0 /* Zeroes in v18 */
> +
> + neg r5,r4
> + rldicl r9,r5,0,60 /* How many bytes to get source 16B aligned? */
> +
> + /* Get source 16B aligned */
> + lvx v0,0,r4
> + lvsr v1,0,r4
> + vperm v0,v18,v0,v1
> +
> + vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + addi r8,r8,1 /* Add null terminator */
> +
> + /* r8 = bytes including null
> + r9 = bytes to get source 16B aligned
> + if r8 > r9
> + no null, copy r9 bytes
> + else
> + there is a null, copy r8 bytes and return. */
> + cmpd r8,r9
> + bgt L(no_null)
> +
> + sldi r10,r8,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> +
> + blr
> +
> +L(no_null):
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> +
> + add r4,r4,r9
> + add r11,r11,r9
> +
> +L(loop):
Should we enforce alignment here?
> + lxv 32+v0,0(r4)
> + vcmpequb. v6,v0,v18 /* Any zero bytes? */
> + bne cr6,L(tail1)
> +
> + lxv 32+v1,16(r4)
> + vcmpequb. v6,v1,v18 /* Any zero bytes? */
> + bne cr6,L(tail2)
> +
> + lxv 32+v2,32(r4)
> + vcmpequb. v6,v2,v18 /* Any zero bytes? */
> + bne cr6,L(tail3)
> +
> + lxv 32+v3,48(r4)
> + vcmpequb. v6,v3,v18 /* Any zero bytes? */
> + bne cr6,L(tail4)
> +
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + stxv 32+v3,48(r11)
Out of curiosity, why unroll 4x here?
> +
> + addi r4,r4,64
> + addi r11,r11,64
> +
> + b L(loop)
> +
> +L(tail1):
> + vctzlsbb r8,v6
> + addi r8,r8,1
> + sldi r9,r8,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r9
> + blr
> +
> +L(tail2):
> + stxv 32+v0,0(r11)
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + addi r8,r8,1 /* Add null terminator */
> + sldi r10,r8,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,16
> + stxvl 32+v1,r11,r10 /* Partial store */
> + blr
> +
> +L(tail3):
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + addi r8,r8,1 /* Add null terminator */
> + sldi r10,r8,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,32
> + stxvl 32+v2,r11,r10 /* Partial store */
> + blr
> +
> +L(tail4):
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + addi r8,r8,1 /* Add null terminator */
> + sldi r10,r8,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,48
> + stxvl 32+v3,r11,r10 /* Partial store */
> + blr
> +END (STRCPY)
> +libc_hidden_builtin_def (strcpy)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index ea936bf9ed..db11345053 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
> strncase-power8
>
> ifneq (,$(filter %le,$(config-machine)))
> -sysdep_routines += strcmp-power9 strncmp-power9
> +sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9
> endif
> CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
> CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
Ok.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index b9fef3f43c..ad11ede20e 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -85,6 +85,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c. */
> IFUNC_IMPL (i, name, strcpy,
> +#ifdef __LITTLE_ENDIAN__
> + IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
> + __strcpy_power9)
> +#endif
> IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
> __strcpy_power8)
> IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX,
Ok.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
> new file mode 100644
> index 0000000000..d22aa0a8d6
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power9.S
> @@ -0,0 +1,26 @@
> +/* Optimized strcpy implementation for POWER9/PPC64.
> + Copyright (C) 2016-2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
> +#define STRCPY __strcpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/le/power9/strcpy.S>
> +#endif
Ok.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
> index 66ea93b8f4..dcdee5181f 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
> @@ -25,9 +25,16 @@
> extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
> extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
> extern __typeof (strcpy) __strcpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (strcpy) __strcpy_power9 attribute_hidden;
> +# endif
> #undef strcpy
>
> libc_ifunc_redirected (__redirect_strcpy, strcpy,
> +# ifdef __LITTLE_ENDIAN__
> + (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> + ? __strcpy_power9 :
> +# endif
> (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> ? __strcpy_power8
> : (hwcap & PPC_FEATURE_HAS_VSX)
>
Ok.
More information about the Libc-alpha
mailing list