This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH 3/3] sparc: M7 optimized memcpy/mempcpy/memmove/memset/bzero.
On 27/09/2017 13:09, Patrick McGehearty wrote:
> Tested in sparcv9-*-* and sparc64-*-* targets in both multi and
> non-multi arch configurations.
>
> Support added to identify Sparc M7/T7/S7/M8/T8 processor capability.
> Usual "make check" correctness tests run with no regressions.
> Performance tests run on Sparc S7 using new code and old niagara4 code.
>
> Optimizations for memcpy also apply to mempcpy and memmove
> where they share code. Optimizations for memset also apply
> to bzero as they share code.
>
> For memcpy/mempcpy/memmove, performance comparison with niagara4 code:
> Long word aligned data
> 0-127 bytes - minimal changes
> 128-1023 bytes - 7-30% gain
> 1024+ bytes - 1-7% gain (in cache); 30-100% gain (out of cache)
> Word aligned data
> 0-127 bytes - 50%+ gain
> 128-1023 bytes - 10-200% gain
> 1024+ bytes - 0-15% gain (in cache); 5-50% gain (out of cache)
> Unaligned data
> 0-127 bytes - 0-70%+ gain
> 128-447 bytes - 40-80%+ gain
> 448-511 bytes - 1-3% loss
> 512-4096 bytes - 2-3% gain (in cache); 0-20% gain (out of cache)
> 4096+ bytes - +/- 3% (in cache); 20-50% gain (out of cache)
>
> For memset/bzero, performance comparison with niagara4 code:
> For memset nonzero data,
> 256-1023 bytes - 60-90% gain (in cache); 5% gain (out of cache)
> 1K+ bytes - 80-260% gain (in cache); 40-80% gain (out of cache)
> For memset zero data (and bzero),
> 256-1023 bytes - 80-120% gain (in cache), 0% gain (out of cache)
> 1024+ bytes - 2-4x gain (in cache), 10-35% gain (out of cache)
> ---
> ChangeLog | 20 +
> sysdeps/sparc/sparc32/sparcv9/multiarch/Makefile | 3 +-
> .../sparcv9/multiarch/memcpy-memmove-niagara7.S | 2 +
> sysdeps/sparc/sparc32/sparcv9/multiarch/memmove.S | 2 +
> .../sparc32/sparcv9/multiarch/memset-niagara7.S | 2 +
> .../sparc/sparc32/sparcv9/multiarch/rtld-memmove.c | 1 +
> sysdeps/sparc/sparc64/multiarch/Makefile | 3 +-
> sysdeps/sparc/sparc64/multiarch/ifunc-impl-list.c | 13 +
> .../sparc64/multiarch/memcpy-memmove-niagara7.S | 979 ++++++++++++++++++++
> sysdeps/sparc/sparc64/multiarch/memcpy.S | 28 +-
> sysdeps/sparc/sparc64/multiarch/memmove.S | 72 ++
> sysdeps/sparc/sparc64/multiarch/memset-niagara7.S | 334 +++++++
> sysdeps/sparc/sparc64/multiarch/memset.S | 28 +-
> sysdeps/sparc/sparc64/multiarch/rtld-memmove.c | 1 +
> 14 files changed, 1482 insertions(+), 6 deletions(-)
> create mode 100644 sysdeps/sparc/sparc32/sparcv9/multiarch/memcpy-memmove-niagara7.S
> create mode 100644 sysdeps/sparc/sparc32/sparcv9/multiarch/memmove.S
> create mode 100644 sysdeps/sparc/sparc32/sparcv9/multiarch/memset-niagara7.S
> create mode 100644 sysdeps/sparc/sparc32/sparcv9/multiarch/rtld-memmove.c
> create mode 100644 sysdeps/sparc/sparc64/multiarch/memcpy-memmove-niagara7.S
> create mode 100644 sysdeps/sparc/sparc64/multiarch/memmove.S
> create mode 100644 sysdeps/sparc/sparc64/multiarch/memset-niagara7.S
> create mode 100644 sysdeps/sparc/sparc64/multiarch/rtld-memmove.c
I would prefer if you split this patch in a memcpy/mempcpy/memmove and
a memset/bzero one since they are separated implementations.
I also noticed this whitespace issues, so you might want to check this out:
---
patch03:886: trailing whitespace.
* lines from memory. Use ST_CHUNK stores to first element of each cache
patch03:890: space before tab in indent.
andn %o2, 0x3f, %o5 /* %o5 is multiple of block size */
patch03:891: space before tab in indent.
and %o2, 0x3f, %o2 /* residue bytes in %o2 */
patch03:892: trailing whitespace.
patch03:1216: new blank line at EOF.
+
warning: 5 lines add whitespace errors.
---
> diff --git a/sysdeps/sparc/sparc64/multiarch/memcpy.S b/sysdeps/sparc/sparc64/multiarch/memcpy.S
> index b6396ee..d72f4b1 100644
> --- a/sysdeps/sparc/sparc64/multiarch/memcpy.S
> +++ b/sysdeps/sparc/sparc64/multiarch/memcpy.S
> @@ -27,7 +27,19 @@ ENTRY(memcpy)
> # ifdef SHARED
> SETUP_PIC_REG_LEAF(o3, o5)
> # endif
> - set HWCAP_SPARC_CRYPTO, %o1
> + set HWCAP_SPARC_ADP, %o1
> + andcc %o0, %o1, %g0
> + be 1f
> + nop
> +# ifdef SHARED
> + sethi %gdop_hix22(__memcpy_niagara7), %o1
> + xor %o1, %gdop_lox10(__memcpy_niagara7), %o1
> +# else
> + set __memcpy_niagara7, %o1
> +# endif
> + ba 10f
> + nop
> +1: set HWCAP_SPARC_CRYPTO, %o1
> andcc %o0, %o1, %g0
> be 1f
> andcc %o0, HWCAP_SPARC_N2, %g0
> @@ -89,7 +101,19 @@ ENTRY(__mempcpy)
> # ifdef SHARED
> SETUP_PIC_REG_LEAF(o3, o5)
> # endif
> - set HWCAP_SPARC_CRYPTO, %o1
> + set HWCAP_SPARC_ADP, %o1
> + andcc %o0, %o1, %g0
> + be 1f
> + nop
> +# ifdef SHARED
> + sethi %gdop_hix22(__mempcpy_niagara7), %o1
> + xor %o1, %gdop_lox10(__mempcpy_niagara7), %o1
> +# else
> + set __mempcpy_niagara7, %o1
> +# endif
> + ba 10f
> + nop
> +1: set HWCAP_SPARC_CRYPTO, %o1
> andcc %o0, %o1, %g0
> be 1f
> andcc %o0, HWCAP_SPARC_N2, %g0
Since you are touching this file, I think it would be better to change the
ifunc selector to a C implementation by using the __ifunc/sparc_libc_ifunc
macros.
> diff --git a/sysdeps/sparc/sparc64/multiarch/memmove.S b/sysdeps/sparc/sparc64/multiarch/memmove.S
> new file mode 100644
> index 0000000..43bdf08
> --- /dev/null
> +++ b/sysdeps/sparc/sparc64/multiarch/memmove.S
> @@ -0,0 +1,72 @@
> +/* Multiple versions of memmove and bcopy
> + All versions must be listed in ifunc-impl-list.c.
> + Copyright (C) 2017 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, write to the Free
> + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
> + 02111-1307 USA. */
> +
> +#include <sysdep.h>
> +
> +#if IS_IN (libc)
> + .text
> +ENTRY(memmove)
> + .type memmove, @gnu_indirect_function
> +# ifdef SHARED
> + SETUP_PIC_REG_LEAF(o3, o5)
> +# endif
> + set HWCAP_SPARC_ADP, %o1
> + andcc %o0, %o1, %g0
> + be 1f
> + nop
> +# ifdef SHARED
> + sethi %gdop_hix22(__memmove_niagara7), %o1
> + xor %o1, %gdop_lox10(__memmove_niagara7), %o1
> +# else
> + set __memmove_niagara7, %o1
> +# endif
> + ba 10f
> + nop
> +1:
> +# ifdef SHARED
> + sethi %gdop_hix22(__memmove_ultra1), %o1
> + xor %o1, %gdop_lox10(__memmove_ultra1), %o1
> +# else
> + set __memmove_ultra1, %o1
> +# endif
> +10:
> +# ifdef SHARED
> + add %o3, %o1, %o1
> +# endif
> + retl
> + mov %o1, %o0
> +END(memmove)
> +
> +libc_hidden_builtin_def (memmove)
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +#undef weak_alias
> +#define weak_alias(x, y)
> +#undef libc_hidden_def
> +#define libc_hidden_def(name)
> +
> +#define memmove __memmove_ultra1
> +#define __memmove __memmove_ultra1
> +
> +#endif
> +
> +#include "../memmove.S"
> +
As before, I think new ifunc resolvers should be avoid to be coded in assembly.
Also there is an extra line on this file.
> diff --git a/sysdeps/sparc/sparc64/multiarch/rtld-memmove.c b/sysdeps/sparc/sparc64/multiarch/rtld-memmove.c
> new file mode 100644
> index 0000000..66fe118
> --- /dev/null
> +++ b/sysdeps/sparc/sparc64/multiarch/rtld-memmove.c
> @@ -0,0 +1 @@
> +#include <../rtld-memmove.c>
Avoid using relative patch, include the default memmove using full paths.