This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH 4/4] S390: Implement mempcpy with help of memcpy. [BZ #19765]
- From: Adhemerval Zanella <adhemerval dot zanella at linaro dot org>
- To: libc-alpha at sourceware dot org
- Date: Tue, 26 Apr 2016 10:32:55 -0300
- Subject: Re: [PATCH 4/4] S390: Implement mempcpy with help of memcpy. [BZ #19765]
- Authentication-results: sourceware.org; auth=none
- References: <1461672469-2107-1-git-send-email-stli at linux dot vnet dot ibm dot com> <1461672469-2107-4-git-send-email-stli at linux dot vnet dot ibm dot com>
On 26/04/2016 09:07, Stefan Liebler wrote:
> There exist optimized memcpy functions on s390, but no optimized mempcpy.
> This patch adds mempcpy entry points in memcpy.S files, which
> use the memcpy implementation. Now mempcpy itself is also an IFUNC function
> as memcpy is and the variants are listed in ifunc-impl-list.c.
>
> The s390 string.h defines _HAVE_STRING_ARCH_mempcpy and __mempcpy/mempcpy
> definitions. If n is constant and small enough, __mempcpy/mempcpy is an inlined
> function, which uses memcpy and returns dest + n. In this constant case,
> GCC emits instructions like mvi or mvc and avoids the function call to memcpy.
> If n is not constant, then __mempcpy is called.
>
AFAIk GLIBC does not have any ports that implements mempcpy and not
memcpy and IMHO the inline version to convert mempcpy to memcpy is
always a gain.
So I would suggest to just avoid adding arch-specific bits and move the
mempcpy inline optimization at string/string.h to string/string2.h as
--
#define mempcpy(dest, src, n) __mempcpy_inline (dest, src, n)
#define __mempcpy(dest, src, n) __mempcpy_inline (dest, src, n)
__extern_always_inline void *
__mempcpy_inline (void *__restrict __dest,
const void *__restrict __src, size_t __n)
{
return (char *) memcpy (__dest, __src, __n) + __n;
}
--
This is convert any mempcpy to mempcy plus length and provide
the symbol only for older binaries.
> ChangeLog:
>
> [BZ #19765]
> * sysdeps/s390/bits/string.h (mempcpy) Redirect to
> __mempcpy_inline_memcpy or __mempcpy_inline_mempcpy.
> (__mempcpy) Likewise.
> (__mempcpy_inline_mempcpy): New inline function.
> (__mempcpy_inline_memcpy): Likewise.
> (_HAVE_STRING_ARCH_mempcpy): New define.
> * sysdeps/s390/mempcpy.S: New File.
> * sysdeps/s390/multiarch/mempcpy.c: Likewise.
> * sysdeps/s390/multiarch/Makefile (sysdep_routines): Add mempcpy.
> * sysdeps/s390/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
> Add mempcpy variants.
> * sysdeps/s390/s390-32/memcpy.S: Add mempcpy entry point.
> (memcpy): Adjust to be usable from mempcpy entry point.
> (__memcpy_mvcle): Likewise.
> * sysdeps/s390/s390-64/memcpy.S: Likewise.
> * sysdeps/s390/s390-32/multiarch/memcpy-s390.S: Add entry points
> ____mempcpy_z196, ____mempcpy_z10 and add __GI_ symbols for mempcpy.
> (__memcpy_z196): Adjust to be usable from mempcpy entry point.
> (__memcpy_z10): Likewise.
> * sysdeps/s390/s390-64/multiarch/memcpy-s390x.S: Likewise.
> ---
> sysdeps/s390/bits/string.h | 36 +++++++++++++++++++
> sysdeps/s390/mempcpy.S | 19 ++++++++++
> sysdeps/s390/multiarch/Makefile | 3 +-
> sysdeps/s390/multiarch/ifunc-impl-list.c | 7 ++++
> sysdeps/s390/multiarch/mempcpy.c | 26 ++++++++++++++
> sysdeps/s390/s390-32/memcpy.S | 50 ++++++++++++++++-----------
> sysdeps/s390/s390-32/multiarch/memcpy-s390.S | 31 +++++++++++++++--
> sysdeps/s390/s390-64/memcpy.S | 47 ++++++++++++++-----------
> sysdeps/s390/s390-64/multiarch/memcpy-s390x.S | 29 ++++++++++++++--
> 9 files changed, 203 insertions(+), 45 deletions(-)
> create mode 100644 sysdeps/s390/mempcpy.S
> create mode 100644 sysdeps/s390/multiarch/mempcpy.c
>
> diff --git a/sysdeps/s390/bits/string.h b/sysdeps/s390/bits/string.h
> index 39e0b7f..eae3c8e 100644
> --- a/sysdeps/s390/bits/string.h
> +++ b/sysdeps/s390/bits/string.h
> @@ -24,6 +24,42 @@
> /* Use the unaligned string inline ABI. */
> #define _STRING_INLINE_unaligned 1
>
> +#if defined __USE_GNU && defined __OPTIMIZE__ \
> + && defined __extern_always_inline && __GNUC_PREREQ (3,2)
> +# if !defined _FORCE_INLINES && !defined _HAVE_STRING_ARCH_mempcpy
> +
> +/* Don't inline mempcpy into memcpy as s390 has an optimized mempcpy. */
> +# define _HAVE_STRING_ARCH_mempcpy 1
> +
> +__extern_always_inline void *
> +__mempcpy_inline_memcpy (void *__restrict __dest,
> + const void *__restrict __src, size_t __n)
> +{
> + return (char *) memcpy (__dest, __src, __n) + __n;
> +}
> +
> +__extern_always_inline void *
> +__mempcpy_inline_mempcpy (void *__restrict __dest,
> + const void *__restrict __src, size_t __n)
> +{
> + return __mempcpy (__dest, __src, __n);
> +}
> +
> +/* If n is constant and small enough, __mempcpy/mempcpy is an inlined function,
> + which uses memcpy and returns dest + n. In this constant case, GCC emits
> + instructions like mvi or mvc and avoids the function call to memcpy.
> + If n is not constant, then __mempcpy is called. */
> +# undef mempcpy
> +# undef __mempcpy
> +# define __mempcpy(dest, src, n) \
> + (__extension__ (__builtin_constant_p (n) && n <= 65536 \
> + ? __mempcpy_inline_memcpy (dest, src, n) \
> + : __mempcpy_inline_mempcpy (dest, src, n)))
> +# define mempcpy(dest, src, n) __mempcpy (dest, src, n)
> +
> +# endif
> +#endif
> +
> /* We only provide optimizations if the user selects them and if
> GNU CC is used. */
> #if !defined __NO_STRING_INLINES && defined __USE_STRING_INLINES \
> diff --git a/sysdeps/s390/mempcpy.S b/sysdeps/s390/mempcpy.S
> new file mode 100644
> index 0000000..c62074b
> --- /dev/null
> +++ b/sysdeps/s390/mempcpy.S
> @@ -0,0 +1,19 @@
> +/* CPU specific mempcpy without multiarch - 32/64 bit S/390 version.
> + Copyright (C) 2016 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +/* mempcpy is implemented in memcpy.S. */
> diff --git a/sysdeps/s390/multiarch/Makefile b/sysdeps/s390/multiarch/Makefile
> index 0805b07..89324ca 100644
> --- a/sysdeps/s390/multiarch/Makefile
> +++ b/sysdeps/s390/multiarch/Makefile
> @@ -18,7 +18,8 @@ sysdep_routines += strlen strlen-vx strlen-c \
> memchr memchr-vx \
> rawmemchr rawmemchr-vx rawmemchr-c \
> memccpy memccpy-vx memccpy-c \
> - memrchr memrchr-vx memrchr-c
> + memrchr memrchr-vx memrchr-c \
> + mempcpy
> endif
>
> ifeq ($(subdir),wcsmbs)
> diff --git a/sysdeps/s390/multiarch/ifunc-impl-list.c b/sysdeps/s390/multiarch/ifunc-impl-list.c
> index 62a4359..89898ec 100644
> --- a/sysdeps/s390/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/s390/multiarch/ifunc-impl-list.c
> @@ -69,6 +69,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> S390_IS_Z10 (stfle_bits), __memcpy_z10)
> IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_default))
>
> + IFUNC_IMPL (i, name, mempcpy,
> + IFUNC_IMPL_ADD (array, i, mempcpy,
> + S390_IS_Z196 (stfle_bits), ____mempcpy_z196)
> + IFUNC_IMPL_ADD (array, i, mempcpy,
> + S390_IS_Z10 (stfle_bits), ____mempcpy_z10)
> + IFUNC_IMPL_ADD (array, i, mempcpy, 1, ____mempcpy_default))
> +
> #endif /* SHARED */
>
> #ifdef HAVE_S390_VX_ASM_SUPPORT
> diff --git a/sysdeps/s390/multiarch/mempcpy.c b/sysdeps/s390/multiarch/mempcpy.c
> new file mode 100644
> index 0000000..34d8329
> --- /dev/null
> +++ b/sysdeps/s390/multiarch/mempcpy.c
> @@ -0,0 +1,26 @@
> +/* Multiple versions of mempcpy.
> + Copyright (C) 2016 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +
> +#if defined SHARED && IS_IN (libc)
> +# include <ifunc-resolve.h>
> +s390_libc_ifunc (__mempcpy)
> +
> +__asm__ (".weak mempcpy\n\t"
> + ".set mempcpy,__mempcpy\n\t");
> +#endif
> diff --git a/sysdeps/s390/s390-32/memcpy.S b/sysdeps/s390/s390-32/memcpy.S
> index 2ac51ab..6be5104 100644
> --- a/sysdeps/s390/s390-32/memcpy.S
> +++ b/sysdeps/s390/s390-32/memcpy.S
> @@ -25,12 +25,23 @@
> %r3 = address of source memory area
> %r4 = number of bytes to copy. */
>
> -#ifdef USE_MULTIARCH
> -ENTRY(__memcpy_default)
> -#else
> -ENTRY(memcpy)
> + .text
> +ENTRY(__mempcpy)
> + .machine "g5"
> + lr %r1,%r2 # Use as dest
> + la %r2,0(%r4,%r2) # Return dest + n
> + j .L_G5_start
> +END(__mempcpy)
> +#ifndef USE_MULTIARCH
> +libc_hidden_def (__mempcpy)
> +weak_alias (__mempcpy, mempcpy)
> +libc_hidden_builtin_def (mempcpy)
> #endif
> +
> +ENTRY(memcpy)
> .machine "g5"
> + lr %r1,%r2 # r1: Use as dest ; r2: Return dest
> +.L_G5_start:
> st %r13,52(%r15)
> .cfi_offset 13, -44
> basr %r13,0
> @@ -41,14 +52,13 @@ ENTRY(memcpy)
> lr %r5,%r4
> srl %r5,8
> ltr %r5,%r5
> - lr %r1,%r2
> jne .L_G5_13
> ex %r4,.L_G5_17-.L_G5_16(%r13)
> .L_G5_4:
> l %r13,52(%r15)
> br %r14
> .L_G5_13:
> - chi %r5,4096 # Switch to mvcle for copies >1MB
> + chi %r5,4096 # Switch to mvcle for copies >1MB
> jh __memcpy_mvcle
> .L_G5_12:
> mvc 0(256,%r1),0(%r3)
> @@ -59,24 +69,24 @@ ENTRY(memcpy)
> j .L_G5_4
> .L_G5_17:
> mvc 0(1,%r1),0(%r3)
> -#ifdef USE_MULTIARCH
> -END(__memcpy_default)
> -#else
> END(memcpy)
> +#ifndef USE_MULTIARCH
> libc_hidden_builtin_def (memcpy)
> #endif
>
> ENTRY(__memcpy_mvcle)
> - # Using as standalone function will result in unexpected
> - # results since the length field is incremented by 1 in order to
> - # compensate the changes already done in the functions above.
> - ahi %r4,1 # length + 1
> - lr %r5,%r4 # source length
> - lr %r4,%r3 # source address
> - lr %r3,%r5 # destination length = source length
> + # Using as standalone function will result in unexpected
> + # results since the length field is incremented by 1 in order to
> + # compensate the changes already done in the functions above.
> + lr %r0,%r2 # backup return dest [ + n ]
> + ahi %r4,1 # length + 1
> + lr %r5,%r4 # source length
> + lr %r4,%r3 # source address
> + lr %r2,%r1 # destination address
> + lr %r3,%r5 # destination length = source length
> .L_MVCLE_1:
> - mvcle %r2,%r4,0 # thats it, MVCLE is your friend
> - jo .L_MVCLE_1
> - lr %r2,%r1 # return destination address
> - br %r14
> + mvcle %r2,%r4,0 # thats it, MVCLE is your friend
> + jo .L_MVCLE_1
> + lr %r2,%r0 # return destination address
> + br %r14
> END(__memcpy_mvcle)
> diff --git a/sysdeps/s390/s390-32/multiarch/memcpy-s390.S b/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
> index 92ffaea..297a894 100644
> --- a/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
> +++ b/sysdeps/s390/s390-32/multiarch/memcpy-s390.S
> @@ -29,14 +29,23 @@
>
> #if defined SHARED && IS_IN (libc)
>
> +ENTRY(____mempcpy_z196)
> + .machine "z196"
> + .machinemode "zarch_nohighgprs"
> + lr %r1,%r2 # Use as dest
> + la %r2,0(%r4,%r2) # Return dest + n
> + j .L_Z196_start
> +END(____mempcpy_z196)
> +
> ENTRY(__memcpy_z196)
> .machine "z196"
> .machinemode "zarch_nohighgprs"
> + lr %r1,%r2 # r1: Use as dest ; r2: Return dest
> +.L_Z196_start:
> llgfr %r4,%r4
> ltgr %r4,%r4
> je .L_Z196_4
> aghi %r4,-1
> - lr %r1,%r2
> srlg %r5,%r4,8
> ltgr %r5,%r5
> jne .L_Z196_5
> @@ -60,13 +69,22 @@ ENTRY(__memcpy_z196)
> mvc 0(1,%r1),0(%r3)
> END(__memcpy_z196)
>
> +ENTRY(____mempcpy_z10)
> + .machine "z10"
> + .machinemode "zarch_nohighgprs"
> + lr %r1,%r2 # Use as dest
> + la %r2,0(%r4,%r2) # Return dest + n
> + j .L_Z10_start
> +END(____mempcpy_z10)
> +
> ENTRY(__memcpy_z10)
> .machine "z10"
> .machinemode "zarch_nohighgprs"
> + lr %r1,%r2 # r1: Use as dest ; r2: Return dest
> +.L_Z10_start:
> llgfr %r4,%r4
> cgije %r4,0,.L_Z10_4
> aghi %r4,-1
> - lr %r1,%r2
> srlg %r5,%r4,8
> cgijlh %r5,0,.L_Z10_13
> .L_Z10_3:
> @@ -88,14 +106,23 @@ ENTRY(__memcpy_z10)
> mvc 0(1,%r1),0(%r3)
> END(__memcpy_z10)
>
> +# define __mempcpy ____mempcpy_default
> #endif /* SHARED && IS_IN (libc) */
>
> +#define memcpy __memcpy_default
> #include "../memcpy.S"
> +#undef memcpy
>
> #if defined SHARED && IS_IN (libc)
> .globl __GI_memcpy
> .set __GI_memcpy,__memcpy_default
> +.globl __GI_mempcpy
> +.set __GI_mempcpy,____mempcpy_default
> +.globl __GI___mempcpy
> +.set __GI___mempcpy,____mempcpy_default
> #else
> .globl memcpy
> .set memcpy,__memcpy_default
> +.weak mempcpy
> +.set mempcpy,__mempcpy
> #endif
> diff --git a/sysdeps/s390/s390-64/memcpy.S b/sysdeps/s390/s390-64/memcpy.S
> index 9d60a14..e3ace0c 100644
> --- a/sysdeps/s390/s390-64/memcpy.S
> +++ b/sysdeps/s390/s390-64/memcpy.S
> @@ -27,19 +27,27 @@
>
>
> .text
> +ENTRY(__mempcpy)
> + .machine "z900"
> + lgr %r1,%r2 # Use as dest
> + la %r2,0(%r4,%r2) # Return dest + n
> + j .L_Z900_start
> +END(__mempcpy)
> +#ifndef USE_MULTIARCH
> +libc_hidden_def (__mempcpy)
> +weak_alias (__mempcpy, mempcpy)
> +libc_hidden_builtin_def (mempcpy)
> +#endif
>
> -#ifdef USE_MULTIARCH
> -ENTRY(__memcpy_default)
> -#else
> ENTRY(memcpy)
> -#endif
> .machine "z900"
> + lgr %r1,%r2 # r1: Use as dest ; r2: Return dest
> +.L_Z900_start:
> ltgr %r4,%r4
> je .L_Z900_4
> aghi %r4,-1
> srlg %r5,%r4,8
> ltgr %r5,%r5
> - lgr %r1,%r2
> jne .L_Z900_13
> .L_Z900_3:
> larl %r5,.L_Z900_15
> @@ -57,25 +65,24 @@ ENTRY(memcpy)
> j .L_Z900_3
> .L_Z900_15:
> mvc 0(1,%r1),0(%r3)
> -
> -#ifdef USE_MULTIARCH
> -END(__memcpy_default)
> -#else
> END(memcpy)
> +#ifndef USE_MULTIARCH
> libc_hidden_builtin_def (memcpy)
> #endif
>
> ENTRY(__memcpy_mvcle)
> - # Using as standalone function will result in unexpected
> - # results since the length field is incremented by 1 in order to
> - # compensate the changes already done in the functions above.
> - aghi %r4,1 # length + 1
> - lgr %r5,%r4 # source length
> - lgr %r4,%r3 # source address
> - lgr %r3,%r5 # destination length = source length
> + # Using as standalone function will result in unexpected
> + # results since the length field is incremented by 1 in order to
> + # compensate the changes already done in the functions above.
> + lgr %r0,%r2 # backup return dest [ + n ]
> + aghi %r4,1 # length + 1
> + lgr %r5,%r4 # source length
> + lgr %r4,%r3 # source address
> + lgr %r2,%r1 # destination address
> + lgr %r3,%r5 # destination length = source length
> .L_MVCLE_1:
> - mvcle %r2,%r4,0 # thats it, MVCLE is your friend
> - jo .L_MVCLE_1
> - lgr %r2,%r1 # return destination address
> - br %r14
> + mvcle %r2,%r4,0 # thats it, MVCLE is your friend
> + jo .L_MVCLE_1
> + lgr %r2,%r0 # return destination address
> + br %r14
> END(__memcpy_mvcle)
> diff --git a/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S b/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
> index 8f54526..0f5a36e 100644
> --- a/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
> +++ b/sysdeps/s390/s390-64/multiarch/memcpy-s390x.S
> @@ -29,12 +29,20 @@
>
> #if defined SHARED && IS_IN (libc)
>
> +ENTRY(____mempcpy_z196)
> + .machine "z196"
> + lgr %r1,%r2 # Use as dest
> + la %r2,0(%r4,%r2) # Return dest + n
> + j .L_Z196_start
> +END(____mempcpy_z196)
> +
> ENTRY(__memcpy_z196)
> .machine "z196"
> + lgr %r1,%r2 # r1: Use as dest ; r2: Return dest
> +.L_Z196_start:
> ltgr %r4,%r4
> je .L_Z196_4
> aghi %r4,-1
> - lgr %r1,%r2
> srlg %r5,%r4,8
> ltgr %r5,%r5
> jne .L_Z196_5
> @@ -58,11 +66,19 @@ ENTRY(__memcpy_z196)
> mvc 0(1,%r1),0(%r3)
> END(__memcpy_z196)
>
> +ENTRY(____mempcpy_z10)
> + .machine "z10"
> + lgr %r1,%r2 # Use as dest
> + la %r2,0(%r4,%r2) # Return dest + n
> + j .L_Z10_start
> +END(____mempcpy_z10)
> +
> ENTRY(__memcpy_z10)
> .machine "z10"
> + lgr %r1,%r2 # r1: Use as dest ; r2: Return dest
> +.L_Z10_start:
> cgije %r4,0,.L_Z10_4
> aghi %r4,-1
> - lgr %r1,%r2
> srlg %r5,%r4,8
> cgijlh %r5,0,.L_Z10_13
> .L_Z10_3:
> @@ -84,14 +100,23 @@ ENTRY(__memcpy_z10)
> mvc 0(1,%r1),0(%r3)
> END(__memcpy_z10)
>
> +# define __mempcpy ____mempcpy_default
> #endif /* SHARED && IS_IN (libc) */
>
> +#define memcpy __memcpy_default
> #include "../memcpy.S"
> +#undef memcpy
>
> #if defined SHARED && IS_IN (libc)
> .globl __GI_memcpy
> .set __GI_memcpy,__memcpy_default
> +.globl __GI_mempcpy
> +.set __GI_mempcpy,____mempcpy_default
> +.globl __GI___mempcpy
> +.set __GI___mempcpy,____mempcpy_default
> #else
> .globl memcpy
> .set memcpy,__memcpy_default
> +.weak mempcpy
> +.set mempcpy,__mempcpy
> #endif
>