This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [PATCH] [aarch64] Add a falkor variant for strlen


On Mon, Aug 13, 2018 at 4:44 AM Siddhesh Poyarekar
<siddhesh@sourceware.org> wrote:
>
> This variant of strlen uses vector loads and operations to reduce the
> size of the code and also eliminate the non-ASCII fallback.  This
> works very well for falkor because of its two vector units and
> efficient vector ops.  In the best case it reduces latency of cases in
> bench-strlen by 48%, with gains throughout the benchmark.
> strlen-walk also sees uniform gains in the 5%-15% range.
>
> Overall the routine appears to work better than the stock one for falkor
> regardless of the benchmark, length of string or cache state.
>
> The same cannot be said of a53 and a72, though.  a53 performance was
> greatly reduced, and a72 was a mixed bag: slightly negative overall,
> though I reckon it may come out faster in some situations.
>
>         * sysdeps/aarch64/strlen.S (__strlen): Rename to STRLEN.
>         [!STRLEN](STRLEN): Set to __strlen.
>         * sysdeps/aarch64/multiarch/strlen.c: New file.
>         * sysdeps/aarch64/multiarch/strlen_generic.S: Likewise.
>         * sysdeps/aarch64/multiarch/strlen_falkor.S: Likewise.
>         * sysdeps/aarch64/multiarch/ifunc-impl-list.c
>         (__libc_ifunc_impl_list): Add strlen.
>         * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): Add
>         strlen_generic and strlen_falkor.
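
The core trick is to load 16 bytes at a time and use a single
horizontal unsigned minimum (uminv) to test whether any lane is zero,
so the more expensive NUL-locating work only runs when a NUL is known
to be present.  A rough little-endian C intrinsics sketch of that idea
(illustrative only, not code from the patch; it omits the page-cross
and alignment handling that makes the real routine safe on arbitrary
strings):

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

static size_t
strlen_sketch (const char *s)
{
  const uint8_t *p = (const uint8_t *) s;
  for (;;)
    {
      uint8x16_t v = vld1q_u8 (p);     /* Load 16 bytes.  */
      if (vminvq_u8 (v) == 0)          /* Minimum of 0 => NUL present.  */
        {
          /* 0xff in every lane that held a NUL, 0x00 elsewhere.  */
          uint8x16_t eq = vceqzq_u8 (v);
          uint64_t lo = vgetq_lane_u64 (vreinterpretq_u64_u8 (eq), 0);
          uint64_t hi = vgetq_lane_u64 (vreinterpretq_u64_u8 (eq), 1);
          size_t byte = lo ? (size_t) __builtin_ctzll (lo) / 8
                           : 8 + (size_t) __builtin_ctzll (hi) / 8;
          return (size_t) (p - (const uint8_t *) s) + byte;
        }
      p += 16;                         /* No NUL yet; keep scanning.  */
    }
}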

Maybe change the name of strlen_falkor to strlen_simd so it can be
used by other processors without the falkor name being confusing.

Thanks,
Andrew

>
> CC: szabolcs.nagy@arm.com
> ---
>  sysdeps/aarch64/multiarch/Makefile          |   3 +-
>  sysdeps/aarch64/multiarch/ifunc-impl-list.c |   4 +
>  sysdeps/aarch64/multiarch/strlen.c          |  39 +++++
>  sysdeps/aarch64/multiarch/strlen_falkor.S   | 167 ++++++++++++++++++++
>  sysdeps/aarch64/multiarch/strlen_generic.S  |  42 +++++
>  sysdeps/aarch64/strlen.S                    |  10 +-
>  6 files changed, 261 insertions(+), 4 deletions(-)
>  create mode 100644 sysdeps/aarch64/multiarch/strlen.c
>  create mode 100644 sysdeps/aarch64/multiarch/strlen_falkor.S
>  create mode 100644 sysdeps/aarch64/multiarch/strlen_generic.S
>
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index 57ffdf7238..746d3aedb1 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -1,4 +1,5 @@
>  ifeq ($(subdir),string)
>  sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
> -                  memcpy_falkor memmove_falkor memset_generic memset_falkor
> +                  memcpy_falkor memmove_falkor memset_generic memset_falkor \
> +                  strlen_generic strlen_falkor
>  endif
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index e55be80103..fbe3a38a76 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -53,5 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>               IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
>               IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
>
> +  IFUNC_IMPL (i, name, strlen,
> +             IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_falkor)
> +             IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic))
> +
>    return i;
>  }
> diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
> new file mode 100644
> index 0000000000..4de3437662
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/strlen.c
> @@ -0,0 +1,39 @@
> +/* Multiple versions of strlen. AARCH64 version.
> +   Copyright (C) 2018 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +
> +#if IS_IN (libc)
> +/* Redefine strlen so that the compiler won't complain about the type
> +   mismatch with the IFUNC selector in strong_alias, below.  */
> +# undef strlen
> +# define strlen __redirect_strlen
> +# include <string.h>
> +# include <init-arch.h>
> +
> +extern __typeof (__redirect_strlen) __strlen;
> +
> +extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
> +extern __typeof (__redirect_strlen) __strlen_falkor attribute_hidden;
> +
> +libc_ifunc (__strlen,
> +           (IS_FALKOR (midr) ? __strlen_falkor : __strlen_generic));
> +
> +# undef strlen
> +strong_alias (__strlen, strlen);
> +#endif
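
The libc_ifunc macro wraps GCC's indirect-function (IFUNC) support:
the selector expression runs once, when the dynamic linker binds the
symbol, and every later call jumps straight to the implementation it
returned.  A minimal standalone sketch of the mechanism using the raw
attribute (hypothetical names, not the glibc macro):

#include <stddef.h>
#include <string.h>

/* Two stand-in implementations; real variants would differ.  */
static size_t my_strlen_generic (const char *s) { return strlen (s); }
static size_t my_strlen_vector (const char *s) { return strlen (s); }

/* The resolver runs once at symbol-binding time, not on every call.
   A real resolver would probe the CPU (e.g. MIDR) instead of using a
   constant.  */
static size_t (*my_strlen_resolver (void)) (const char *)
{
  int cpu_is_falkor = 0;
  return cpu_is_falkor ? my_strlen_vector : my_strlen_generic;
}

size_t my_strlen (const char *s)
     __attribute__ ((ifunc ("my_strlen_resolver")));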
> diff --git a/sysdeps/aarch64/multiarch/strlen_falkor.S b/sysdeps/aarch64/multiarch/strlen_falkor.S
> new file mode 100644
> index 0000000000..fed4dcd46f
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/strlen_falkor.S
> @@ -0,0 +1,167 @@
> +/* Copyright (C) 2018 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +/* Assumptions:
> +
> +   ARMv8-a, AArch64, Falkor, unaligned accesses, min page size 4k.  */
> +
> +/* To test the page crossing code path more thoroughly, compile with
> +   -DTEST_PAGE_CROSS - this will force all calls through the slower
> +   entry path.  This option is not intended for production use.  */
> +
> +/* Arguments and results.  */
> +#define srcin          x0
> +#define len            x0
> +
> +/* Locals and temporaries.  */
> +#define src            x1
> +#define data1          x2
> +#define data2          x3
> +#define has_nul1       x4
> +#define has_nul2       x5
> +#define tmp1           x4
> +#define tmp2           x5
> +#define tmp3           x6
> +#define tmp4           x7
> +#define zeroones       x8
> +#define dataq          q2
> +#define datav          v2
> +#define datab2         b3
> +#define dataq2         q3
> +#define datav2         v3
> +
> +#ifdef TEST_PAGE_CROSS
> +# define MIN_PAGE_SIZE 16
> +#else
> +# define MIN_PAGE_SIZE 4096
> +#endif
> +
> +       /* Since strings are short on average, we check the first 16 bytes
> +          of the string for a NUL character.  In order to do an unaligned load
> +          safely we have to do a page cross check first.  If there is a NUL
> +          byte we calculate the length from the 2 8-byte words using
> +          conditional select to reduce branch mispredictions (it is unlikely
> +          strlen_falkor will be repeatedly called on strings with the same
> +          length).
> +
> +          If the string is longer than 16 bytes, we align src so we don't
> +          need further page cross checks, and process 16 bytes per iteration.
> +
> +          If the page cross check fails, we read 16 bytes from an aligned
> +          address, remove any characters before the string, and continue
> +          in the main loop using aligned loads.  Since strings crossing a
> +          page in the first 16 bytes are rare (probability of
> +          16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
> +
> +          AArch64 systems have a minimum page size of 4k.  We don't bother
> +          checking for larger page sizes - the cost of setting up the correct
> +          page size is just not worth the extra gain from a small reduction in
> +          the cases taking the slow path.  Note that we only care about
> +          whether the first fetch, which may be misaligned, crosses a page
> +          boundary.  */
> +
> +ENTRY_ALIGN (__strlen_falkor, 6)
> +       DELOUSE (0)
> +       DELOUSE (1)
> +       and     tmp1, srcin, MIN_PAGE_SIZE - 1
> +       cmp     tmp1, MIN_PAGE_SIZE - 16
> +       b.gt    L(page_cross)
> +       ldr     dataq, [srcin]
> +#ifdef __AARCH64EB__
> +       rev64   datav.16b, datav.16b
> +#endif
> +
> +       /* Get the minimum value and keep going if it is not zero.  */
> +       uminv   datab2, datav.16b
> +       mov     tmp1, datav2.d[0]
> +       cbnz    tmp1, L(main_loop_entry)
> +
> +       cmeq    datav.16b, datav.16b, #0
> +       mov     data1, datav.d[0]
> +       mov     data2, datav.d[1]
> +       cmp     data1, 0
> +       csel    data1, data1, data2, ne
> +       mov     len, 8
> +       rev     data1, data1
> +       clz     tmp1, data1
> +       csel    len, xzr, len, ne
> +       add     len, len, tmp1, lsr 3
> +       ret
> +
> +L(main_loop_entry):
> +       bic     src, srcin, 15
> +
> +L(main_loop):
> +       ldr     dataq, [src, 16]!
> +L(page_cross_entry):
> +       /* Get the minimum value and keep going if it is not zero.  */
> +       uminv   datab2, datav.16b
> +       mov     tmp1, datav2.d[0]
> +       cbnz    tmp1, L(main_loop)
> +
> +L(tail):
> +#ifdef __AARCH64EB__
> +       rev64   datav.16b, datav.16b
> +#endif
> +       /* Set the NUL byte as 0xff and the rest as 0x00, move the data
> +          into a pair of scalars and then compute the length from the
> +          earliest NUL byte.  */
> +       cmeq    datav.16b, datav.16b, #0
> +       mov     data1, datav.d[0]
> +       mov     data2, datav.d[1]
> +       cmp     data1, 0
> +       csel    data1, data1, data2, ne
> +       sub     len, src, srcin
> +       rev     data1, data1
> +       add     tmp2, len, 8
> +       clz     tmp1, data1
> +       csel    len, len, tmp2, ne
> +       add     len, len, tmp1, lsr 3
> +       ret
> +
> +       /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
> +          srcin to 0xff, so we ignore any NUL bytes before the string.
> +          Then continue in the aligned loop.  */
> +L(page_cross):
> +       mov     tmp3, 63
> +       bic     src, srcin, 15
> +       and     tmp1, srcin, 7
> +       ands    tmp2, srcin, 8
> +       ldr     dataq, [src]
> +       lsl     tmp1, tmp1, 3
> +       csel    tmp2, tmp2, tmp1, eq
> +       csel    tmp1, tmp1, tmp3, eq
> +       mov     tmp4, -1
> +#ifdef __AARCH64EB__
> +       /* Big-endian.  Early bytes are at MSB.  */
> +       lsr     tmp1, tmp4, tmp1
> +       lsr     tmp2, tmp4, tmp2
> +#else
> +       /* Little-endian.  Early bytes are at LSB.  */
> +       lsl     tmp1, tmp4, tmp1
> +       lsl     tmp2, tmp4, tmp2
> +#endif
> +       mov     datav2.d[0], tmp1
> +       mov     datav2.d[1], tmp2
> +       orn     datav.16b, datav.16b, datav2.16b
> +       b       L(page_cross_entry)
> +END (__strlen_falkor)
> +weak_alias (__strlen_falkor, strlen_falkor)
> +libc_hidden_builtin_def (strlen_falkor)
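
The entry sequence above (and tmp1, srcin, MIN_PAGE_SIZE - 1; cmp
tmp1, MIN_PAGE_SIZE - 16; b.gt L(page_cross)) is the page-cross guard,
and the L(page_cross) path then uses orn to force the bytes before
srcin to 0xff so they can never match as NUL.  The guard restated in C
(an assumed-equivalent sketch, not code from the patch):

#include <stdint.h>

/* An unaligned 16-byte load from addr stays inside one page exactly
   when addr's offset within a MIN_PAGE_SIZE page is at most
   MIN_PAGE_SIZE - 16; otherwise take the slow aligned path.  */
static inline int
load16_may_cross_page (uintptr_t addr, uintptr_t page_size)
{
  return (addr & (page_size - 1)) > page_size - 16;
}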
> diff --git a/sysdeps/aarch64/multiarch/strlen_generic.S b/sysdeps/aarch64/multiarch/strlen_generic.S
> new file mode 100644
> index 0000000000..a74b0877dc
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/strlen_generic.S
> @@ -0,0 +1,42 @@
> +/* A generic optimized strlen implementation for AARCH64.
> +   Copyright (C) 2018 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* The actual strlen code is in ../strlen.S.  If we are building libc
> +   this file defines __strlen_generic.  Otherwise the include of
> +   ../strlen.S will define the normal __strlen entry point.  */
> +
> +#include <sysdep.h>
> +
> +#if IS_IN (libc)
> +
> +# define STRLEN __strlen_generic
> +
> +/* Do not hide the generic version of strlen; we use it internally.  */
> +# undef libc_hidden_builtin_def
> +# define libc_hidden_builtin_def(name)
> +
> +# ifdef SHARED
> +/* It doesn't make sense to send libc-internal strlen calls through a PLT. */
> +       .globl __GI_strlen; __GI_strlen = __strlen_generic
> +# endif
> +
> +#endif
> +
> +#include "../strlen.S"
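
Defining STRLEN before including ../strlen.S is the usual glibc
pattern for assembling one source file under more than one symbol
name.  The same pattern in miniature, with hypothetical C files:

/* base.c -- provides the function under a default name unless the
   includer overrides FUNC first.  */
#ifndef FUNC
# define FUNC impl_default
#endif
int FUNC (int x) { return x + 1; }

/* variant.c -- builds the identical code as impl_generic.  */
#define FUNC impl_generic
#include "base.c"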
> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> index eb773ef532..521ebc3b75 100644
> --- a/sysdeps/aarch64/strlen.S
> +++ b/sysdeps/aarch64/strlen.S
> @@ -23,6 +23,10 @@
>   * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
>   */
>
> +#ifndef STRLEN
> +# define STRLEN __strlen
> +#endif
> +
>  /* To test the page crossing code path more thoroughly, compile with
>     -DTEST_PAGE_CROSS - this will force all calls through the slower
>     entry path.  This option is not intended for production use.  */
> @@ -84,7 +88,7 @@
>            whether the first fetch, which may be misaligned, crosses a page
>            boundary.  */
>
> -ENTRY_ALIGN (__strlen, 6)
> +ENTRY_ALIGN (STRLEN, 6)
>         DELOUSE (0)
>         DELOUSE (1)
>         and     tmp1, srcin, MIN_PAGE_SIZE - 1
> @@ -215,6 +219,6 @@ L(page_cross):
>         csel    data1, data1, tmp4, eq
>         csel    data2, data2, tmp2, eq
>         b       L(page_cross_entry)
> -END (__strlen)
> -weak_alias (__strlen, strlen)
> +END (STRLEN)
> +weak_alias (STRLEN, strlen)
>  libc_hidden_builtin_def (strlen)
> --
> 2.17.1
>

