[PATCH] aarch64: Optimize string functions with shrn instruction
Danila Kutenin
danilak@google.com
Wed Jul 6 07:42:52 GMT 2022
Friendly ping
On Mon, Jun 27, 2022 at 5:12 PM Danila Kutenin <danilak@google.com> wrote:
> We found that the string functions were using AND+ADDP
> to find the nibble/syndrome mask, but there is a simpler
> alternative: `SHRN dst.8b, src.8h, 4` (shift each 16-bit
> element right by 4 and narrow to one byte), which has the
> same latency as ADDP on all SIMD ARMv8 targets. There are
> similar opportunities in memcmp, but that's for another patch.
>
> We see 10-20% savings for small and medium size cases
> (<= 128 bytes), which are the primary cases in general workloads.
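
For illustration, the difference between the two reductions can be sketched
with NEON intrinsics. This is a standalone sketch, not the glibc code (the
buffer and function names are made up); it only shows that AND+ADDP and SHRN
produce the same 4-bits-per-byte mask, so the first match is ctz(mask)/4:

    /* Build for AArch64, e.g. gcc -O2 sketch.c */
    #include <arm_neon.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Old scheme: keep one nibble per compare byte (mask 0xf00f per 16-bit
       lane), then ADDP merges adjacent bytes; the low 64 bits hold the mask. */
    static uint64_t syndrome_and_addp (uint8x16_t eq)
    {
      uint8x16_t repmask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
      uint8x16_t masked  = vandq_u8 (eq, repmask);
      uint8x16_t paired  = vpaddq_u8 (masked, masked);   /* 128 -> 64 */
      return vgetq_lane_u64 (vreinterpretq_u64_u8 (paired), 0);
    }

    /* New scheme: SHRN shifts every 16-bit lane right by 4 and narrows it to
       8 bits, keeping 4 bits of each compare byte in a single instruction. */
    static uint64_t syndrome_shrn (uint8x16_t eq)
    {
      uint8x8_t nib = vshrn_n_u16 (vreinterpretq_u16_u8 (eq), 4); /* 128 -> 64 */
      return vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);
    }

    int main (void)
    {
      const char buf[] = "aaaaaaaXaaaaaaaa";           /* 'X' at index 7 */
      uint8x16_t data = vld1q_u8 ((const uint8_t *) buf);
      uint8x16_t eq   = vceqq_u8 (data, vdupq_n_u8 ('X'));

      uint64_t s1 = syndrome_and_addp (eq);
      uint64_t s2 = syndrome_shrn (eq);

      /* Both masks carry 4 bits per byte, so the first match is ctz/4. */
      printf ("and+addp: byte %d\n", __builtin_ctzll (s1) >> 2);
      printf ("shrn:     byte %d\n", __builtin_ctzll (s2) >> 2);
      return 0;
    }

Both variants print byte 7 here; the point is only that the SHRN result can be
consumed exactly like the old AND+ADDP syndrome.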
> ---
> sysdeps/aarch64/memchr.S | 25 +++++++++----------------
> sysdeps/aarch64/memrchr.S | 25 +++++++++----------------
> sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
> sysdeps/aarch64/strcpy.S | 32 ++++++++++++--------------------
> sysdeps/aarch64/strlen.S | 25 +++++++++----------------
> sysdeps/aarch64/strnlen.S | 25 +++++++++----------------
> 6 files changed, 59 insertions(+), 102 deletions(-)
>
> diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
> index b060eee97d..2053a977b6 100644
> --- a/sysdeps/aarch64/memchr.S
> +++ b/sysdeps/aarch64/memchr.S
> @@ -41,24 +41,21 @@
> #define synd x5
> #define shift x6
> #define tmp x7
> -#define wtmp w7
>
> #define vrepchr v0
> #define qdata q1
> #define vdata v1
> #define vhas_chr v2
> -#define vrepmask v3
> -#define vend v4
> -#define dend d4
> +#define vend v3
> +#define dend d3
>
> /*
> Core algorithm:
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting leading zeros identifies
> + exactly which byte matched. */
>
> ENTRY (MEMCHR)
> PTR_ARG (0)
> @@ -67,12 +64,9 @@ ENTRY (MEMCHR)
> cbz cntin, L(nomatch)
> ld1 {vdata.16b}, [src]
> dup vrepchr.16b, chrin
> - mov wtmp, 0xf00f
> - dup vrepmask.8h, wtmp
> cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
> lsl shift, srcin, 2
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov synd, dend
> lsr synd, synd, shift
> cbz synd, L(start_loop)
> @@ -111,8 +105,7 @@ L(loop32_2):
> fmov synd, dend
> cbz synd, L(loop32)
> L(end):
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov synd, dend
> add tmp, srcin, cntin
> sub cntrem, tmp, src
> diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
> index e0efbad91c..5179320720 100644
> --- a/sysdeps/aarch64/memrchr.S
> +++ b/sysdeps/aarch64/memrchr.S
> @@ -37,7 +37,6 @@
> #define synd x5
> #define shift x6
> #define tmp x7
> -#define wtmp w7
> #define end x8
> #define endm1 x9
>
> @@ -45,18 +44,16 @@
> #define qdata q1
> #define vdata v1
> #define vhas_chr v2
> -#define vrepmask v3
> -#define vend v4
> -#define dend d4
> +#define vend v3
> +#define dend d3
>
> /*
> Core algorithm:
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting leading zeros identifies
> + exactly which byte matched. */
>
> ENTRY (__memrchr)
> PTR_ARG (0)
> @@ -67,12 +64,9 @@ ENTRY (__memrchr)
> cbz cntin, L(nomatch)
> ld1 {vdata.16b}, [src]
> dup vrepchr.16b, chrin
> - mov wtmp, 0xf00f
> - dup vrepmask.8h, wtmp
> cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
> neg shift, end, lsl 2
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov synd, dend
> lsl synd, synd, shift
> cbz synd, L(start_loop)
> @@ -109,8 +103,7 @@ L(loop32_2):
> fmov synd, dend
> cbz synd, L(loop32)
> L(end):
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov synd, dend
>
> add tmp, src, 15
> diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
> index 442726fd49..ee154ab74b 100644
> --- a/sysdeps/aarch64/strchrnul.S
> +++ b/sysdeps/aarch64/strchrnul.S
> @@ -33,38 +33,32 @@
> #define src x2
> #define tmp1 x1
> #define tmp2 x3
> -#define tmp2w w3
>
> #define vrepchr v0
> #define vdata v1
> #define qdata q1
> #define vhas_nul v2
> #define vhas_chr v3
> -#define vrepmask v4
> -#define vend v5
> -#define dend d5
> +#define vend v4
> +#define dend d4
>
> -/* Core algorithm:
> -
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> +/*
> + Core algorithm:
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting leading zeros identifies
> + exactly which byte matched. */
>
> ENTRY (__strchrnul)
> PTR_ARG (0)
> bic src, srcin, 15
> dup vrepchr.16b, chrin
> ld1 {vdata.16b}, [src]
> - mov tmp2w, 0xf00f
> - dup vrepmask.8h, tmp2w
> cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
> cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
> lsl tmp2, srcin, 2
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov tmp1, dend
> lsr tmp1, tmp1, tmp2 /* Mask padding bits. */
> cbz tmp1, L(loop)
> @@ -83,8 +77,7 @@ L(loop):
> fmov tmp1, dend
> cbz tmp1, L(loop)
>
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov tmp1, dend
> #ifndef __AARCH64EB__
> rbit tmp1, tmp1
> diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
> index da53170ece..78d27b4aa6 100644
> --- a/sysdeps/aarch64/strcpy.S
> +++ b/sysdeps/aarch64/strcpy.S
> @@ -40,7 +40,6 @@
> #define len x4
> #define synd x4
> #define tmp x5
> -#define wtmp w5
> #define shift x5
> #define data1 x6
> #define dataw1 w6
> @@ -50,9 +49,8 @@
> #define dataq q0
> #define vdata v0
> #define vhas_nul v1
> -#define vrepmask v2
> -#define vend v3
> -#define dend d3
> +#define vend v2
> +#define dend d2
> #define dataq2 q1
>
> #ifdef BUILD_STPCPY
> @@ -63,34 +61,29 @@
> # define IFSTPCPY(X,...)
> #endif
>
> -/* Core algorithm:
> -
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> +/*
> + Core algorithm:
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting leading zeros identifies
> + exactly which byte matched. */
>
> ENTRY (STRCPY)
> PTR_ARG (0)
> PTR_ARG (1)
> bic src, srcin, 15
> - mov wtmp, 0xf00f
> ld1 {vdata.16b}, [src]
> - dup vrepmask.8h, wtmp
> cmeq vhas_nul.16b, vdata.16b, 0
> lsl shift, srcin, 2
> - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> - addp vend.16b, vhas_nul.16b, vhas_nul.16b
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> fmov synd, dend
> lsr synd, synd, shift
> cbnz synd, L(tail)
>
> ldr dataq, [src, 16]!
> cmeq vhas_nul.16b, vdata.16b, 0
> - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> - addp vend.16b, vhas_nul.16b, vhas_nul.16b
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> fmov synd, dend
> cbz synd, L(start_loop)
>
> @@ -162,8 +155,7 @@ L(loop):
> fmov synd, dend
> cbz synd, L(loop)
>
> - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> fmov synd, dend
> #ifndef __AARCH64EB__
> rbit synd, synd
> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> index a2310871c2..3a5d088407 100644
> --- a/sysdeps/aarch64/strlen.S
> +++ b/sysdeps/aarch64/strlen.S
> @@ -34,35 +34,29 @@
> #define src x1
> #define synd x2
> #define tmp x3
> -#define wtmp w3
> #define shift x4
>
> #define data q0
> #define vdata v0
> #define vhas_nul v1
> -#define vrepmask v2
> -#define vend v3
> -#define dend d3
> +#define vend v2
> +#define dend d2
>
> /* Core algorithm:
>
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting trailing zeros identifies
> + exactly which byte matched. */
>
> ENTRY (STRLEN)
> PTR_ARG (0)
> bic src, srcin, 15
> - mov wtmp, 0xf00f
> ld1 {vdata.16b}, [src]
> - dup vrepmask.8h, wtmp
> cmeq vhas_nul.16b, vdata.16b, 0
> lsl shift, srcin, 2
> - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> fmov synd, dend
> lsr synd, synd, shift
> cbz synd, L(loop)
> @@ -80,8 +74,7 @@ L(loop):
> fmov synd, dend
> cbz synd, L(loop)
>
> - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> sub result, src, srcin
> fmov synd, dend
> #ifndef __AARCH64EB__
> diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
> index 0dbecb0ce9..282bddc9aa 100644
> --- a/sysdeps/aarch64/strnlen.S
> +++ b/sysdeps/aarch64/strnlen.S
> @@ -33,39 +33,33 @@
> #define src x2
> #define synd x3
> #define shift x4
> -#define wtmp w4
> #define tmp x4
> #define cntrem x5
>
> #define qdata q0
> #define vdata v0
> #define vhas_chr v1
> -#define vrepmask v2
> -#define vend v3
> -#define dend d3
> +#define vend v2
> +#define dend d2
>
> /*
> Core algorithm:
>
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting trailing zeros identifies
> + exactly which byte matched. */
>
> ENTRY (__strnlen)
> PTR_ARG (0)
> SIZE_ARG (1)
> bic src, srcin, 15
> - mov wtmp, 0xf00f
> cbz cntin, L(nomatch)
> ld1 {vdata.16b}, [src], 16
> - dup vrepmask.8h, wtmp
> cmeq vhas_chr.16b, vdata.16b, 0
> lsl shift, srcin, 2
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov synd, dend
> lsr synd, synd, shift
> cbz synd, L(start_loop)
> @@ -103,8 +97,7 @@ L(loop32_2):
> cbz synd, L(loop32)
>
> L(end):
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> sub src, src, 16
> mov synd, vend.d[0]
> sub result, src, srcin
> --
> 2.37.0.rc0.161.g10f37bed90-goog
>
>
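
The same nibble-mask trick also leaves the patch's first-iteration handling of
unaligned starts unchanged: the load is done from the 16-byte aligned address
and the mask is shifted right by 4 * (srcin & 15) to discard the nibbles of
bytes before the real start (the `lsl shift, srcin, 2` / `lsr synd, synd,
shift` pair in the diff). A minimal sketch of that step, again not the glibc
code and with made-up names:

    #include <arm_neon.h>
    #include <stdint.h>
    #include <stdalign.h>
    #include <stdio.h>

    /* Look for byte C in the aligned 16-byte chunk that covers SRCIN. */
    static const char *find_in_first_chunk (const char *srcin, uint8_t c)
    {
      uintptr_t addr = (uintptr_t) srcin;
      const uint8_t *aligned = (const uint8_t *) (addr & ~(uintptr_t) 15);

      uint8x16_t data = vld1q_u8 (aligned);
      uint8x16_t eq   = vceqq_u8 (data, vdupq_n_u8 (c));
      uint8x8_t  nib  = vshrn_n_u16 (vreinterpretq_u16_u8 (eq), 4);  /* 128->64 */
      uint64_t   synd = vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);

      synd >>= (addr & 15) * 4;    /* drop nibbles of bytes before srcin */
      if (synd == 0)
        return NULL;               /* would fall through to the main loop */
      return srcin + (__builtin_ctzll (synd) >> 2);
    }

    int main (void)
    {
      alignas (16) char buf[64] = { 0 };
      buf[37] = 'Z';
      const char *hit = find_in_first_chunk (buf + 33, 'Z');
      printf ("found at offset %d from the start pointer\n",
              hit ? (int) (hit - (buf + 33)) : -1);
      return 0;
    }

With a 16-byte aligned buffer this prints offset 4, i.e. the bytes before the
unaligned start pointer are ignored even though they were loaded.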