[PATCH] aarch64: Optimize string functions with shrn instruction
Danila Kutenin
danilak@google.com
Wed Jul 6 07:42:52 GMT 2022
Friendly ping
On Mon, Jun 27, 2022 at 5:12 PM Danila Kutenin <danilak@google.com> wrote:
> We found that the string functions were using AND+ADDP
> to find the nibble/syndrome mask, but there is a simpler
> alternative: `SHRN dst.8b, src.8h, 4` (shift each 16-bit
> element right by 4 and narrow to one byte), which has the
> same latency as ADDP on all SIMD ARMv8 targets. There are
> similar opportunities in memcmp, but that's for another patch.
>
> We see 10-20% savings for small and medium size cases
> (<= 128 bytes), which are the primary cases in general workloads.
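
For illustration, the difference between the two reductions can be sketched
with NEON intrinsics. This is a standalone sketch, not the glibc code (the
buffer and function names are made up); it only shows that AND+ADDP and SHRN
produce the same 4-bits-per-byte mask, so the first match is ctz(mask)/4:

    /* Build for AArch64, e.g. gcc -O2 sketch.c */
    #include <arm_neon.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Old scheme: keep one nibble per compare byte (mask 0xf00f per 16-bit
       lane), then ADDP merges adjacent bytes; the low 64 bits hold the mask. */
    static uint64_t syndrome_and_addp (uint8x16_t eq)
    {
      uint8x16_t repmask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
      uint8x16_t masked  = vandq_u8 (eq, repmask);
      uint8x16_t paired  = vpaddq_u8 (masked, masked);   /* 128 -> 64 */
      return vgetq_lane_u64 (vreinterpretq_u64_u8 (paired), 0);
    }

    /* New scheme: SHRN shifts every 16-bit lane right by 4 and narrows it to
       8 bits, keeping 4 bits of each compare byte in a single instruction. */
    static uint64_t syndrome_shrn (uint8x16_t eq)
    {
      uint8x8_t nib = vshrn_n_u16 (vreinterpretq_u16_u8 (eq), 4); /* 128 -> 64 */
      return vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);
    }

    int main (void)
    {
      const char buf[] = "aaaaaaaXaaaaaaaa";           /* 'X' at index 7 */
      uint8x16_t data = vld1q_u8 ((const uint8_t *) buf);
      uint8x16_t eq   = vceqq_u8 (data, vdupq_n_u8 ('X'));

      uint64_t s1 = syndrome_and_addp (eq);
      uint64_t s2 = syndrome_shrn (eq);

      /* Both masks carry 4 bits per byte, so the first match is ctz/4. */
      printf ("and+addp: byte %d\n", __builtin_ctzll (s1) >> 2);
      printf ("shrn:     byte %d\n", __builtin_ctzll (s2) >> 2);
      return 0;
    }

Both variants print byte 7 here; the point is only that the SHRN result can be
consumed exactly like the old AND+ADDP syndrome.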
> ---
> sysdeps/aarch64/memchr.S | 25 +++++++++----------------
> sysdeps/aarch64/memrchr.S | 25 +++++++++----------------
> sysdeps/aarch64/strchrnul.S | 29 +++++++++++------------------
> sysdeps/aarch64/strcpy.S | 32 ++++++++++++--------------------
> sysdeps/aarch64/strlen.S | 25 +++++++++----------------
> sysdeps/aarch64/strnlen.S | 25 +++++++++----------------
> 6 files changed, 59 insertions(+), 102 deletions(-)
>
> diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
> index b060eee97d..2053a977b6 100644
> --- a/sysdeps/aarch64/memchr.S
> +++ b/sysdeps/aarch64/memchr.S
> @@ -41,24 +41,21 @@
> #define synd x5
> #define shift x6
> #define tmp x7
> -#define wtmp w7
>
> #define vrepchr v0
> #define qdata q1
> #define vdata v1
> #define vhas_chr v2
> -#define vrepmask v3
> -#define vend v4
> -#define dend d4
> +#define vend v3
> +#define dend d3
>
> /*
> Core algorithm:
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting leading zeros identifies
> + exactly which byte matched. */
>
> ENTRY (MEMCHR)
> PTR_ARG (0)
> @@ -67,12 +64,9 @@ ENTRY (MEMCHR)
> cbz cntin, L(nomatch)
> ld1 {vdata.16b}, [src]
> dup vrepchr.16b, chrin
> - mov wtmp, 0xf00f
> - dup vrepmask.8h, wtmp
> cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
> lsl shift, srcin, 2
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov synd, dend
> lsr synd, synd, shift
> cbz synd, L(start_loop)
> @@ -111,8 +105,7 @@ L(loop32_2):
> fmov synd, dend
> cbz synd, L(loop32)
> L(end):
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov synd, dend
> add tmp, srcin, cntin
> sub cntrem, tmp, src
> diff --git a/sysdeps/aarch64/memrchr.S b/sysdeps/aarch64/memrchr.S
> index e0efbad91c..5179320720 100644
> --- a/sysdeps/aarch64/memrchr.S
> +++ b/sysdeps/aarch64/memrchr.S
> @@ -37,7 +37,6 @@
> #define synd x5
> #define shift x6
> #define tmp x7
> -#define wtmp w7
> #define end x8
> #define endm1 x9
>
> @@ -45,18 +44,16 @@
> #define qdata q1
> #define vdata v1
> #define vhas_chr v2
> -#define vrepmask v3
> -#define vend v4
> -#define dend d4
> +#define vend v3
> +#define dend d3
>
> /*
> Core algorithm:
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting leading zeros identifies
> + exactly which byte matched. */
>
> ENTRY (__memrchr)
> PTR_ARG (0)
> @@ -67,12 +64,9 @@ ENTRY (__memrchr)
> cbz cntin, L(nomatch)
> ld1 {vdata.16b}, [src]
> dup vrepchr.16b, chrin
> - mov wtmp, 0xf00f
> - dup vrepmask.8h, wtmp
> cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
> neg shift, end, lsl 2
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov synd, dend
> lsl synd, synd, shift
> cbz synd, L(start_loop)
> @@ -109,8 +103,7 @@ L(loop32_2):
> fmov synd, dend
> cbz synd, L(loop32)
> L(end):
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov synd, dend
>
> add tmp, src, 15
> diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
> index 442726fd49..ee154ab74b 100644
> --- a/sysdeps/aarch64/strchrnul.S
> +++ b/sysdeps/aarch64/strchrnul.S
> @@ -33,38 +33,32 @@
> #define src x2
> #define tmp1 x1
> #define tmp2 x3
> -#define tmp2w w3
>
> #define vrepchr v0
> #define vdata v1
> #define qdata q1
> #define vhas_nul v2
> #define vhas_chr v3
> -#define vrepmask v4
> -#define vend v5
> -#define dend d5
> +#define vend v4
> +#define dend d4
>
> -/* Core algorithm:
> -
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> +/*
> + Core algorithm:
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting leading zeros identifies
> + exactly which byte matched. */
>
> ENTRY (__strchrnul)
> PTR_ARG (0)
> bic src, srcin, 15
> dup vrepchr.16b, chrin
> ld1 {vdata.16b}, [src]
> - mov tmp2w, 0xf00f
> - dup vrepmask.8h, tmp2w
> cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
> cmhs vhas_chr.16b, vhas_chr.16b, vdata.16b
> lsl tmp2, srcin, 2
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov tmp1, dend
> lsr tmp1, tmp1, tmp2 /* Mask padding bits. */
> cbz tmp1, L(loop)
> @@ -83,8 +77,7 @@ L(loop):
> fmov tmp1, dend
> cbz tmp1, L(loop)
>
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov tmp1, dend
> #ifndef __AARCH64EB__
> rbit tmp1, tmp1
> diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S
> index da53170ece..78d27b4aa6 100644
> --- a/sysdeps/aarch64/strcpy.S
> +++ b/sysdeps/aarch64/strcpy.S
> @@ -40,7 +40,6 @@
> #define len x4
> #define synd x4
> #define tmp x5
> -#define wtmp w5
> #define shift x5
> #define data1 x6
> #define dataw1 w6
> @@ -50,9 +49,8 @@
> #define dataq q0
> #define vdata v0
> #define vhas_nul v1
> -#define vrepmask v2
> -#define vend v3
> -#define dend d3
> +#define vend v2
> +#define dend d2
> #define dataq2 q1
>
> #ifdef BUILD_STPCPY
> @@ -63,34 +61,29 @@
> # define IFSTPCPY(X,...)
> #endif
>
> -/* Core algorithm:
> -
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> +/*
> + Core algorithm:
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting leading zeros identifies
> + exactly which byte matched. */
>
> ENTRY (STRCPY)
> PTR_ARG (0)
> PTR_ARG (1)
> bic src, srcin, 15
> - mov wtmp, 0xf00f
> ld1 {vdata.16b}, [src]
> - dup vrepmask.8h, wtmp
> cmeq vhas_nul.16b, vdata.16b, 0
> lsl shift, srcin, 2
> - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> - addp vend.16b, vhas_nul.16b, vhas_nul.16b
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> fmov synd, dend
> lsr synd, synd, shift
> cbnz synd, L(tail)
>
> ldr dataq, [src, 16]!
> cmeq vhas_nul.16b, vdata.16b, 0
> - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> - addp vend.16b, vhas_nul.16b, vhas_nul.16b
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> fmov synd, dend
> cbz synd, L(start_loop)
>
> @@ -162,8 +155,7 @@ L(loop):
> fmov synd, dend
> cbz synd, L(loop)
>
> - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> fmov synd, dend
> #ifndef __AARCH64EB__
> rbit synd, synd
> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> index a2310871c2..3a5d088407 100644
> --- a/sysdeps/aarch64/strlen.S
> +++ b/sysdeps/aarch64/strlen.S
> @@ -34,35 +34,29 @@
> #define src x1
> #define synd x2
> #define tmp x3
> -#define wtmp w3
> #define shift x4
>
> #define data q0
> #define vdata v0
> #define vhas_nul v1
> -#define vrepmask v2
> -#define vend v3
> -#define dend d3
> +#define vend v2
> +#define dend d2
>
> /* Core algorithm:
>
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting trailing zeros identifies
> + exactly which byte matched. */
>
> ENTRY (STRLEN)
> PTR_ARG (0)
> bic src, srcin, 15
> - mov wtmp, 0xf00f
> ld1 {vdata.16b}, [src]
> - dup vrepmask.8h, wtmp
> cmeq vhas_nul.16b, vdata.16b, 0
> lsl shift, srcin, 2
> - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> fmov synd, dend
> lsr synd, synd, shift
> cbz synd, L(loop)
> @@ -80,8 +74,7 @@ L(loop):
> fmov synd, dend
> cbz synd, L(loop)
>
> - and vhas_nul.16b, vhas_nul.16b, vrepmask.16b
> - addp vend.16b, vhas_nul.16b, vhas_nul.16b /* 128->64 */
> + shrn vend.8b, vhas_nul.8h, 4 /* 128->64 */
> sub result, src, srcin
> fmov synd, dend
> #ifndef __AARCH64EB__
> diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
> index 0dbecb0ce9..282bddc9aa 100644
> --- a/sysdeps/aarch64/strnlen.S
> +++ b/sysdeps/aarch64/strnlen.S
> @@ -33,39 +33,33 @@
> #define src x2
> #define synd x3
> #define shift x4
> -#define wtmp w4
> #define tmp x4
> #define cntrem x5
>
> #define qdata q0
> #define vdata v0
> #define vhas_chr v1
> -#define vrepmask v2
> -#define vend v3
> -#define dend d3
> +#define vend v2
> +#define dend d2
>
> /*
> Core algorithm:
>
> - For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
> - per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
> - requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
> - set likewise for odd bytes so that adjacent bytes can be merged. Since the
> - bits in the syndrome reflect the order in which things occur in the original
> - string, counting trailing zeros identifies exactly which byte matched. */
> + For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
> + per byte. We take 4 bits of every comparison byte with shift right and narrow
> + by 4 instruction. Since the bits in the nibble mask reflect the order in
> + which things occur in the original string, counting trailing zeros identifies
> + exactly which byte matched. */
>
> ENTRY (__strnlen)
> PTR_ARG (0)
> SIZE_ARG (1)
> bic src, srcin, 15
> - mov wtmp, 0xf00f
> cbz cntin, L(nomatch)
> ld1 {vdata.16b}, [src], 16
> - dup vrepmask.8h, wtmp
> cmeq vhas_chr.16b, vdata.16b, 0
> lsl shift, srcin, 2
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> fmov synd, dend
> lsr synd, synd, shift
> cbz synd, L(start_loop)
> @@ -103,8 +97,7 @@ L(loop32_2):
> cbz synd, L(loop32)
>
> L(end):
> - and vhas_chr.16b, vhas_chr.16b, vrepmask.16b
> - addp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
> + shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
> sub src, src, 16
> mov synd, vend.d[0]
> sub result, src, srcin
> --
> 2.37.0.rc0.161.g10f37bed90-goog
>
>
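
The same nibble-mask trick also leaves the patch's first-iteration handling of
unaligned starts unchanged: the load is done from the 16-byte aligned address
and the mask is shifted right by 4 * (srcin & 15) to discard the nibbles of
bytes before the real start (the `lsl shift, srcin, 2` / `lsr synd, synd,
shift` pair in the diff). A minimal sketch of that step, again not the glibc
code and with made-up names:

    #include <arm_neon.h>
    #include <stdint.h>
    #include <stdalign.h>
    #include <stdio.h>

    /* Look for byte C in the aligned 16-byte chunk that covers SRCIN. */
    static const char *find_in_first_chunk (const char *srcin, uint8_t c)
    {
      uintptr_t addr = (uintptr_t) srcin;
      const uint8_t *aligned = (const uint8_t *) (addr & ~(uintptr_t) 15);

      uint8x16_t data = vld1q_u8 (aligned);
      uint8x16_t eq   = vceqq_u8 (data, vdupq_n_u8 (c));
      uint8x8_t  nib  = vshrn_n_u16 (vreinterpretq_u16_u8 (eq), 4);  /* 128->64 */
      uint64_t   synd = vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);

      synd >>= (addr & 15) * 4;    /* drop nibbles of bytes before srcin */
      if (synd == 0)
        return NULL;               /* would fall through to the main loop */
      return srcin + (__builtin_ctzll (synd) >> 2);
    }

    int main (void)
    {
      alignas (16) char buf[64] = { 0 };
      buf[37] = 'Z';
      const char *hit = find_in_first_chunk (buf + 33, 'Z');
      printf ("found at offset %d from the start pointer\n",
              hit ? (int) (hit - (buf + 33)) : -1);
      return 0;
    }

With a 16-byte aligned buffer this prints offset 4, i.e. the bytes before the
unaligned start pointer are ignored even though they were loaded.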