This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
RE: [PATCH][AArch64] Optimize strlen
- From: "Wilco Dijkstra" <wdijkstr at arm dot com>
- To: <libc-alpha at sourceware dot org>
- Date: Fri, 27 Feb 2015 15:06:05 -0000
- Subject: RE: [PATCH][AArch64] Optimize strlen
- References:
ping
> -----Original Message-----
> From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
> Sent: 16 January 2015 17:39
> To: libc-alpha@sourceware.org
> Subject: [PATCH][AArch64] Optimize strlen
>
> Optimize the strlen implementation by using a page cross check and a fast check
> for NUL bytes which falls back to a separate loop when a non-ASCII character is
> encountered. Speedup on test-strlen is ~10%, long ASCII strings are processed
> ~60% faster, and on random tests it is ~80% better.
>
> ChangeLog:
>
> 2015-01-16 Wilco Dijkstra <wdijkstr@arm.com>
>
> * sysdeps/aarch64/strlen.S (strlen): Optimize strlen.
>
> ---
> sysdeps/aarch64/strlen.S | 230 +++++++++++++++++++++++++++++++++--------------
> 1 file changed, 165 insertions(+), 65 deletions(-)
>
> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> index 54271b9..7ca47d9 100644
> --- a/sysdeps/aarch64/strlen.S
> +++ b/sysdeps/aarch64/strlen.S
> @@ -20,9 +20,13 @@
>
> /* Assumptions:
> *
> - * ARMv8-a, AArch64
> + * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
> */
>
> +/* To test the page crossing code path more thoroughly, compile with
> + -DTEST_PAGE_CROSS - this will force all calls through the slower
> + entry path. This option is not intended for production use. */
> +
> /* Arguments and results. */
> #define srcin x0
> #define len x0
> @@ -31,87 +35,183 @@
> #define src x1
> #define data1 x2
> #define data2 x3
> -#define data2a x4
> -#define has_nul1 x5
> -#define has_nul2 x6
> -#define tmp1 x7
> -#define tmp2 x8
> -#define tmp3 x9
> -#define tmp4 x10
> -#define zeroones x11
> -#define pos x12
> +#define has_nul1 x4
> +#define has_nul2 x5
> +#define tmp1 x4
> +#define tmp2 x5
> +#define tmp3 x6
> +#define tmp4 x7
> +#define zeroones x8
> +
> + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> + can be done in parallel across the entire word. A faster check
> + (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
> + false hits for characters 129..255. */
>
> #define REP8_01 0x0101010101010101
> #define REP8_7f 0x7f7f7f7f7f7f7f7f
> #define REP8_80 0x8080808080808080
>
> - /* Start of critial section -- keep to one 64Byte cache line. */
> +#ifdef TEST_PAGE_CROSS
> +# define MIN_PAGE_SIZE 15
> +#else
> +# define MIN_PAGE_SIZE 4096
> +#endif
> +
> + /* Since strings are short on average, we check the first 16 bytes
> + of the string for a NUL character. In order to do an unaligned ldp
> + safely we have to do a page cross check first. If there is a NUL
> + byte we calculate the length from the 2 8-byte words using
> + conditional select to reduce branch mispredictions (it is unlikely
> + strlen will be repeatedly called on strings with the same length).
> +
> + If the string is longer than 16 bytes, we align src so we don't need
> + further page cross checks, and process 32 bytes per iteration
> + using the fast NUL check. If we encounter non-ASCII characters,
> + fall back to a second loop using the full NUL check.
> +
> + If the page cross check fails, we read 16 bytes from an aligned
> + address, remove any characters before the string, and continue
> + in the main loop using aligned loads. Since strings crossing a
> + page in the first 16 bytes are rare (probability of
> + 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
> +
> + AArch64 systems have a minimum page size of 4k. We don't bother
> + checking for larger page sizes - the cost of setting up the correct
> + page size is just not worth the extra gain from a small reduction in
> + the cases taking the slow path. Note that we only care about
> + whether the first fetch, which may be misaligned, crosses a page
> + boundary. */
> +
> ENTRY_ALIGN (strlen, 6)
> - mov zeroones, #REP8_01
> - bic src, srcin, #15
> - ands tmp1, srcin, #15
> - b.ne L(misaligned)
> - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> - can be done in parallel across the entire word. */
> - /* The inner loop deals with two Dwords at a time. This has a
> - slightly higher start-up cost, but we should win quite quickly,
> - especially on cores with a high number of issue slots per
> - cycle, as we get much better parallelism out of the operations. */
> -L(loop):
> - ldp data1, data2, [src], #16
> -L(realigned):
> + and tmp1, srcin, MIN_PAGE_SIZE - 1
> + mov zeroones, REP8_01
> + cmp tmp1, MIN_PAGE_SIZE - 16
> + b.gt L(page_cross)
> + ldp data1, data2, [srcin]
> +#ifdef __AARCH64EB__
> + /* For big-endian, carry propagation (if the final byte in the
> + string is 0x01) means we cannot use has_nul1/2 directly.
> + Since we expect strings to be small and early-exit,
> + byte-swap the data now so has_nul1/2 will be correct. */
> + rev data1, data1
> + rev data2, data2
> +#endif
> sub tmp1, data1, zeroones
> - orr tmp2, data1, #REP8_7f
> + orr tmp2, data1, REP8_7f
> sub tmp3, data2, zeroones
> - orr tmp4, data2, #REP8_7f
> - bic has_nul1, tmp1, tmp2
> - bics has_nul2, tmp3, tmp4
> - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
> - b.eq L(loop)
> - /* End of critical section -- keep to one 64Byte cache line. */
> + orr tmp4, data2, REP8_7f
> + bics has_nul1, tmp1, tmp2
> + bic has_nul2, tmp3, tmp4
> + ccmp has_nul2, 0, 0, eq
> + beq L(main_loop_entry)
>
> - sub len, src, srcin
> - cbz has_nul1, L(nul_in_data2)
> -#ifdef __AARCH64EB__
> - mov data2, data1
> -#endif
> - sub len, len, #8
> - mov has_nul2, has_nul1
> -L(nul_in_data2):
> + /* Enter with C = has_nul1 == 0. */
> + csel has_nul1, has_nul1, has_nul2, cc
> + mov len, 8
> + rev has_nul1, has_nul1
> + clz tmp1, has_nul1
> + csel len, xzr, len, cc
> + add len, len, tmp1, lsr 3
> + ret
> +
> + /* The inner loop processes 32 bytes per iteration and uses the fast
> + NUL check. If we encounter non-ASCII characters, use a second
> + loop with the accurate NUL check. */
> + .p2align 4
> +L(main_loop_entry):
> + bic src, srcin, 15
> + sub src, src, 16
> +L(main_loop):
> + ldp data1, data2, [src, 32]!
> +L(page_cross_entry):
> + sub tmp1, data1, zeroones
> + sub tmp3, data2, zeroones
> + orr tmp2, tmp1, tmp3
> + tst tmp2, zeroones, lsl 7
> + bne 1f
> + ldp data1, data2, [src, 16]
> + sub tmp1, data1, zeroones
> + sub tmp3, data2, zeroones
> + orr tmp2, tmp1, tmp3
> + tst tmp2, zeroones, lsl 7
> + beq L(main_loop)
> + add src, src, 16
> +1:
> + /* The fast check failed, so do the slower, accurate NUL check. */
> + orr tmp2, data1, REP8_7f
> + orr tmp4, data2, REP8_7f
> + bics has_nul1, tmp1, tmp2
> + bic has_nul2, tmp3, tmp4
> + ccmp has_nul2, 0, 0, eq
> + beq L(nonascii_loop)
> +
> + /* Enter with C = has_nul1 == 0. */
> +L(tail):
> #ifdef __AARCH64EB__
> /* For big-endian, carry propagation (if the final byte in the
> - string is 0x01) means we cannot use has_nul directly. The
> + string is 0x01) means we cannot use has_nul1/2 directly. The
> easiest way to get the correct byte is to byte-swap the data
> and calculate the syndrome a second time. */
> - rev data2, data2
> - sub tmp1, data2, zeroones
> - orr tmp2, data2, #REP8_7f
> - bic has_nul2, tmp1, tmp2
> + csel data1, data1, data2, cc
> + rev data1, data1
> + sub tmp1, data1, zeroones
> + orr tmp2, data1, REP8_7f
> + bic has_nul1, tmp1, tmp2
> +#else
> + csel has_nul1, has_nul1, has_nul2, cc
> #endif
> - sub len, len, #8
> - rev has_nul2, has_nul2
> - clz pos, has_nul2
> - add len, len, pos, lsr #3 /* Bits to bytes. */
> - RET
> -
> -L(misaligned):
> - cmp tmp1, #8
> - neg tmp1, tmp1
> - ldp data1, data2, [src], #16
> - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
> - mov tmp2, #~0
> + sub len, src, srcin
> + rev has_nul1, has_nul1
> + add tmp2, len, 8
> + clz tmp1, has_nul1
> + csel len, len, tmp2, cc
> + add len, len, tmp1, lsr 3
> + ret
> +
> +L(nonascii_loop):
> + ldp data1, data2, [src, 16]!
> + sub tmp1, data1, zeroones
> + orr tmp2, data1, REP8_7f
> + sub tmp3, data2, zeroones
> + orr tmp4, data2, REP8_7f
> + bics has_nul1, tmp1, tmp2
> + bic has_nul2, tmp3, tmp4
> + ccmp has_nul2, 0, 0, eq
> + bne L(tail)
> + ldp data1, data2, [src, 16]!
> + sub tmp1, data1, zeroones
> + orr tmp2, data1, REP8_7f
> + sub tmp3, data2, zeroones
> + orr tmp4, data2, REP8_7f
> + bics has_nul1, tmp1, tmp2
> + bic has_nul2, tmp3, tmp4
> + ccmp has_nul2, 0, 0, eq
> + beq L(nonascii_loop)
> + b L(tail)
> +
> + /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
> + srcin to 0x7f, so we ignore any NUL bytes before the string.
> + Then continue in the aligned loop. */
> +L(page_cross):
> + bic src, srcin, 15
> + ldp data1, data2, [src]
> + lsl tmp1, srcin, 3
> + mov tmp4, -1
> #ifdef __AARCH64EB__
> - /* Big-endian. Early bytes are at MSB. */
> - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
> + /* Big-endian. Early bytes are at MSB. */
> + lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
> #else
> /* Little-endian. Early bytes are at LSB. */
> - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
> + lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
> #endif
> - orr data1, data1, tmp2
> - orr data2a, data2, tmp2
> - csinv data1, data1, xzr, le
> - csel data2, data2, data2a, le
> - b L(realigned)
> + orr tmp1, tmp1, REP8_80
> + orn data1, data1, tmp1
> + orn tmp2, data2, tmp1
> + tst srcin, 8
> + csel data1, data1, tmp4, eq
> + csel data2, data2, tmp2, eq
> + b L(page_cross_entry)
> END (strlen)
> libc_hidden_builtin_def (strlen)
> --
> 1.9.1
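
For reference, the two NUL checks the patch comments describe can be modelled
in C roughly as follows. This is an illustrative sketch only, not part of the
patch; the helper names has_nul_accurate and has_nul_fast are invented here.

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL
#define REP8_80 0x8080808080808080ULL

/* Accurate check: non-zero iff the 64-bit word contains a zero byte.
   (X - 0x01..01) & ~(X | 0x7f..7f) sets bit 7 of every zero byte.  */
static inline uint64_t
has_nul_accurate (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}

/* Fast check: zero when every byte is non-NUL ASCII (0x01..0x7f) and
   non-zero when the word contains a zero byte, but it can also give
   false hits for bytes 0x81..0xff, so a hit must be confirmed with the
   accurate check.  */
static inline uint64_t
has_nul_fast (uint64_t x)
{
  return (x - REP8_01) & REP8_80;
}

The main loop applies the fast test to 16 bytes at a time (tst tmp2,
zeroones, lsl 7, with tmp2 the OR of the two per-word subtractions) and
only falls back to the accurate check when that test fires.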
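
The first-16-bytes path computes the length without branching on which
8-byte word holds the NUL: csel picks between the two syndromes and
between the 0 and 8 byte offsets, and rev+clz converts the selected
syndrome into a byte index. A little-endian C approximation follows
(illustrative only; len_from_syndromes is an invented name, and
count-trailing-zeros stands in for the rev+clz pair):

#include <stddef.h>
#include <stdint.h>

/* Given the accurate syndromes of the first and second 8-byte words,
   at least one of which is non-zero, return the offset of the first
   NUL byte.  Little-endian: bit 7 of the first zero byte is the lowest
   set bit of its syndrome.  */
static inline size_t
len_from_syndromes (uint64_t has_nul1, uint64_t has_nul2)
{
  uint64_t syndrome = has_nul1 ? has_nul1 : has_nul2;   /* csel */
  size_t base = has_nul1 ? 0 : 8;                       /* csel */
  return base + ((size_t) __builtin_ctzll (syndrome) >> 3);
}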
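
L(page_cross) handles the rare case where the first unaligned 16-byte
load could cross a page: it loads from srcin & ~15 instead and ORs 0x7f
into every byte before the start of the string so those bytes can never
be detected as NUL. For the little-endian case the per-word masking is
roughly the following (illustrative sketch; mask_bytes_before is an
invented name, and the asm additionally uses csel to replace the first
word with all-ones when the string starts in the second 8-byte word):

#include <stdint.h>

#define REP8_7f 0x7f7f7f7f7f7f7f7fULL

/* Little-endian: OR 0x7f into every byte of WORD below the byte offset
   of ADDR within the 8-byte load, so bytes before the string never read
   as NUL.  A masked byte may become 0xff and trip the fast check, which
   the accurate check then rejects.  */
static inline uint64_t
mask_bytes_before (uint64_t word, uintptr_t addr)
{
  unsigned int shift = (addr & 7) * 8;
  uint64_t before = ~(~0ULL << shift);  /* 1-bits in the bytes below addr */
  return word | (before & REP8_7f);
}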