This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
RE: [PATCH][AArch64][PING] Optimize strlen
- From: "Wilco Dijkstra" <wdijkstr at arm dot com>
- To: <libc-alpha at sourceware dot org>
- Cc: <marcus dot shawcroft at gmail dot com>
- Date: Mon, 6 Jul 2015 12:27:06 +0100
- Subject: RE: [PATCH][AArch64][PING] Optimize strlen
- Authentication-results: sourceware.org; auth=none
> Wilco Dijkstra wrote:
> ping
>
> > > -----Original Message-----
> > > From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
> > > Sent: 16 January 2015 17:39
> > > To: libc-alpha@sourceware.org
> > > Subject: [PATCH][AArch64] Optimize strlen
> > >
> > > Optimize the strlen implementation by using a page cross check and a fast check
> > > for NUL bytes which falls back to a separate loop when a non-ASCII character is
> > > encountered. Speedup on test-strlen is ~10%, long ASCII strings are processed
> > > ~60% faster, and random tests show ~80% improvement.
> > >
> > > ChangeLog:
> > >
> > > 2015-01-16 Wilco Dijkstra <wdijkstr@arm.com>
> > >
> > > * sysdeps/aarch64/strlen.S (strlen): Optimize strlen.
> > >
> > > ---
> > > sysdeps/aarch64/strlen.S | 230 +++++++++++++++++++++++++++++++++--------------
> > > 1 file changed, 165 insertions(+), 65 deletions(-)
> > >
> > > diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> > > index 54271b9..7ca47d9 100644
> > > --- a/sysdeps/aarch64/strlen.S
> > > +++ b/sysdeps/aarch64/strlen.S
> > > @@ -20,9 +20,13 @@
> > >
> > > /* Assumptions:
> > > *
> > > - * ARMv8-a, AArch64
> > > + * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
> > > */
> > >
> > > +/* To test the page crossing code path more thoroughly, compile with
> > > + -DTEST_PAGE_CROSS - this will force all calls through the slower
> > > + entry path. This option is not intended for production use. */
> > > +
> > > /* Arguments and results. */
> > > #define srcin x0
> > > #define len x0
> > > @@ -31,87 +35,183 @@
> > > #define src x1
> > > #define data1 x2
> > > #define data2 x3
> > > -#define data2a x4
> > > -#define has_nul1 x5
> > > -#define has_nul2 x6
> > > -#define tmp1 x7
> > > -#define tmp2 x8
> > > -#define tmp3 x9
> > > -#define tmp4 x10
> > > -#define zeroones x11
> > > -#define pos x12
> > > +#define has_nul1 x4
> > > +#define has_nul2 x5
> > > +#define tmp1 x4
> > > +#define tmp2 x5
> > > +#define tmp3 x6
> > > +#define tmp4 x7
> > > +#define zeroones x8
> > > +
> > > + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> > > + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> > > + can be done in parallel across the entire word. A faster check
> > > + (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
> > > + false hits for characters 129..255. */
> > >
> > > #define REP8_01 0x0101010101010101
> > > #define REP8_7f 0x7f7f7f7f7f7f7f7f
> > > #define REP8_80 0x8080808080808080
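
As a rough C model of the two checks described above (standalone illustration,
not part of the patch; the helper names are invented here):

    #include <stdint.h>
    #include <stdio.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL
    #define REP8_80 0x8080808080808080ULL

    /* Accurate check: non-zero iff some byte of x is zero.  */
    static uint64_t has_zero_accurate (uint64_t x)
    {
      return (x - REP8_01) & ~(x | REP8_7f);
    }

    /* Fast check: zero for NUL-free ASCII words, but false-hits on
       bytes in the range 0x81..0xff.  */
    static uint64_t has_zero_fast (uint64_t x)
    {
      return (x - REP8_01) & REP8_80;
    }

    int main (void)
    {
      /* Word containing a NUL byte: both checks fire.  */
      printf ("%d\n", has_zero_accurate (0x6162630065666768ULL) != 0); /* 1 */
      /* NUL-free word containing 0x81: only the fast check fires.  */
      printf ("%d\n", has_zero_accurate (0x6162638165666768ULL) != 0); /* 0 */
      printf ("%d\n", has_zero_fast (0x6162638165666768ULL) != 0);     /* 1 */
      return 0;
    }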
> > >
> > > - /* Start of critial section -- keep to one 64Byte cache line. */
> > > +#ifdef TEST_PAGE_CROSS
> > > +# define MIN_PAGE_SIZE 15
> > > +#else
> > > +# define MIN_PAGE_SIZE 4096
> > > +#endif
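
In C terms, the page cross check at the entry of strlen amounts to the
following (illustrative sketch, not part of the patch):

    #include <stdbool.h>
    #include <stdint.h>

    #define MIN_PAGE_SIZE 4096

    /* True iff an unaligned 16-byte load at srcin might cross a
       minimum-size page boundary.  The compare is signed to mirror
       b.gt, which matters for the TEST_PAGE_CROSS build, where
       MIN_PAGE_SIZE - 16 is negative and the check must always
       take the slow path.  */
    static bool load16_may_cross_page (uintptr_t srcin)
    {
      return (intptr_t) (srcin & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16;
    }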
> > > +
> > > + /* Since strings are short on average, we check the first 16 bytes
> > > + of the string for a NUL character. In order to do an unaligned ldp
> > > + safely we have to do a page cross check first. If there is a NUL
> > > + byte we calculate the length from the 2 8-byte words using
> > > + conditional select to reduce branch mispredictions (it is unlikely
> > > + strlen will be repeatedly called on strings with the same length).
> > > +
> > > + If the string is longer than 16 bytes, we align src so we don't
> > > + need further page cross checks, and process 32 bytes per iteration
> > > + using the fast NUL check. If we encounter non-ASCII characters,
> > > + fall back to a second loop using the full NUL check.
> > > +
> > > + If the page cross check fails, we read 16 bytes from an aligned
> > > + address, remove any characters before the string, and continue
> > > + in the main loop using aligned loads. Since strings crossing a
> > > + page in the first 16 bytes are rare (probability of
> > > + 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
> > > +
> > > + AArch64 systems have a minimum page size of 4k. We don't bother
> > > + checking for larger page sizes - the cost of setting up the correct
> > > + page size is just not worth the extra gain from a small reduction in
> > > + the cases taking the slow path. Note that we only care about
> > > + whether the first fetch, which may be misaligned, crosses a page
> > > + boundary. */
> > > +
> > > ENTRY_ALIGN (strlen, 6)
> > > - mov zeroones, #REP8_01
> > > - bic src, srcin, #15
> > > - ands tmp1, srcin, #15
> > > - b.ne L(misaligned)
> > > - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> > > - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> > > - can be done in parallel across the entire word. */
> > > - /* The inner loop deals with two Dwords at a time. This has a
> > > - slightly higher start-up cost, but we should win quite quickly,
> > > - especially on cores with a high number of issue slots per
> > > - cycle, as we get much better parallelism out of the operations. */
> > > -L(loop):
> > > - ldp data1, data2, [src], #16
> > > -L(realigned):
> > > + and tmp1, srcin, MIN_PAGE_SIZE - 1
> > > + mov zeroones, REP8_01
> > > + cmp tmp1, MIN_PAGE_SIZE - 16
> > > + b.gt L(page_cross)
> > > + ldp data1, data2, [srcin]
> > > +#ifdef __AARCH64EB__
> > > + /* For big-endian, carry propagation (if the final byte in the
> > > + string is 0x01) means we cannot use has_nul1/2 directly.
> > > + Since we expect strings to be small and to take the early exit,
> > > + byte-swap the data now so has_nul1/2 will be correct. */
> > > + rev data1, data1
> > > + rev data2, data2
> > > +#endif
> > > sub tmp1, data1, zeroones
> > > - orr tmp2, data1, #REP8_7f
> > > + orr tmp2, data1, REP8_7f
> > > sub tmp3, data2, zeroones
> > > - orr tmp4, data2, #REP8_7f
> > > - bic has_nul1, tmp1, tmp2
> > > - bics has_nul2, tmp3, tmp4
> > > - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
> > > - b.eq L(loop)
> > > - /* End of critical section -- keep to one 64Byte cache line. */
> > > + orr tmp4, data2, REP8_7f
> > > + bics has_nul1, tmp1, tmp2
> > > + bic has_nul2, tmp3, tmp4
> > > + ccmp has_nul2, 0, 0, eq
> > > + beq L(main_loop_entry)
> > >
> > > - sub len, src, srcin
> > > - cbz has_nul1, L(nul_in_data2)
> > > -#ifdef __AARCH64EB__
> > > - mov data2, data1
> > > -#endif
> > > - sub len, len, #8
> > > - mov has_nul2, has_nul1
> > > -L(nul_in_data2):
> > > + /* Enter with C = has_nul1 == 0. */
> > > + csel has_nul1, has_nul1, has_nul2, cc
> > > + mov len, 8
> > > + rev has_nul1, has_nul1
> > > + clz tmp1, has_nul1
> > > + csel len, xzr, len, cc
> > > + add len, len, tmp1, lsr 3
> > > + ret
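
The tail above, in rough C (illustrative, not part of the patch; GCC builtins
stand in for the rev and clz instructions, and on big-endian the data words
have already been byte-swapped at this point):

    #include <stddef.h>
    #include <stdint.h>

    /* has_nul1/has_nul2 are the syndromes of the first and second
       8-byte word; at least one is non-zero when we get here.  */
    static size_t len_from_syndromes (uint64_t has_nul1, uint64_t has_nul2)
    {
      uint64_t syndrome = has_nul1 ? has_nul1 : has_nul2;  /* csel */
      size_t len = has_nul1 ? 0 : 8;                       /* csel */
      /* The first NUL is the lowest flagged byte; byte-swap so it
         becomes the highest, then count leading zero bits.  */
      return len + (__builtin_clzll (__builtin_bswap64 (syndrome)) >> 3);
    }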
> > > +
> > > + /* The inner loop processes 32 bytes per iteration and uses the fast
> > > + NUL check. If we encounter non-ASCII characters, use a second
> > > + loop with the accurate NUL check. */
> > > + .p2align 4
> > > +L(main_loop_entry):
> > > + bic src, srcin, 15
> > > + sub src, src, 16
> > > +L(main_loop):
> > > + ldp data1, data2, [src, 32]!
> > > +L(page_cross_entry):
> > > + sub tmp1, data1, zeroones
> > > + sub tmp3, data2, zeroones
> > > + orr tmp2, tmp1, tmp3
> > > + tst tmp2, zeroones, lsl 7
> > > + bne 1f
> > > + ldp data1, data2, [src, 16]
> > > + sub tmp1, data1, zeroones
> > > + sub tmp3, data2, zeroones
> > > + orr tmp2, tmp1, tmp3
> > > + tst tmp2, zeroones, lsl 7
> > > + beq L(main_loop)
> > > + add src, src, 16
> > > +1:
> > > + /* The fast check failed, so do the slower, accurate NUL check. */
> > > + orr tmp2, data1, REP8_7f
> > > + orr tmp4, data2, REP8_7f
> > > + bics has_nul1, tmp1, tmp2
> > > + bic has_nul2, tmp3, tmp4
> > > + ccmp has_nul2, 0, 0, eq
> > > + beq L(nonascii_loop)
> > > +
> > > + /* Enter with C = has_nul1 == 0. */
> > > +L(tail):
> > > #ifdef __AARCH64EB__
> > > /* For big-endian, carry propagation (if the final byte in the
> > > - string is 0x01) means we cannot use has_nul directly. The
> > > + string is 0x01) means we cannot use has_nul1/2 directly. The
> > > easiest way to get the correct byte is to byte-swap the data
> > > and calculate the syndrome a second time. */
> > > - rev data2, data2
> > > - sub tmp1, data2, zeroones
> > > - orr tmp2, data2, #REP8_7f
> > > - bic has_nul2, tmp1, tmp2
> > > + csel data1, data1, data2, cc
> > > + rev data1, data1
> > > + sub tmp1, data1, zeroones
> > > + orr tmp2, data1, REP8_7f
> > > + bic has_nul1, tmp1, tmp2
> > > +#else
> > > + csel has_nul1, has_nul1, has_nul2, cc
> > > #endif
> > > - sub len, len, #8
> > > - rev has_nul2, has_nul2
> > > - clz pos, has_nul2
> > > - add len, len, pos, lsr #3 /* Bits to bytes. */
> > > - RET
> > > -
> > > -L(misaligned):
> > > - cmp tmp1, #8
> > > - neg tmp1, tmp1
> > > - ldp data1, data2, [src], #16
> > > - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
> > > - mov tmp2, #~0
> > > + sub len, src, srcin
> > > + rev has_nul1, has_nul1
> > > + add tmp2, len, 8
> > > + clz tmp1, has_nul1
> > > + csel len, len, tmp2, cc
> > > + add len, len, tmp1, lsr 3
> > > + ret
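
A concrete example of the carry problem handled above (illustration, not part
of the patch): for the string "abcdef\x01", the 8 bytes loaded on a big-endian
machine form the value 0x6162636465660100, with the terminating NUL in the
lowest byte.

    #include <stdint.h>
    #include <stdio.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    static uint64_t syndrome (uint64_t x)
    {
      return (x - REP8_01) & ~(x | REP8_7f);
    }

    int main (void)
    {
      /* Prints 8080: the borrow out of the 0x00 byte also flags the
         neighbouring 0x01 byte, so scanning from the most-significant
         end (string order on big-endian) would stop one byte early.
         Byte-swapping the data first puts the first NUL at the
         least-significant end, which no borrow can reach.  */
      printf ("%llx\n", (unsigned long long) syndrome (0x6162636465660100ULL));
      return 0;
    }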
> > > +
> > > +L(nonascii_loop):
> > > + ldp data1, data2, [src, 16]!
> > > + sub tmp1, data1, zeroones
> > > + orr tmp2, data1, REP8_7f
> > > + sub tmp3, data2, zeroones
> > > + orr tmp4, data2, REP8_7f
> > > + bics has_nul1, tmp1, tmp2
> > > + bic has_nul2, tmp3, tmp4
> > > + ccmp has_nul2, 0, 0, eq
> > > + bne L(tail)
> > > + ldp data1, data2, [src, 16]!
> > > + sub tmp1, data1, zeroones
> > > + orr tmp2, data1, REP8_7f
> > > + sub tmp3, data2, zeroones
> > > + orr tmp4, data2, REP8_7f
> > > + bics has_nul1, tmp1, tmp2
> > > + bic has_nul2, tmp3, tmp4
> > > + ccmp has_nul2, 0, 0, eq
> > > + beq L(nonascii_loop)
> > > + b L(tail)
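
The overall two-loop structure, simplified to one 8-byte word per iteration
(illustrative C, not part of the patch; helper names are invented here):

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL
    #define REP8_80 0x8080808080808080ULL

    static uint64_t has_zero_fast (uint64_t x)     /* may false-hit */
    { return (x - REP8_01) & REP8_80; }

    static uint64_t has_zero_accurate (uint64_t x) /* exact */
    { return (x - REP8_01) & ~(x | REP8_7f); }

    /* p must be 8-byte aligned and point into a NUL-terminated
       string, so the loops are guaranteed to terminate.  */
    static const uint64_t *find_nul_word (const uint64_t *p)
    {
      for (;;)
        {
          if (has_zero_fast (*p))       /* Cheap test, may false-hit.  */
            {
              if (has_zero_accurate (*p))
                return p;               /* Word really contains a NUL.  */
              /* Non-ASCII data: any further word may false-hit, so
                 switch to the accurate check alone.  */
              while (!has_zero_accurate (*++p))
                continue;
              return p;
            }
          p++;
        }
    }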
> > > +
> > > + /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
> > > + srcin to 0x7f, so we ignore any NUL bytes before the string.
> > > + Then continue in the aligned loop. */
> > > +L(page_cross):
> > > + bic src, srcin, 15
> > > + ldp data1, data2, [src]
> > > + lsl tmp1, srcin, 3
> > > + mov tmp4, -1
> > > #ifdef __AARCH64EB__
> > > - /* Big-endian. Early bytes are at MSB. */
> > > - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
> > > + /* Big-endian. Early bytes are at MSB. */
> > > + lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
> > > #else
> > > /* Little-endian. Early bytes are at LSB. */
> > > - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
> > > + lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
> > > #endif
> > > - orr data1, data1, tmp2
> > > - orr data2a, data2, tmp2
> > > - csinv data1, data1, xzr, le
> > > - csel data2, data2, data2a, le
> > > - b L(realigned)
> > > + orr tmp1, tmp1, REP8_80
> > > + orn data1, data1, tmp1
> > > + orn tmp2, data2, tmp1
> > > + tst srcin, 8
> > > + csel data1, data1, tmp4, eq
> > > + csel data2, data2, tmp2, eq
> > > + b L(page_cross_entry)
> > > END (strlen)
> > > libc_hidden_builtin_def (strlen)
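
For reference, the L(page_cross) fixup above in rough C, little-endian case
(illustrative, not part of the patch; the real code loads 16 bytes and picks
the relevant word with tst/csel, this sketch masks a single 8-byte word):

    #include <stdint.h>

    #define REP8_80 0x8080808080808080ULL

    /* data is the 8-byte word loaded from srcin & ~7; force every
       byte before the string start to 0x7f or 0xff so the accurate
       NUL check cannot flag it.  (A resulting 0xff byte may still
       trip the fast check, which harmlessly diverts to the accurate
       loop.)  */
    static uint64_t mask_leading_bytes (uint64_t data, uintptr_t srcin)
    {
      /* (srcin & 7) * 8 = bit offset of the first string byte;
         shifts use only the low 6 bits, as on AArch64.  */
      uint64_t tmp = (~0ULL << ((srcin << 3) & 63)) | REP8_80;
      return data | ~tmp;   /* orn data, data, tmp */
    }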
> > > --
> > > 1.9.1