This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



RE: [PATCH][AArch64][PING] Optimize strlen


> Wilco Dijkstra wrote:
> ping
> 
> > > -----Original Message-----
> > > From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
> > > Sent: 16 January 2015 17:39
> > > To: libc-alpha@sourceware.org
> > > Subject: [PATCH][AArch64] Optimize strlen
> > >
> > > Optimize the strlen implementation by using a page cross check and a fast check
> > > for nul bytes which falls back to a separate loop when a non-ASCII character is
> > > encountered.  Speedup on test-strlen is ~10%, long ASCII strings are processed
> > > ~60% faster, and on random tests it is ~80% faster.
> > >
> > > ChangeLog:
> > >
> > > 2015-01-16  Wilco Dijkstra  <wdijkstr@arm.com>
> > >
> > > 	* sysdeps/aarch64/strlen.S (strlen): Optimize strlen.
> > >
> > > ---
> > >  sysdeps/aarch64/strlen.S | 230 +++++++++++++++++++++++++++++++++--------------
> > >  1 file changed, 165 insertions(+), 65 deletions(-)
> > >
> > > diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> > > index 54271b9..7ca47d9 100644
> > > --- a/sysdeps/aarch64/strlen.S
> > > +++ b/sysdeps/aarch64/strlen.S
> > > @@ -20,9 +20,13 @@
> > >
> > >  /* Assumptions:
> > >   *
> > > - * ARMv8-a, AArch64
> > > + * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
> > >   */
> > >
> > > +/* To test the page crossing code path more thoroughly, compile with
> > > +   -DTEST_PAGE_CROSS - this will force all calls through the slower
> > > +   entry path.  This option is not intended for production use.  */
> > > +
> > >  /* Arguments and results.  */
> > >  #define srcin		x0
> > >  #define len		x0
> > > @@ -31,87 +35,183 @@
> > >  #define src		x1
> > >  #define data1		x2
> > >  #define data2		x3
> > > -#define data2a		x4
> > > -#define has_nul1	x5
> > > -#define has_nul2	x6
> > > -#define tmp1		x7
> > > -#define tmp2		x8
> > > -#define tmp3		x9
> > > -#define tmp4		x10
> > > -#define zeroones	x11
> > > -#define pos		x12
> > > +#define has_nul1	x4
> > > +#define has_nul2	x5
> > > +#define tmp1		x4
> > > +#define tmp2		x5
> > > +#define tmp3		x6
> > > +#define tmp4		x7
> > > +#define zeroones	x8
> > > +
> > > +	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> > > +	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> > > +	   can be done in parallel across the entire word. A faster check
> > > +	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
> > > +	   false hits for characters 129..255.	*/
> > >
> > >  #define REP8_01 0x0101010101010101
> > >  #define REP8_7f 0x7f7f7f7f7f7f7f7f
> > >  #define REP8_80 0x8080808080808080
> > >
> > > -	/* Start of critial section -- keep to one 64Byte cache line.  */
> > > +#ifdef TEST_PAGE_CROSS
> > > +# define MIN_PAGE_SIZE 15
> > > +#else
> > > +# define MIN_PAGE_SIZE 4096
> > > +#endif
> > > +
> > > +	/* Since strings are short on average, we check the first 16 bytes
> > > +	   of the string for a NUL character.  In order to do an unaligned ldp
> > > +	   safely we have to do a page cross check first.  If there is a NUL
> > > +	   byte we calculate the length from the 2 8-byte words using
> > > +	   conditional select to reduce branch mispredictions (it is unlikely
> > > +	   strlen will be repeatedly called on strings with the same length).
> > > +
> > > +	   If the string is longer than 16 bytes, we align src so we don't need
> > > +	   further page cross checks, and process 32 bytes per iteration
> > > +	   using the fast NUL check.  If we encounter non-ASCII characters,
> > > +	   fall back to a second loop using the full NUL check.
> > > +
> > > +	   If the page cross check fails, we read 16 bytes from an aligned
> > > +	   address, remove any characters before the string, and continue
> > > +	   in the main loop using aligned loads.  Since strings crossing a
> > > +	   page in the first 16 bytes are rare (probability of
> > > +	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
> > > +
> > > +	   AArch64 systems have a minimum page size of 4k.  We don't bother
> > > +	   checking for larger page sizes - the cost of setting up the correct
> > > +	   page size is just not worth the extra gain from a small reduction in
> > > +	   the cases taking the slow path.  Note that we only care about
> > > +	   whether the first fetch, which may be misaligned, crosses a page
> > > +	   boundary.  */
> > > +
> > >  ENTRY_ALIGN (strlen, 6)
> > > -	mov	zeroones, #REP8_01
> > > -	bic	src, srcin, #15
> > > -	ands	tmp1, srcin, #15
> > > -	b.ne	L(misaligned)
> > > -	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> > > -	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> > > -	   can be done in parallel across the entire word.  */
> > > -	/* The inner loop deals with two Dwords at a time.  This has a
> > > -	   slightly higher start-up cost, but we should win quite quickly,
> > > -	   especially on cores with a high number of issue slots per
> > > -	   cycle, as we get much better parallelism out of the operations.  */
> > > -L(loop):
> > > -	ldp	data1, data2, [src], #16
> > > -L(realigned):
> > > +	and	tmp1, srcin, MIN_PAGE_SIZE - 1
> > > +	mov	zeroones, REP8_01
> > > +	cmp	tmp1, MIN_PAGE_SIZE - 16
> > > +	b.gt	L(page_cross)
> > > +	ldp	data1, data2, [srcin]
> > > +#ifdef __AARCH64EB__
> > > +	/* For big-endian, carry propagation (if the final byte in the
> > > +	   string is 0x01) means we cannot use has_nul1/2 directly.
> > > +	   Since we expect strings to be small and early-exit,
> > > +	   byte-swap the data now so has_nul1/2 will be correct.  */
> > > +	rev	data1, data1
> > > +	rev	data2, data2
> > > +#endif
> > >  	sub	tmp1, data1, zeroones
> > > -	orr	tmp2, data1, #REP8_7f
> > > +	orr	tmp2, data1, REP8_7f
> > >  	sub	tmp3, data2, zeroones
> > > -	orr	tmp4, data2, #REP8_7f
> > > -	bic	has_nul1, tmp1, tmp2
> > > -	bics	has_nul2, tmp3, tmp4
> > > -	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
> > > -	b.eq	L(loop)
> > > -	/* End of critical section -- keep to one 64Byte cache line.  */
> > > +	orr	tmp4, data2, REP8_7f
> > > +	bics	has_nul1, tmp1, tmp2
> > > +	bic	has_nul2, tmp3, tmp4
> > > +	ccmp	has_nul2, 0, 0, eq
> > > +	beq	L(main_loop_entry)
> > >
> > > -	sub	len, src, srcin
> > > -	cbz	has_nul1, L(nul_in_data2)
> > > -#ifdef __AARCH64EB__
> > > -	mov	data2, data1
> > > -#endif
> > > -	sub	len, len, #8
> > > -	mov	has_nul2, has_nul1
> > > -L(nul_in_data2):
> > > +	/* Enter with C = has_nul1 == 0.  */
> > > +	csel	has_nul1, has_nul1, has_nul2, cc
> > > +	mov	len, 8
> > > +	rev	has_nul1, has_nul1
> > > +	clz	tmp1, has_nul1
> > > +	csel	len, xzr, len, cc
> > > +	add	len, len, tmp1, lsr 3
> > > +	ret
> > > +
> > > +	/* The inner loop processes 32 bytes per iteration and uses the fast
> > > +	   NUL check.  If we encounter non-ASCII characters, use a second
> > > +	   loop with the accurate NUL check.  */
> > > +	.p2align 4
> > > +L(main_loop_entry):
> > > +	bic	src, srcin, 15
> > > +	sub	src, src, 16
> > > +L(main_loop):
> > > +	ldp	data1, data2, [src, 32]!
> > > +L(page_cross_entry):
> > > +	sub	tmp1, data1, zeroones
> > > +	sub	tmp3, data2, zeroones
> > > +	orr	tmp2, tmp1, tmp3
> > > +	tst	tmp2, zeroones, lsl 7
> > > +	bne	1f
> > > +	ldp	data1, data2, [src, 16]
> > > +	sub	tmp1, data1, zeroones
> > > +	sub	tmp3, data2, zeroones
> > > +	orr	tmp2, tmp1, tmp3
> > > +	tst	tmp2, zeroones, lsl 7
> > > +	beq	L(main_loop)
> > > +	add	src, src, 16
> > > +1:
> > > +	/* The fast check failed, so do the slower, accurate NUL check.	 */
> > > +	orr	tmp2, data1, REP8_7f
> > > +	orr	tmp4, data2, REP8_7f
> > > +	bics	has_nul1, tmp1, tmp2
> > > +	bic	has_nul2, tmp3, tmp4
> > > +	ccmp	has_nul2, 0, 0, eq
> > > +	beq	L(nonascii_loop)
> > > +
> > > +	/* Enter with C = has_nul1 == 0.  */
> > > +L(tail):
> > >  #ifdef __AARCH64EB__
> > >  	/* For big-endian, carry propagation (if the final byte in the
> > > -	   string is 0x01) means we cannot use has_nul directly.  The
> > > +	   string is 0x01) means we cannot use has_nul1/2 directly.  The
> > >  	   easiest way to get the correct byte is to byte-swap the data
> > >  	   and calculate the syndrome a second time.  */
> > > -	rev	data2, data2
> > > -	sub	tmp1, data2, zeroones
> > > -	orr	tmp2, data2, #REP8_7f
> > > -	bic	has_nul2, tmp1, tmp2
> > > +	csel	data1, data1, data2, cc
> > > +	rev	data1, data1
> > > +	sub	tmp1, data1, zeroones
> > > +	orr	tmp2, data1, REP8_7f
> > > +	bic	has_nul1, tmp1, tmp2
> > > +#else
> > > +	csel	has_nul1, has_nul1, has_nul2, cc
> > >  #endif
> > > -	sub	len, len, #8
> > > -	rev	has_nul2, has_nul2
> > > -	clz	pos, has_nul2
> > > -	add	len, len, pos, lsr #3		/* Bits to bytes.  */
> > > -	RET
> > > -
> > > -L(misaligned):
> > > -	cmp	tmp1, #8
> > > -	neg	tmp1, tmp1
> > > -	ldp	data1, data2, [src], #16
> > > -	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
> > > -	mov	tmp2, #~0
> > > +	sub	len, src, srcin
> > > +	rev	has_nul1, has_nul1
> > > +	add	tmp2, len, 8
> > > +	clz	tmp1, has_nul1
> > > +	csel	len, len, tmp2, cc
> > > +	add	len, len, tmp1, lsr 3
> > > +	ret
> > > +
> > > +L(nonascii_loop):
> > > +	ldp	data1, data2, [src, 16]!
> > > +	sub	tmp1, data1, zeroones
> > > +	orr	tmp2, data1, REP8_7f
> > > +	sub	tmp3, data2, zeroones
> > > +	orr	tmp4, data2, REP8_7f
> > > +	bics	has_nul1, tmp1, tmp2
> > > +	bic	has_nul2, tmp3, tmp4
> > > +	ccmp	has_nul2, 0, 0, eq
> > > +	bne	L(tail)
> > > +	ldp	data1, data2, [src, 16]!
> > > +	sub	tmp1, data1, zeroones
> > > +	orr	tmp2, data1, REP8_7f
> > > +	sub	tmp3, data2, zeroones
> > > +	orr	tmp4, data2, REP8_7f
> > > +	bics	has_nul1, tmp1, tmp2
> > > +	bic	has_nul2, tmp3, tmp4
> > > +	ccmp	has_nul2, 0, 0, eq
> > > +	beq	L(nonascii_loop)
> > > +	b	L(tail)
> > > +
> > > +	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
> > > +	   srcin to 0x7f, so we ignore any NUL bytes before the string.
> > > +	   Then continue in the aligned loop.  */
> > > +L(page_cross):
> > > +	bic	src, srcin, 15
> > > +	ldp	data1, data2, [src]
> > > +	lsl	tmp1, srcin, 3
> > > +	mov	tmp4, -1
> > >  #ifdef __AARCH64EB__
> > > -	/* Big-endian.  Early bytes are at MSB.  */
> > > -	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
> > > +	/* Big-endian.	Early bytes are at MSB.	 */
> > > +	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
> > >  #else
> > >  	/* Little-endian.  Early bytes are at LSB.  */
> > > -	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
> > > +	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
> > >  #endif
> > > -	orr	data1, data1, tmp2
> > > -	orr	data2a, data2, tmp2
> > > -	csinv	data1, data1, xzr, le
> > > -	csel	data2, data2, data2a, le
> > > -	b	L(realigned)
> > > +	orr	tmp1, tmp1, REP8_80
> > > +	orn	data1, data1, tmp1
> > > +	orn	tmp2, data2, tmp1
> > > +	tst	srcin, 8
> > > +	csel	data1, data1, tmp4, eq
> > > +	csel	data2, data2, tmp2, eq
> > > +	b	L(page_cross_entry)
> > >  END (strlen)
> > >  libc_hidden_builtin_def (strlen)
> > > --
> > > 1.9.1
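
As an aside, the two NUL checks described in the comments above, plus the
rev/clz step that turns a syndrome into a byte index, can be sketched in C
roughly as follows.  This is an illustration only, not code from the patch;
the helper names (has_nul_exact, has_nul_fast, first_nul_index) are invented
here, and first_nul_index covers the little-endian case only.

#include <stdint.h>
#include <stdio.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_7F 0x7f7f7f7f7f7f7f7fULL
#define REP8_80 0x8080808080808080ULL

/* Exact check: non-zero iff some byte of x is 0x00.
   (x - 1) & ~(x | 0x7f..7f) is equivalent to (x - 1) & ~x & 0x80..80.  */
static uint64_t
has_nul_exact (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7F);
}

/* Fast check: zero when every byte of x is non-NUL ASCII (0x01..0x7f);
   bytes 0x81..0xff can give false hits, which is why the patch falls
   back to the accurate check when this fires.  */
static uint64_t
has_nul_fast (uint64_t x)
{
  return (x - REP8_01) & REP8_80;
}

/* Little-endian only: byte index of the first NUL given a non-zero exact
   syndrome; mirrors the rev / clz / lsr #3 sequence in the tail code.  */
static unsigned
first_nul_index (uint64_t syndrome)
{
  return __builtin_clzll (__builtin_bswap64 (syndrome)) >> 3;
}

int
main (void)
{
  uint64_t ascii = 0x0000006f6c6c6568ULL;  /* "hello" plus NUL padding (LE).  */
  uint64_t high  = 0x41414141414141c3ULL;  /* No NUL, but one byte >= 0x80.  */

  printf ("%d %d %u\n", has_nul_exact (ascii) != 0, has_nul_fast (ascii) != 0,
	  first_nul_index (has_nul_exact (ascii)));	/* 1 1 5  */
  printf ("%d %d\n", has_nul_exact (high) != 0,
	  has_nul_fast (high) != 0);			/* 0 1 -- false hit  */
  return 0;
}

The assembly computes the same syndromes with sub/orr/bic(s), combines the
two 8-byte results with ccmp, and uses csel to pick the right word and
offset without branching.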




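A similar sketch of the page-cross test at the new entry point, which decides
whether the initial unaligned 16-byte ldp is safe (again not code from the
patch; page_cross and the example addresses are made up, and 4096 is the
minimum page size the patch assumes):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MIN_PAGE_SIZE 4096	/* Minimum page size assumed on AArch64.  */

/* True if an unaligned 16-byte load starting at s could cross a page
   boundary, i.e. fewer than 16 bytes remain before the end of a
   minimum-sized page.  Mirrors the and / cmp / b.gt at the entry.  */
static bool
page_cross (const char *s)
{
  return ((uintptr_t) s & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16;
}

int
main (void)
{
  /* Offset 4088 within a page: a 16-byte load would cross.  */
  printf ("%d\n", page_cross ((const char *) (uintptr_t) 4088));  /* 1  */
  /* Offset 4080: the last offset at which 16 bytes still fit.  */
  printf ("%d\n", page_cross ((const char *) (uintptr_t) 4080));  /* 0  */
  return 0;
}

When the test is true, the code instead loads 16 bytes from srcin & ~15,
which cannot cross a page, and forces the bytes that precede srcin to 0x7f
so any NUL bytes before the string are ignored, as the comment above
L(page_cross) explains.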