This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


RE: [PATCH][AArch64] Optimize strlen


ping

> -----Original Message-----
> From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
> Sent: 16 January 2015 17:39
> To: libc-alpha@sourceware.org
> Subject: [PATCH][AArch64] Optimize strlen
> 
> Optimize the strlen implementation by using a page-cross check and a fast check
> for NUL bytes, falling back to a separate loop when a non-ASCII character is
> encountered.  The speedup on test-strlen is ~10%, long ASCII strings are
> processed ~60% faster, and random tests show an ~80% improvement.
> 
> ChangeLog:
> 
> 2015-01-16  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> 	* sysdeps/aarch64/strlen.S (strlen): Optimize strlen.
> 
> ---
>  sysdeps/aarch64/strlen.S | 230 +++++++++++++++++++++++++++++++++--------------
>  1 file changed, 165 insertions(+), 65 deletions(-)
> 
> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> index 54271b9..7ca47d9 100644
> --- a/sysdeps/aarch64/strlen.S
> +++ b/sysdeps/aarch64/strlen.S
> @@ -20,9 +20,13 @@
> 
>  /* Assumptions:
>   *
> - * ARMv8-a, AArch64
> + * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
>   */
> 
> +/* To test the page crossing code path more thoroughly, compile with
> +   -DTEST_PAGE_CROSS - this will force all calls through the slower
> +   entry path.  This option is not intended for production use.  */
> +
>  /* Arguments and results.  */
>  #define srcin		x0
>  #define len		x0
> @@ -31,87 +35,183 @@
>  #define src		x1
>  #define data1		x2
>  #define data2		x3
> -#define data2a		x4
> -#define has_nul1	x5
> -#define has_nul2	x6
> -#define tmp1		x7
> -#define tmp2		x8
> -#define tmp3		x9
> -#define tmp4		x10
> -#define zeroones	x11
> -#define pos		x12
> +#define has_nul1	x4
> +#define has_nul2	x5
> +#define tmp1		x4
> +#define tmp2		x5
> +#define tmp3		x6
> +#define tmp4		x7
> +#define zeroones	x8
> +
> +	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> +	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> +	   can be done in parallel across the entire word. A faster check
> +	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
> +	   false hits for characters 129..255.	*/
> 
>  #define REP8_01 0x0101010101010101
>  #define REP8_7f 0x7f7f7f7f7f7f7f7f
>  #define REP8_80 0x8080808080808080
> 
> -	/* Start of critial section -- keep to one 64Byte cache line.  */
> +#ifdef TEST_PAGE_CROSS
> +# define MIN_PAGE_SIZE 15
> +#else
> +# define MIN_PAGE_SIZE 4096
> +#endif
> +
> +	/* Since strings are short on average, we check the first 16 bytes
> +	   of the string for a NUL character.  In order to do an unaligned ldp
> +	   safely we have to do a page cross check first.  If there is a NUL
> +	   byte we calculate the length from the 2 8-byte words using
> +	   conditional select to reduce branch mispredictions (it is unlikely
> +	   strlen will be repeatedly called on strings with the same length).
> +
> +	   If the string is longer than 16 bytes, we align src so we don't need
> +	   further page cross checks, and process 32 bytes per iteration
> +	   using the fast NUL check.  If we encounter non-ASCII characters,
> +	   we fall back to a second loop using the full NUL check.
> +
> +	   If the page cross check fails, we read 16 bytes from an aligned
> +	   address, remove any characters before the string, and continue
> +	   in the main loop using aligned loads.  Since strings crossing a
> +	   page in the first 16 bytes are rare (probability of
> +	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
> +
> +	   AArch64 systems have a minimum page size of 4k.  We don't bother
> +	   checking for larger page sizes - the cost of setting up the correct
> +	   page size is just not worth the extra gain from a small reduction in
> +	   the cases taking the slow path.  Note that we only care about
> +	   whether the first fetch, which may be misaligned, crosses a page
> +	   boundary.  */
> +
>  ENTRY_ALIGN (strlen, 6)
> -	mov	zeroones, #REP8_01
> -	bic	src, srcin, #15
> -	ands	tmp1, srcin, #15
> -	b.ne	L(misaligned)
> -	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> -	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> -	   can be done in parallel across the entire word.  */
> -	/* The inner loop deals with two Dwords at a time.  This has a
> -	   slightly higher start-up cost, but we should win quite quickly,
> -	   especially on cores with a high number of issue slots per
> -	   cycle, as we get much better parallelism out of the operations.  */
> -L(loop):
> -	ldp	data1, data2, [src], #16
> -L(realigned):
> +	and	tmp1, srcin, MIN_PAGE_SIZE - 1
> +	mov	zeroones, REP8_01
> +	cmp	tmp1, MIN_PAGE_SIZE - 16
> +	b.gt	L(page_cross)
> +	ldp	data1, data2, [srcin]
> +#ifdef __AARCH64EB__
> +	/* For big-endian, carry propagation (if the final byte in the
> +	   string is 0x01) means we cannot use has_nul1/2 directly.
> +	   Since we expect strings to be small and early-exit,
> +	   byte-swap the data now so has_nul1/2 will be correct.  */
> +	rev	data1, data1
> +	rev	data2, data2
> +#endif
>  	sub	tmp1, data1, zeroones
> -	orr	tmp2, data1, #REP8_7f
> +	orr	tmp2, data1, REP8_7f
>  	sub	tmp3, data2, zeroones
> -	orr	tmp4, data2, #REP8_7f
> -	bic	has_nul1, tmp1, tmp2
> -	bics	has_nul2, tmp3, tmp4
> -	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
> -	b.eq	L(loop)
> -	/* End of critical section -- keep to one 64Byte cache line.  */
> +	orr	tmp4, data2, REP8_7f
> +	bics	has_nul1, tmp1, tmp2
> +	bic	has_nul2, tmp3, tmp4
> +	ccmp	has_nul2, 0, 0, eq
> +	beq	L(main_loop_entry)
> 
> -	sub	len, src, srcin
> -	cbz	has_nul1, L(nul_in_data2)
> -#ifdef __AARCH64EB__
> -	mov	data2, data1
> -#endif
> -	sub	len, len, #8
> -	mov	has_nul2, has_nul1
> -L(nul_in_data2):
> +	/* Enter with C = has_nul1 == 0.  */
> +	csel	has_nul1, has_nul1, has_nul2, cc
> +	mov	len, 8
> +	rev	has_nul1, has_nul1
> +	clz	tmp1, has_nul1
> +	csel	len, xzr, len, cc
> +	add	len, len, tmp1, lsr 3
> +	ret
> +
> +	/* The inner loop processes 32 bytes per iteration and uses the fast
> +	   NUL check.  If we encounter non-ASCII characters, use a second
> +	   loop with the accurate NUL check.  */
> +	.p2align 4
> +L(main_loop_entry):
> +	bic	src, srcin, 15
> +	sub	src, src, 16
> +L(main_loop):
> +	ldp	data1, data2, [src, 32]!
> +L(page_cross_entry):
> +	sub	tmp1, data1, zeroones
> +	sub	tmp3, data2, zeroones
> +	orr	tmp2, tmp1, tmp3
> +	tst	tmp2, zeroones, lsl 7
> +	bne	1f
> +	ldp	data1, data2, [src, 16]
> +	sub	tmp1, data1, zeroones
> +	sub	tmp3, data2, zeroones
> +	orr	tmp2, tmp1, tmp3
> +	tst	tmp2, zeroones, lsl 7
> +	beq	L(main_loop)
> +	add	src, src, 16
> +1:
> +	/* The fast check failed, so do the slower, accurate NUL check.	 */
> +	orr	tmp2, data1, REP8_7f
> +	orr	tmp4, data2, REP8_7f
> +	bics	has_nul1, tmp1, tmp2
> +	bic	has_nul2, tmp3, tmp4
> +	ccmp	has_nul2, 0, 0, eq
> +	beq	L(nonascii_loop)
> +
> +	/* Enter with C = has_nul1 == 0.  */
> +L(tail):
>  #ifdef __AARCH64EB__
>  	/* For big-endian, carry propagation (if the final byte in the
> -	   string is 0x01) means we cannot use has_nul directly.  The
> +	   string is 0x01) means we cannot use has_nul1/2 directly.  The
>  	   easiest way to get the correct byte is to byte-swap the data
>  	   and calculate the syndrome a second time.  */
> -	rev	data2, data2
> -	sub	tmp1, data2, zeroones
> -	orr	tmp2, data2, #REP8_7f
> -	bic	has_nul2, tmp1, tmp2
> +	csel	data1, data1, data2, cc
> +	rev	data1, data1
> +	sub	tmp1, data1, zeroones
> +	orr	tmp2, data1, REP8_7f
> +	bic	has_nul1, tmp1, tmp2
> +#else
> +	csel	has_nul1, has_nul1, has_nul2, cc
>  #endif
> -	sub	len, len, #8
> -	rev	has_nul2, has_nul2
> -	clz	pos, has_nul2
> -	add	len, len, pos, lsr #3		/* Bits to bytes.  */
> -	RET
> -
> -L(misaligned):
> -	cmp	tmp1, #8
> -	neg	tmp1, tmp1
> -	ldp	data1, data2, [src], #16
> -	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
> -	mov	tmp2, #~0
> +	sub	len, src, srcin
> +	rev	has_nul1, has_nul1
> +	add	tmp2, len, 8
> +	clz	tmp1, has_nul1
> +	csel	len, len, tmp2, cc
> +	add	len, len, tmp1, lsr 3
> +	ret
> +
> +L(nonascii_loop):
> +	ldp	data1, data2, [src, 16]!
> +	sub	tmp1, data1, zeroones
> +	orr	tmp2, data1, REP8_7f
> +	sub	tmp3, data2, zeroones
> +	orr	tmp4, data2, REP8_7f
> +	bics	has_nul1, tmp1, tmp2
> +	bic	has_nul2, tmp3, tmp4
> +	ccmp	has_nul2, 0, 0, eq
> +	bne	L(tail)
> +	ldp	data1, data2, [src, 16]!
> +	sub	tmp1, data1, zeroones
> +	orr	tmp2, data1, REP8_7f
> +	sub	tmp3, data2, zeroones
> +	orr	tmp4, data2, REP8_7f
> +	bics	has_nul1, tmp1, tmp2
> +	bic	has_nul2, tmp3, tmp4
> +	ccmp	has_nul2, 0, 0, eq
> +	beq	L(nonascii_loop)
> +	b	L(tail)
> +
> +	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
> +	   srcin to 0x7f, so we ignore any NUL bytes before the string.
> +	   Then continue in the aligned loop.  */
> +L(page_cross):
> +	bic	src, srcin, 15
> +	ldp	data1, data2, [src]
> +	lsl	tmp1, srcin, 3
> +	mov	tmp4, -1
>  #ifdef __AARCH64EB__
> -	/* Big-endian.  Early bytes are at MSB.  */
> -	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
> +	/* Big-endian.	Early bytes are at MSB.	 */
> +	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
>  #else
>  	/* Little-endian.  Early bytes are at LSB.  */
> -	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
> +	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
>  #endif
> -	orr	data1, data1, tmp2
> -	orr	data2a, data2, tmp2
> -	csinv	data1, data1, xzr, le
> -	csel	data2, data2, data2a, le
> -	b	L(realigned)
> +	orr	tmp1, tmp1, REP8_80
> +	orn	data1, data1, tmp1
> +	orn	tmp2, data2, tmp1
> +	tst	srcin, 8
> +	csel	data1, data1, tmp4, eq
> +	csel	data2, data2, tmp2, eq
> +	b	L(page_cross_entry)
>  END (strlen)
>  libc_hidden_builtin_def (strlen)
> --
> 1.9.1

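The word-at-a-time NUL detection the patch describes can be written as a minimal
C sketch (the helper names are illustrative, not part of the patch):

  #include <stdint.h>

  #define REP8_01 0x0101010101010101ULL
  #define REP8_7f 0x7f7f7f7f7f7f7f7fULL
  #define REP8_80 0x8080808080808080ULL

  /* Exact syndrome: non-zero iff one of the 8 bytes of x is zero.  The
     syndrome byte for the first zero byte (lowest byte, little-endian)
     is exactly 0x80, which is what the rev + clz sequence relies on.  */
  static inline uint64_t
  has_nul_exact (uint64_t x)
  {
    return (x - REP8_01) & ~x & REP8_80;
  }

  /* Fast check: zero for words containing only non-NUL ASCII bytes, but
     bytes 0x81..0xff give false hits (e.g. 0x81 - 1 = 0x80 keeps the top
     bit set), hence the separate non-ASCII loop in the patch.  */
  static inline uint64_t
  has_nul_fast (uint64_t x)
  {
    return (x - REP8_01) & REP8_80;
  }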

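The unaligned 16-byte first fetch is only done when it cannot run into the next
page; the entry check in the patch amounts to the following test, assuming the
4 KiB minimum page size documented in the comments:

  #include <stdbool.h>
  #include <stdint.h>

  #define MIN_PAGE_SIZE 4096

  /* True when a 16-byte load starting at addr could cross into the next
     page, i.e. fewer than 16 bytes remain before the page boundary.  */
  static inline bool
  crosses_page (uintptr_t addr)
  {
    return (addr & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16;
  }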

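When a NUL is found in the first 16 bytes, the length is computed without
branching on which of the two words contained it; a little-endian C sketch of
that logic, with __builtin_ctzll standing in for the rev + clz sequence:

  #include <stddef.h>
  #include <stdint.h>

  /* has_nul1/has_nul2 are the exact syndromes of the first and second
     8-byte word; at least one of them is non-zero here.  */
  static inline size_t
  len_from_syndromes (uint64_t has_nul1, uint64_t has_nul2)
  {
    uint64_t syn  = has_nul1 ? has_nul1 : has_nul2;  /* csel in the patch  */
    size_t   base = has_nul1 ? 0 : 8;                /* csel len, xzr, len */
    /* The syndrome bit of the first NUL sits within its byte, so dividing
       the bit index by 8 yields the byte index within the word.  */
    return base + (__builtin_ctzll (syn) >> 3);
  }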

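In the page-cross path the fetch is done from the preceding aligned address
instead, and the bytes before the start of the string are forced to 0x7f so
they can never be reported as NUL.  A little-endian sketch of that masking for
one 8-byte word (nbytes_before corresponds to srcin & 7; the tst srcin, 8 /
csel pair in the patch then selects which of the two loaded words it applies
to):

  #include <stdint.h>

  /* OR 0x7f into every byte that lies before the start of the string in
     an aligned 8-byte word loaded from just below srcin.  */
  static inline uint64_t
  ignore_bytes_before (uint64_t data, unsigned nbytes_before)
  {
    uint64_t keep = ~0ULL << (nbytes_before * 8); /* bytes of the string */
    return data | (~keep & 0x7f7f7f7f7f7f7f7fULL);
  }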
