[PATCH, AArch64]: Optimize strlen

Richard Earnshaw rearnsha@arm.com
Fri Jan 16 16:56:00 GMT 2015


On 16/01/15 16:29, Wilco Dijkstra wrote:
> Optimize the strlen implementation by using a page cross check and a fast check
> for NUL bytes, falling back to a separate loop when a non-ASCII char is
> encountered.  Speedup on the cortex-strings test is ~10%, long ASCII strings
> are processed ~60% faster, and on random tests it is ~80% better.
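
The two word-at-a-time NUL checks the patch relies on (see the comments in
the diff below) can be sketched in C roughly as follows.  This is an
illustration only, not part of the patch, and the helper names are made up:

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL
#define REP8_80 0x8080808080808080ULL

/* Accurate check: (X - 1) & ~X & 0x80.. is non-zero iff some byte of X
   is zero.  This is the "full NUL check" used once a possible NUL or a
   non-ASCII byte has been seen.  */
static inline int
has_nul_accurate (uint64_t x)
{
  return ((x - REP8_01) & ~x & REP8_80) != 0;
}

/* Fast check: (X - 1) & 0x80.. is zero for every non-NUL ASCII byte but
   gives false hits for bytes 0x81..0xff, so a hit must be confirmed
   with the accurate check above.  */
static inline int
has_nul_fast (uint64_t x)
{
  return ((x - REP8_01) & REP8_80) != 0;
}

/* Byte index of the first NUL in a little-endian word that passed the
   accurate check; the patch computes the same thing with rev + clz so
   that big-endian can be handled as well.  */
static inline unsigned
first_nul_index (uint64_t x)
{
  return __builtin_ctzll ((x - REP8_01) & ~x & REP8_80) >> 3;
}

The main loop uses only the fast check (two ALU operations per 8 bytes) and
drops into the accurate check, or into the second loop for genuinely
non-ASCII data, only when it fires.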
> 
> OK for commit?
> 
> 

Please can you re-send with the patch in unified diff format, i.e. diff -u.

You also need a ChangeLog entry.

R.


> strlen.txt
> 
> 
> Index: strlen.S
> ===================================================================
> RCS file: /cvs/src/src/newlib/libc/machine/aarch64/strlen.S,v
> retrieving revision 1.1
> diff -r1.1 strlen.S
> 1c1
> < /* Copyright (c) 2013, Linaro Limited
> ---
>> /* Copyright (c) 2013-2015, Linaro Limited
> 7c7
> <          notice, this list of conditions and the following disclaimer.
> ---
>> 	 notice, this list of conditions and the following disclaimer.
> 9,10c9,10
> <          notice, this list of conditions and the following disclaimer in the
> <          documentation and/or other materials provided with the distribution.
> ---
>> 	 notice, this list of conditions and the following disclaimer in the
>> 	 documentation and/or other materials provided with the distribution.
> 12,13c12,13
> <          names of its contributors may be used to endorse or promote products
> <          derived from this software without specific prior written permission.
> ---
>> 	 names of its contributors may be used to endorse or promote products
>> 	 derived from this software without specific prior written permission.
> 33c33
> <  * ARMv8-a, AArch64
> ---
>>  * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
> 35a36,39
>> /* To test the page crossing code path more thoroughly, compile with
>>    -DTEST_PAGE_CROSS - this will force all calls through the slower
>>    entry path.  This option is not intended for production use.	 */
>>
> 44,52c48,56
> < #define data2a		x4
> < #define has_nul1	x5
> < #define has_nul2	x6
> < #define tmp1		x7
> < #define tmp2		x8
> < #define tmp3		x9
> < #define tmp4		x10
> < #define zeroones	x11
> < #define pos		x12
> ---
>> #define has_nul1	x4
>> #define has_nul2	x5
>> #define tmp1		x4
>> #define tmp2		x5
>> #define tmp3		x6
>> #define tmp4		x7
>> #define zeroones	x8
>>
>> #define L(l) .L ## l
> 61a66,71
>> 	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
>> 	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
>> 	   can be done in parallel across the entire word. A faster check
>> 	   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
>> 	   false hits for characters 129..255.	*/
>>
> 66c76,106
> < 	/* Start of critial section -- keep to one 64Byte cache line.  */
> ---
>> #ifdef TEST_PAGE_CROSS
>> # define MIN_PAGE_SIZE 15
>> #else
>> # define MIN_PAGE_SIZE 4096
>> #endif
>>
>> 	/* Since strings are short on average, we check the first 16 bytes
>> 	   of the string for a NUL character.  In order to do an unaligned ldp
>> 	   safely we have to do a page cross check first.  If there is a NUL
>> 	   byte we calculate the length from the 2 8-byte words using
>> 	   conditional select to reduce branch mispredictions (it is unlikely
>> 	   strlen will be repeatedly called on strings with the same length).
>>
>> 	   If the string is longer than 16 bytes, we align src so don't need
>> 	   further page cross checks, and process 32 bytes per iteration
>> 	   using the fast NUL check.  If we encounter non-ASCII characters,
>> 	   fallback to a second loop using the full NUL check.
>>
>> 	   If the page cross check fails, we read 16 bytes from an aligned
>> 	   address, remove any characters before the string, and continue
>> 	   in the main loop using aligned loads.  Since strings crossing a
>> 	   page in the first 16 bytes are rare (probability of
>> 	   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
>>
>> 	   AArch64 systems have a minimum page size of 4k.  We don't bother
>> 	   checking for larger page sizes - the cost of setting up the correct
>> 	   page size is just not worth the extra gain from a small reduction in
>> 	   the cases taking the slow path.  Note that we only care about
>> 	   whether the first fetch, which may be misaligned, crosses a page
>> 	   boundary.  */
>>
> 68,81c108,120
> < 	mov	zeroones, #REP8_01
> < 	bic	src, srcin, #15
> < 	ands	tmp1, srcin, #15
> < 	b.ne	.Lmisaligned
> < 	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> < 	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> < 	   can be done in parallel across the entire word.  */
> < 	/* The inner loop deals with two Dwords at a time.  This has a
> < 	   slightly higher start-up cost, but we should win quite quickly,
> < 	   especially on cores with a high number of issue slots per
> < 	   cycle, as we get much better parallelism out of the operations.  */
> < .Lloop:
> < 	ldp	data1, data2, [src], #16
> < .Lrealigned:
> ---
>> 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
>> 	mov	zeroones, REP8_01
>> 	cmp	tmp1, MIN_PAGE_SIZE - 16
>> 	b.gt	L(page_cross)
>> 	ldp	data1, data2, [srcin]
>> #ifdef __AARCH64EB__
>> 	/* For big-endian, carry propagation (if the final byte in the
>> 	   string is 0x01) means we cannot use has_nul1/2 directly.
>> 	   Since we expect strings to be small and early-exit,
>> 	   byte-swap the data now so has_null1/2 will be correct.  */
>> 	rev	data1, data1
>> 	rev	data2, data2
>> #endif
> 83c122
> < 	orr	tmp2, data1, #REP8_7f
> ---
>> 	orr	tmp2, data1, REP8_7f
> 85,90c124,137
> < 	orr	tmp4, data2, #REP8_7f
> < 	bic	has_nul1, tmp1, tmp2
> < 	bics	has_nul2, tmp3, tmp4
> < 	ccmp	has_nul1, #0, #0, eq	/* NZCV = 0000  */
> < 	b.eq	.Lloop
> < 	/* End of critical section -- keep to one 64Byte cache line.  */
> ---
>> 	orr	tmp4, data2, REP8_7f
>> 	bics	has_nul1, tmp1, tmp2
>> 	bic	has_nul2, tmp3, tmp4
>> 	ccmp	has_nul2, 0, 0, eq
>> 	beq	L(main_loop_entry)
>>
>> 	/* Enter with C = has_nul1 == 0.  */
>> 	csel	has_nul1, has_nul1, has_nul2, cc
>> 	mov	len, 8
>> 	rev	has_nul1, has_nul1
>> 	clz	tmp1, has_nul1
>> 	csel	len, xzr, len, cc
>> 	add	len, len, tmp1, lsr 3
>> 	ret
> 92,99c139,171
> < 	sub	len, src, srcin
> < 	cbz	has_nul1, .Lnul_in_data2
> < #ifdef __AARCH64EB__
> < 	mov	data2, data1
> < #endif
> < 	sub	len, len, #8
> < 	mov	has_nul2, has_nul1
> < .Lnul_in_data2:
> ---
>> 	/* The inner loop processes 32 bytes per iteration and uses the fast
>> 	   NUL check.  If we encounter non-ASCII characters, use a second
>> 	   loop with the accurate NUL check.  */
>> 	.p2align 4
>> L(main_loop_entry):
>> 	bic	src, srcin, 15
>> 	sub	src, src, 16
>> L(main_loop):
>> 	ldp	data1, data2, [src, 32]!
>> .Lpage_cross_entry:
>> 	sub	tmp1, data1, zeroones
>> 	sub	tmp3, data2, zeroones
>> 	orr	tmp2, tmp1, tmp3
>> 	tst	tmp2, zeroones, lsl 7
>> 	bne	1f
>> 	ldp	data1, data2, [src, 16]
>> 	sub	tmp1, data1, zeroones
>> 	sub	tmp3, data2, zeroones
>> 	orr	tmp2, tmp1, tmp3
>> 	tst	tmp2, zeroones, lsl 7
>> 	beq	L(main_loop)
>> 	add	src, src, 16
>> 1:
>> 	/* The fast check failed, so do the slower, accurate NUL check.	 */
>> 	orr	tmp2, data1, REP8_7f
>> 	orr	tmp4, data2, REP8_7f
>> 	bics	has_nul1, tmp1, tmp2
>> 	bic	has_nul2, tmp3, tmp4
>> 	ccmp	has_nul2, 0, 0, eq
>> 	beq	L(nonascii_loop)
>>
>> 	/* Enter with C = has_nul1 == 0.  */
>> L(tail):
> 102c174
> < 	   string is 0x01) means we cannot use has_nul directly.  The
> ---
>> 	   string is 0x01) means we cannot use has_nul1/2 directly.  The
> 105,108c177,183
> < 	rev	data2, data2
> < 	sub	tmp1, data2, zeroones
> < 	orr	tmp2, data2, #REP8_7f
> < 	bic	has_nul2, tmp1, tmp2
> ---
>> 	csel	data1, data1, data2, cc
>> 	rev	data1, data1
>> 	sub	tmp1, data1, zeroones
>> 	orr	tmp2, data1, REP8_7f
>> 	bic	has_nul1, tmp1, tmp2
>> #else
>> 	csel	has_nul1, has_nul1, has_nul2, cc
> 110,113c185,190
> < 	sub	len, len, #8
> < 	rev	has_nul2, has_nul2
> < 	clz	pos, has_nul2
> < 	add	len, len, pos, lsr #3		/* Bits to bytes.  */
> ---
>> 	sub	len, src, srcin
>> 	rev	has_nul1, has_nul1
>> 	add	tmp2, len, 8
>> 	clz	tmp1, has_nul1
>> 	csel	len, len, tmp2, cc
>> 	add	len, len, tmp1, lsr 3
> 116,121c193,221
> < .Lmisaligned:
> < 	cmp	tmp1, #8
> < 	neg	tmp1, tmp1
> < 	ldp	data1, data2, [src], #16
> < 	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
> < 	mov	tmp2, #~0
> ---
>> L(nonascii_loop):
>> 	ldp	data1, data2, [src, 16]!
>> 	sub	tmp1, data1, zeroones
>> 	orr	tmp2, data1, REP8_7f
>> 	sub	tmp3, data2, zeroones
>> 	orr	tmp4, data2, REP8_7f
>> 	bics	has_nul1, tmp1, tmp2
>> 	bic	has_nul2, tmp3, tmp4
>> 	ccmp	has_nul2, 0, 0, eq
>> 	bne	L(tail)
>> 	ldp	data1, data2, [src, 16]!
>> 	sub	tmp1, data1, zeroones
>> 	orr	tmp2, data1, REP8_7f
>> 	sub	tmp3, data2, zeroones
>> 	orr	tmp4, data2, REP8_7f
>> 	bics	has_nul1, tmp1, tmp2
>> 	bic	has_nul2, tmp3, tmp4
>> 	ccmp	has_nul2, 0, 0, eq
>> 	beq	L(nonascii_loop)
>> 	b	L(tail)
>>
>> 	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
>> 	   srcin to 0x7f, so we ignore any NUL bytes before the string.
>> 	   Then continue in the aligned loop.  */
>> L(page_cross):
>> 	bic	src, srcin, 15
>> 	ldp	data1, data2, [src]
>> 	lsl	tmp1, srcin, 3
>> 	mov	tmp4, -1
> 123,124c223,224
> < 	/* Big-endian.  Early bytes are at MSB.  */
> < 	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
> ---
>> 	/* Big-endian.	Early bytes are at MSB.	 */
>> 	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
> 127c227
> < 	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
> ---
>> 	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63).  */
> 129,133c229,235
> < 	orr	data1, data1, tmp2
> < 	orr	data2a, data2, tmp2
> < 	csinv	data1, data1, xzr, le
> < 	csel	data2, data2, data2a, le
> < 	b	.Lrealigned
> ---
>> 	orr	tmp1, tmp1, REP8_80
>> 	orn	data1, data1, tmp1
>> 	orn	tmp2, data2, tmp1
>> 	tst	srcin, 8
>> 	csel	data1, data1, tmp4, eq
>> 	csel	data2, data2, tmp2, eq
>> 	b	L(page_cross_entry)
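
For reference, the page-cross handling described in the comments above comes
down to something like the following C sketch (illustrative only; the names
and the byte-level loop are not from the patch, which does the masking
word-at-a-time with orn/csel):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define MIN_PAGE_SIZE 4096

/* An unaligned 16-byte load from srcin is only unsafe if it could run
   past the end of a (minimum-size, 4k) page.  */
static int
may_cross_page (const char *srcin)
{
  return ((uintptr_t) srcin & (MIN_PAGE_SIZE - 1)) > MIN_PAGE_SIZE - 16;
}

/* Slow path: load the aligned 16-byte block containing srcin (an aligned
   load cannot cross a page) and force every byte before srcin to 0x7f,
   so the NUL checks cannot match a NUL that lives before the string.  */
static void
load_first_block (const char *srcin, unsigned char block[16])
{
  const char *aligned = (const char *) ((uintptr_t) srcin & ~(uintptr_t) 15);

  memcpy (block, aligned, 16);
  for (size_t i = 0; i < (size_t) (srcin - aligned); i++)
    block[i] = 0x7f;
}

With a 4k minimum page size the slow path is taken for roughly 16/4096
(~0.4%) of starting addresses, so it does not need to be fast; after the
first 16 bytes the loop continues with aligned loads and needs no further
page checks.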



