From: Wilco Dijkstra
Date: Thu, 1 Jul 2021 14:30:42 +0000 (+0100)
Subject: AArch64: Improve strnlen performance
X-Git-Tag: glibc-2.34~164
X-Git-Url: https://sourceware.org/git/?a=commitdiff_plain;h=252cad02d4c63540501b9b8c988cb91248563224;p=glibc.git

AArch64: Improve strnlen performance

Optimize strnlen by avoiding UMINV which is slow on most cores. On
Neoverse N1 large strings are 1.8x faster than the current version,
and bench-strnlen is 50% faster overall. This version is MTE compatible.

Reviewed-by: Szabolcs Nagy
---

diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 2b57575c55..37e9eed412 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -22,197 +22,105 @@
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
  */
 
-/* Arguments and results. */
 #define srcin	x0
-#define len	x0
-#define limit	x1
+#define cntin	x1
+#define result	x0
 
-/* Locals and temporaries. */
 #define src	x2
-#define data1	x3
-#define data2	x4
-#define data2a	x5
-#define has_nul1	x6
-#define has_nul2	x7
-#define tmp1	x8
-#define tmp2	x9
-#define tmp3	x10
-#define tmp4	x11
-#define zeroones	x12
-#define pos	x13
-#define limit_wd	x14
-
-#define dataq	q2
-#define datav	v2
-#define datab2	b3
-#define dataq2	q3
-#define datav2	v3
-#define REP8_01 0x0101010101010101
-#define REP8_7f 0x7f7f7f7f7f7f7f7f
-#define REP8_80 0x8080808080808080
-
-ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9)
+#define synd	x3
+#define shift	x4
+#define wtmp	w4
+#define tmp	x4
+#define cntrem	x5
+
+#define qdata	q0
+#define vdata	v0
+#define vhas_chr	v1
+#define vrepmask	v2
+#define vend	v3
+#define dend	d3
+
+/*
+   Core algorithm:
+
+   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+   set likewise for odd bytes so that adjacent bytes can be merged. Since the
+   bits in the syndrome reflect the order in which things occur in the original
+   string, counting trailing zeros identifies exactly which byte matched.  */
+
+ENTRY (__strnlen)
 	PTR_ARG (0)
 	SIZE_ARG (1)
-	cbz	limit, L(hit_limit)
-	mov	zeroones, #REP8_01
-	bic	src, srcin, #15
-	ands	tmp1, srcin, #15
-	b.ne	L(misaligned)
-	/* Calculate the number of full and partial words -1.  */
-	sub	limit_wd, limit, #1 /* Limit != 0, so no underflow.  */
-	lsr	limit_wd, limit_wd, #4 /* Convert to Qwords.  */
-
-	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
-	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
-	   can be done in parallel across the entire word.  */
-	/* The inner loop deals with two Dwords at a time. This has a
-	   slightly higher start-up cost, but we should win quite quickly,
-	   especially on cores with a high number of issue slots per
-	   cycle, as we get much better parallelism out of the operations.  */
-
-	/* Start of critial section -- keep to one 64Byte cache line.  */
-
-	ldp	data1, data2, [src], #16
-L(realigned):
-	sub	tmp1, data1, zeroones
-	orr	tmp2, data1, #REP8_7f
-	sub	tmp3, data2, zeroones
-	orr	tmp4, data2, #REP8_7f
-	bic	has_nul1, tmp1, tmp2
-	bic	has_nul2, tmp3, tmp4
-	subs	limit_wd, limit_wd, #1
-	orr	tmp1, has_nul1, has_nul2
-	ccmp	tmp1, #0, #0, pl	/* NZCV = 0000  */
-	b.eq	L(loop)
-	/* End of critical section -- keep to one 64Byte cache line.  */
-
-	orr	tmp1, has_nul1, has_nul2
-	cbz	tmp1, L(hit_limit)	/* No null in final Qword.  */
-
-	/* We know there's a null in the final Qword. The easiest thing
-	   to do now is work out the length of the string and return
-	   MIN (len, limit).  */
-
-	sub	len, src, srcin
-	cbz	has_nul1, L(nul_in_data2)
-#ifdef __AARCH64EB__
-	mov	data2, data1
+	bic	src, srcin, 15
+	mov	wtmp, 0xf00f
+	cbz	cntin, L(nomatch)
+	ld1	{vdata.16b}, [src], 16
+	dup	vrepmask.8h, wtmp
+	cmeq	vhas_chr.16b, vdata.16b, 0
+	lsl	shift, srcin, 2
+	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
+	fmov	synd, dend
+	lsr	synd, synd, shift
+	cbz	synd, L(start_loop)
+L(finish):
+	rbit	synd, synd
+	clz	synd, synd
+	lsr	result, synd, 2
+	cmp	cntin, result
+	csel	result, cntin, result, ls
+	ret
+
+L(start_loop):
+	sub	tmp, src, srcin
+	subs	cntrem, cntin, tmp
+	b.ls	L(nomatch)
+
+	/* Make sure that it won't overread by a 16-byte chunk */
+	add	tmp, cntrem, 15
+	tbnz	tmp, 4, L(loop32_2)
+
+	.p2align 5
+L(loop32):
+	ldr	qdata, [src], 16
+	cmeq	vhas_chr.16b, vdata.16b, 0
+	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
+	fmov	synd, dend
+	cbnz	synd, L(end)
+L(loop32_2):
+	ldr	qdata, [src], 16
+	subs	cntrem, cntrem, 32
+	cmeq	vhas_chr.16b, vdata.16b, 0
+	b.ls	L(end)
+	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
+	fmov	synd, dend
+	cbz	synd, L(loop32)
+
+L(end):
+	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr.16b, vhas_chr.16b	/* 128->64 */
+	sub	src, src, 16
+	mov	synd, vend.d[0]
+	sub	result, src, srcin
+#ifndef __AARCH64EB__
+	rbit	synd, synd
 #endif
-	sub	len, len, #8
-	mov	has_nul2, has_nul1
-L(nul_in_data2):
-#ifdef __AARCH64EB__
-	/* For big-endian, carry propagation (if the final byte in the
-	   string is 0x01) means we cannot use has_nul directly. The
-	   easiest way to get the correct byte is to byte-swap the data
-	   and calculate the syndrome a second time.  */
-	rev	data2, data2
-	sub	tmp1, data2, zeroones
-	orr	tmp2, data2, #REP8_7f
-	bic	has_nul2, tmp1, tmp2
-#endif
-	sub	len, len, #8
-	rev	has_nul2, has_nul2
-	clz	pos, has_nul2
-	add	len, len, pos, lsr #3	/* Bits to bytes.  */
-	cmp	len, limit
-	csel	len, len, limit, ls	/* Return the lower value.  */
-	RET
-
-L(loop):
-	ldr	dataq, [src], #16
-	uminv	datab2, datav.16b
-	mov	tmp1, datav2.d[0]
-	subs	limit_wd, limit_wd, #1
-	ccmp	tmp1, #0, #4, pl	/* NZCV = 0000  */
-	b.eq	L(loop_end)
-	ldr	dataq, [src], #16
-	uminv	datab2, datav.16b
-	mov	tmp1, datav2.d[0]
-	subs	limit_wd, limit_wd, #1
-	ccmp	tmp1, #0, #4, pl	/* NZCV = 0000  */
-	b.ne	L(loop)
-L(loop_end):
-	/* End of critical section -- keep to one 64Byte cache line.  */
-
-	cbnz	tmp1, L(hit_limit)	/* No null in final Qword.  */
-
-	/* We know there's a null in the final Qword. The easiest thing
-	   to do now is work out the length of the string and return
-	   MIN (len, limit).  */
-
-#ifdef __AARCH64EB__
-	rev64	datav.16b, datav.16b
-#endif
-	/* Set te NULL byte as 0xff and the rest as 0x00, move the data into a
-	   pair of scalars and then compute the length from the earliest NULL
-	   byte.  */
-
-	cmeq	datav.16b, datav.16b, #0
-#ifdef __AARCH64EB__
-	mov	data1, datav.d[1]
-	mov	data2, datav.d[0]
-#else
-	mov	data1, datav.d[0]
-	mov	data2, datav.d[1]
-#endif
-	cmp	data1, 0
-	csel	data1, data1, data2, ne
-	sub	len, src, srcin
-	sub	len, len, #16
-	rev	data1, data1
-	add	tmp2, len, 8
-	clz	tmp1, data1
-	csel	len, len, tmp2, ne
-	add	len, len, tmp1, lsr 3
-	cmp	len, limit
-	csel	len, len, limit, ls	/* Return the lower value.  */
-	RET
-
-L(misaligned):
-	/* Deal with a partial first word.
-	   We're doing two things in parallel here;
-	   1) Calculate the number of words (but avoiding overflow if
-	   limit is near ULONG_MAX) - to do this we need to work out
-	   limit + tmp1 - 1 as a 65-bit value before shifting it;
-	   2) Load and mask the initial data words - we force the bytes
-	   before the ones we are interested in to 0xff - this ensures
-	   early bytes will not hit any zero detection.  */
-	sub	limit_wd, limit, #1
-	neg	tmp4, tmp1
-	cmp	tmp1, #8
-
-	and	tmp3, limit_wd, #15
-	lsr	limit_wd, limit_wd, #4
-	mov	tmp2, #~0
-
-	ldp	data1, data2, [src], #16
-	lsl	tmp4, tmp4, #3	/* Bytes beyond alignment -> bits.  */
-	add	tmp3, tmp3, tmp1
-
-#ifdef __AARCH64EB__
-	/* Big-endian. Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp4	/* Shift (tmp1 & 63).  */
-#else
-	/* Little-endian. Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp4	/* Shift (tmp1 & 63).  */
-#endif
-	add	limit_wd, limit_wd, tmp3, lsr #4
-
-	orr	data1, data1, tmp2
-	orr	data2a, data2, tmp2
+	clz	synd, synd
+	add	result, result, synd, lsr 2
+	cmp	cntin, result
+	csel	result, cntin, result, ls
+	ret
 
-	csinv	data1, data1, xzr, le
-	csel	data2, data2, data2a, le
-	b	L(realigned)
+L(nomatch):
+	mov	result, cntin
+	ret
 
-L(hit_limit):
-	mov	len, limit
-	RET
 END (__strnlen)
 libc_hidden_def (__strnlen)
 weak_alias (__strnlen, strnlen)
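
The "Core algorithm" comment above can also be illustrated with C NEON intrinsics.
The sketch below is not part of the commit and is not how glibc builds strnlen; it
is a little-endian-only model of the same 4-bits-per-byte syndrome. The names
first_nul_in_chunk and strnlen_sketch are made up for the example, and the driver
loop glosses over the alignment handling that lets the assembly read whole 16-byte
chunks safely.

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* Index of the first NUL byte in a 16-byte chunk, or 16 if there is none.
   Little-endian only; mirrors the cmeq/and/addp/rbit/clz sequence above.  */
static size_t
first_nul_in_chunk (const uint8_t *chunk)
{
  uint8x16_t data = vld1q_u8 (chunk);
  uint8x16_t nul = vceqzq_u8 (data);	/* 0xff where the byte is NUL.  */
  /* Keep the low nibble of even bytes and the high nibble of odd bytes
     (0xf00f per 16-bit lane), then pairwise-add adjacent bytes so each
     input byte contributes exactly 4 bits of the 64-bit syndrome.  */
  uint8x16_t mask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
  uint8x16_t merged = vpaddq_u8 (vandq_u8 (nul, mask), vandq_u8 (nul, mask));
  uint64_t synd = vgetq_lane_u64 (vreinterpretq_u64_u8 (merged), 0);
  if (synd == 0)
    return 16;
  return (size_t) __builtin_ctzll (synd) >> 2;	/* bit index / 4 = byte.  */
}

/* Driver loop. Unlike the assembly it simply assumes S may be read in whole
   16-byte chunks, so it only models the length computation.  */
size_t
strnlen_sketch (const char *s, size_t maxlen)
{
  size_t len = 0;
  while (len < maxlen)
    {
      size_t i = first_nul_in_chunk ((const uint8_t *) s + len);
      if (i < 16)
	{
	  len += i;
	  break;
	}
      len += 16;
    }
  return len < maxlen ? len : maxlen;
}

Note how this relates to the commit message: the old inner loop detected a NUL
with UMINV, whereas the new hot path (L(loop32)/L(loop32_2)) only checks whether
a chunk contains a NUL at all using UMAXP on the comparison result, and the full
0xf00f mask plus ADDP syndrome is recomputed once, in L(end), after a match is
found.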