[PATCH][AArch64] Cleanup memset

Wed Feb 26 16:12:00 GMT 2020

Cleanup memset.  Remove unnecessary code for unused ZVA sizes.
For zero memsets it's faster use DC ZVA for >= 160 bytes rather
than 256. Add a define which allows skipping the ZVA size test when
the ZVA size is known to be 64 - reading dczid_el0 may be expensive.
This simplifies the Falkor memset implementation.

--

diff --git a/sysdeps/aarch64/memset-reg.h b/sysdeps/aarch64/memset-reg.h
index 872a6b511de2bb97939e644c78676a6212c65d6d..22da7381e68bdfaa6086e39cc980c77150242d21 100644
--- a/sysdeps/aarch64/memset-reg.h
+++ b/sysdeps/aarch64/memset-reg.h
@@ -22,9 +22,4 @@
 #define count	x2
 #define dst	x3
 #define dstend	x4
-#define tmp1	x5
-#define tmp1w	w5
-#define tmp2	x6
-#define tmp2w	w6
-#define zva_len x7
-#define zva_lenw w7
+#define zva_val x5
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index ac577f1660e2706e2024e42d0efc09120c041af2..4654732e1bdd4a7e8f673b79ed66be408bed726b 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -78,114 +78,49 @@ L(set96):
 	stp	q0, q0, [dstend, -32]
 	ret
 
-	.p2align 3
-	nop
+	.p2align 4
 L(set_long):
 	and	valw, valw, 255
 	bic	dst, dstin, 15
 	str	q0, [dstin]
-	cmp	count, 256
-	ccmp	valw, 0, 0, cs
-	b.eq	L(try_zva)
-L(no_zva):
-	sub	count, dstend, dst	/* Count is 16 too large.  */
-	sub	dst, dst, 16		/* Dst is biased by -32.  */
-	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
-1:	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dst, 64]!
-L(tail64):
-	subs	count, count, 64
-	b.hi	1b
-2:	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
-	ret
-
-L(try_zva):
-#ifdef ZVA_MACRO
-	zva_macro
-#else
-	.p2align 3
-	mrs	tmp1, dczid_el0
-	tbnz	tmp1w, 4, L(no_zva)
-	and	tmp1w, tmp1w, 15
-	cmp	tmp1w, 4	/* ZVA size is 64 bytes.  */
-	b.ne	 L(zva_128)
-
-	/* Write the first and last 64 byte aligned block using stp rather
-	   than using DC ZVA.  This is faster on some cores.
-	 */
-L(zva_64):
+	cmp	count, 160
+	ccmp	valw, 0, 0, hs
+	b.ne	L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+	mrs	zva_val, dczid_el0
+	and	zva_val, zva_val, 31
+	cmp	zva_val, 4		/* ZVA size is 64 bytes.  */
+	b.ne	L(no_zva)
+#endif
 	str	q0, [dst, 16]
 	stp	q0, q0, [dst, 32]
 	bic	dst, dst, 63
-	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
-	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-	nop
-1:	dc	zva, dst
+	sub	count, dstend, dst	/* Count is now 64 too large.  */
+	sub	count, count, 128	/* Adjust count and bias for loop.  */
+
+	.p2align 4
+L(zva_loop):
 	add	dst, dst, 64
+	dc	zva, dst
 	subs	count, count, 64
-	b.hi	1b
-	stp	q0, q0, [dst, 0]
-	stp	q0, q0, [dst, 32]
+	b.hi	L(zva_loop)
 	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
 	ret
 
-	.p2align 3
-L(zva_128):
-	cmp	tmp1w, 5	/* ZVA size is 128 bytes.  */
-	b.ne	L(zva_other)
-
-	str	q0, [dst, 16]
+L(no_zva):
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	sub	dst, dst, 16		/* Dst is biased by -32.  */
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+L(no_zva_loop):
 	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
-	bic	dst, dst, 127
-	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+128	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-1:	dc	zva, dst
-	add	dst, dst, 128
-	subs	count, count, 128
-	b.hi	1b
-	stp	q0, q0, [dstend, -128]
-	stp	q0, q0, [dstend, -96]
+	stp	q0, q0, [dst, 64]!
+	subs	count, count, 64
+	b.hi	L(no_zva_loop)
 	stp	q0, q0, [dstend, -64]
 	stp	q0, q0, [dstend, -32]
 	ret
 
-L(zva_other):
-	mov	tmp2w, 4
-	lsl	zva_lenw, tmp2w, tmp1w
-	add	tmp1, zva_len, 64	/* Max alignment bytes written.	 */
-	cmp	count, tmp1
-	blo	L(no_zva)
-
-	sub	tmp2, zva_len, 1
-	add	tmp1, dst, zva_len
-	add	dst, dst, 16
-	subs	count, tmp1, dst	/* Actual alignment bytes to write.  */
-	bic	tmp1, tmp1, tmp2	/* Aligned dc zva start address.  */
-	beq	2f
-1:	stp	q0, q0, [dst], 64
-	stp	q0, q0, [dst, -32]
-	subs	count, count, 64
-	b.hi	1b
-2:	mov	dst, tmp1
-	sub	count, dstend, tmp1	/* Remaining bytes to write.  */
-	subs	count, count, zva_len
-	b.lo	4f
-3:	dc	zva, dst
-	add	dst, dst, zva_len
-	subs	count, count, zva_len
-	b.hs	3b
-4:	add	count, count, zva_len
-	sub	dst, dst, 32		/* Bias dst for tail loop.  */
-	b	L(tail64)
-#endif
-
 END (MEMSET)
 libc_hidden_builtin_def (MEMSET)
diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S
index 54fd5abffb1b6638ef8a5fc29e58b2f67765b28a..bee4aed52aab69f5aaded367d40e8b64e406c545 100644
--- a/sysdeps/aarch64/multiarch/memset_falkor.S
+++ b/sysdeps/aarch64/multiarch/memset_falkor.S
@@ -24,30 +24,8 @@
    use this function only when ZVA is enabled.  */
 
 #if IS_IN (libc)
-.macro zva_macro
-	.p2align 4
-	/* Write the first and last 64 byte aligned block using stp rather
-	   than using DC ZVA.  This is faster on some cores.  */
-	str	q0, [dst, 16]
-	stp	q0, q0, [dst, 32]
-	bic	dst, dst, 63
-	stp	q0, q0, [dst, 64]
-	stp	q0, q0, [dst, 96]
-	sub	count, dstend, dst	/* Count is now 128 too large.	*/
-	sub	count, count, 128+64+64	/* Adjust count and bias for loop.  */
-	add	dst, dst, 128
-1:	dc	zva, dst
-	add	dst, dst, 64
-	subs	count, count, 64
-	b.hi	1b
-	stp	q0, q0, [dst, 0]
-	stp	q0, q0, [dst, 32]
-	stp	q0, q0, [dstend, -64]
-	stp	q0, q0, [dstend, -32]
-	ret
-.endm
-
-# define ZVA_MACRO zva_macro
+
+# define SKIP_ZVA_CHECK
 # define MEMSET __memset_falkor
 # include <sysdeps/aarch64/memset.S>
 #endif