[PATCH][AArch64] Cleanup memset
Wilco Dijkstra
Wilco.Dijkstra@arm.com
Wed Feb 26 16:12:00 GMT 2020
Cleanup memset. Remove unnecessary code for unused ZVA sizes.
For zero memsets it's faster use DC ZVA for >= 160 bytes rather
than 256. Add a define which allows skipping the ZVA size test when
the ZVA size is known to be 64 - reading dczid_el0 may be expensive.
This simplifies the Falkor memset implementation.
--
diff --git a/sysdeps/aarch64/memset-reg.h b/sysdeps/aarch64/memset-reg.h
index 872a6b511de2bb97939e644c78676a6212c65d6d..22da7381e68bdfaa6086e39cc980c77150242d21 100644
--- a/sysdeps/aarch64/memset-reg.h
+++ b/sysdeps/aarch64/memset-reg.h
@@ -22,9 +22,4 @@
#define count x2
#define dst x3
#define dstend x4
-#define tmp1 x5
-#define tmp1w w5
-#define tmp2 x6
-#define tmp2w w6
-#define zva_len x7
-#define zva_lenw w7
+#define zva_val x5
diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
index ac577f1660e2706e2024e42d0efc09120c041af2..4654732e1bdd4a7e8f673b79ed66be408bed726b 100644
--- a/sysdeps/aarch64/memset.S
+++ b/sysdeps/aarch64/memset.S
@@ -78,114 +78,49 @@ L(set96):
stp q0, q0, [dstend, -32]
ret
- .p2align 3
- nop
+ .p2align 4
L(set_long):
and valw, valw, 255
bic dst, dstin, 15
str q0, [dstin]
- cmp count, 256
- ccmp valw, 0, 0, cs
- b.eq L(try_zva)
-L(no_zva):
- sub count, dstend, dst /* Count is 16 too large. */
- sub dst, dst, 16 /* Dst is biased by -32. */
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-1: stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]!
-L(tail64):
- subs count, count, 64
- b.hi 1b
-2: stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-
-L(try_zva):
-#ifdef ZVA_MACRO
- zva_macro
-#else
- .p2align 3
- mrs tmp1, dczid_el0
- tbnz tmp1w, 4, L(no_zva)
- and tmp1w, tmp1w, 15
- cmp tmp1w, 4 /* ZVA size is 64 bytes. */
- b.ne L(zva_128)
-
- /* Write the first and last 64 byte aligned block using stp rather
- than using DC ZVA. This is faster on some cores.
- */
-L(zva_64):
+ cmp count, 160
+ ccmp valw, 0, 0, hs
+ b.ne L(no_zva)
+
+#ifndef SKIP_ZVA_CHECK
+ mrs zva_val, dczid_el0
+ and zva_val, zva_val, 31
+ cmp zva_val, 4 /* ZVA size is 64 bytes. */
+ b.ne L(no_zva)
+#endif
str q0, [dst, 16]
stp q0, q0, [dst, 32]
bic dst, dst, 63
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
- add dst, dst, 128
- nop
-1: dc zva, dst
+ sub count, dstend, dst /* Count is now 64 too large. */
+ sub count, count, 128 /* Adjust count and bias for loop. */
+
+ .p2align 4
+L(zva_loop):
add dst, dst, 64
+ dc zva, dst
subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
+ b.hi L(zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
- .p2align 3
-L(zva_128):
- cmp tmp1w, 5 /* ZVA size is 128 bytes. */
- b.ne L(zva_other)
-
- str q0, [dst, 16]
+L(no_zva):
+ sub count, dstend, dst /* Count is 16 too large. */
+ sub dst, dst, 16 /* Dst is biased by -32. */
+ sub count, count, 64 + 16 /* Adjust count and bias for loop. */
+L(no_zva_loop):
stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- bic dst, dst, 127
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+128 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 128
- subs count, count, 128
- b.hi 1b
- stp q0, q0, [dstend, -128]
- stp q0, q0, [dstend, -96]
+ stp q0, q0, [dst, 64]!
+ subs count, count, 64
+ b.hi L(no_zva_loop)
stp q0, q0, [dstend, -64]
stp q0, q0, [dstend, -32]
ret
-L(zva_other):
- mov tmp2w, 4
- lsl zva_lenw, tmp2w, tmp1w
- add tmp1, zva_len, 64 /* Max alignment bytes written. */
- cmp count, tmp1
- blo L(no_zva)
-
- sub tmp2, zva_len, 1
- add tmp1, dst, zva_len
- add dst, dst, 16
- subs count, tmp1, dst /* Actual alignment bytes to write. */
- bic tmp1, tmp1, tmp2 /* Aligned dc zva start address. */
- beq 2f
-1: stp q0, q0, [dst], 64
- stp q0, q0, [dst, -32]
- subs count, count, 64
- b.hi 1b
-2: mov dst, tmp1
- sub count, dstend, tmp1 /* Remaining bytes to write. */
- subs count, count, zva_len
- b.lo 4f
-3: dc zva, dst
- add dst, dst, zva_len
- subs count, count, zva_len
- b.hs 3b
-4: add count, count, zva_len
- sub dst, dst, 32 /* Bias dst for tail loop. */
- b L(tail64)
-#endif
-
END (MEMSET)
libc_hidden_builtin_def (MEMSET)
diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S
index 54fd5abffb1b6638ef8a5fc29e58b2f67765b28a..bee4aed52aab69f5aaded367d40e8b64e406c545 100644
--- a/sysdeps/aarch64/multiarch/memset_falkor.S
+++ b/sysdeps/aarch64/multiarch/memset_falkor.S
@@ -24,30 +24,8 @@
use this function only when ZVA is enabled. */
#if IS_IN (libc)
-.macro zva_macro
- .p2align 4
- /* Write the first and last 64 byte aligned block using stp rather
- than using DC ZVA. This is faster on some cores. */
- str q0, [dst, 16]
- stp q0, q0, [dst, 32]
- bic dst, dst, 63
- stp q0, q0, [dst, 64]
- stp q0, q0, [dst, 96]
- sub count, dstend, dst /* Count is now 128 too large. */
- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
- add dst, dst, 128
-1: dc zva, dst
- add dst, dst, 64
- subs count, count, 64
- b.hi 1b
- stp q0, q0, [dst, 0]
- stp q0, q0, [dst, 32]
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-.endm
-
-# define ZVA_MACRO zva_macro
+
+# define SKIP_ZVA_CHECK
# define MEMSET __memset_falkor
# include <sysdeps/aarch64/memset.S>
#endif
More information about the Libc-alpha
mailing list