This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold
- From: Wilco Dijkstra <Wilco dot Dijkstra at arm dot com>
- To: Feng Xue OS <fxue at os dot amperecomputing dot com>, Siddhesh Poyarekar <siddhesh at gotplt dot org>, 'GNU C Library' <libc-alpha at sourceware dot org>
- Cc: nd <nd at arm dot com>
- Date: Tue, 13 Aug 2019 13:10:49 +0000
- Subject: Re: [PATCH] aarch64: Add tunable glibc.memset.dc_zva_threshold
- Arc-authentication-results: i=1; mx.microsoft.com 1; spf=pass smtp.mailfrom=arm.com; dmarc=pass action=none header.from=arm.com; dkim=pass header.d=arm.com; arc=none
- Arc-message-signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=microsoft.com; s=arcselector9901; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=Ep2LKCIMMQRVWgjpfw8D4yoTWm1fGuO3RnSfpAdUWbg=; b=QGTSJdiw0gNWE2EyurRdNt+Eis2YVMlMnIMEFpq60z++m4KzsyQyl3ux7ec554mbA4Tqct+PS4xVecZTc32xFQg8q/g9lD+ShnWwcaehP3QjzPcVF3mDjYwwph3egG/Fbm6dHBJAYGFunGvEigcHhFhc1Lw50af9+v7EtTx+QrcbBXeTqiykn3WmO+96Cugm4kkje0T12uqB5vVpmbWGLDdGKd88WRm5afEXbdxM84TsCf+HwoI2mi5aaSd/XMLOKYY3RePkOzqJLYriMsf3Uxf3+bGel+AdzUJ6xANQtri5hXvX+WMKRN7zogAECrrqKhe/2ugM2I1Ha4KIRBKh1A==
- Arc-seal: i=1; a=rsa-sha256; s=arcselector9901; d=microsoft.com; cv=none; b=GYxF5v9VmvbKj2y86dJFQygrFCnkf2g/yuFbvvhk9E7pAg8gjBdMjmHwS1GM5QCuIz3dWxc8kF8HHW6bPXm2A5n2LjW2NG3pvS0///QK5mzaaONa6XhULVYU0c35Sw03cPRs1g7p1ohy5Rsy/DmJRazeXDLqzTosUw/nGObWwYx+8Q4tX4x7LVO8CY8cj3qy7UlzX84+phmWsqfOUHhZdhBiP8klyBEBtePfTX2wEOO6OW/dutcQri0zOJ+k3wlkmZdjs7bIxGpY2uVfM1efIoCHgDyim634poCJItB5LvPfov7LwMrqrTKZz1P2wlAES8K41CU4SmoHNsN9kiMOQA==
- Original-authentication-results: spf=none (sender IP is ) smtp.mailfrom=Wilco dot Dijkstra at arm dot com;
- References: <VI1PR0801MB21270BD012CBA889798C1CB483D50@VI1PR0801MB2127.eurprd08.prod.outlook.com>,<0a810dde-3b92-4782-09cb-16cdbc8dbb75@gotplt.org>,<BYAPR01MB4869818C970E0A2F6FBAD4E9F7D70@BYAPR01MB4869.prod.exchangelabs.com>
Hi Feng,
> This version disables DC ZVA in emag.
That looks good to me.
diff --git a/sysdeps/aarch64/multiarch/memset_base64.S b/sysdeps/aarch64/multiarch/memset_base64.S
index 9a62325..c0cccba 100644
--- a/sysdeps/aarch64/multiarch/memset_base64.S
+++ b/sysdeps/aarch64/multiarch/memset_base64.S
@@ -23,6 +23,7 @@
# define MEMSET __memset_base64
#endif
+/* To disable DC ZVA, set this threshold to 0. */
#ifndef DC_ZVA_THRESHOLD
# define DC_ZVA_THRESHOLD 512
#endif
@@ -91,11 +92,12 @@ L(set96):
.p2align 4
L(set_long):
stp val, val, [dstin]
+ bic dst, dstin, 15
+#if DC_ZVA_THRESHOLD
cmp count, DC_ZVA_THRESHOLD
ccmp val, 0, 0, cs
- bic dst, dstin, 15
b.eq L(zva_64)
-
+#endif
/* Small-size or non-zero memset does not use DC ZVA. */
sub count, dstend, dst
@@ -105,7 +107,11 @@ L(set_long):
* count is less than 33 bytes, so as to bypass 2 unneccesary stps.
*/
sub count, count, 64+16+1
+
+#if DC_ZVA_THRESHOLD
+ /* Align the loop on a 16-byte boundary; this may be friendlier to the i-cache. */
nop
+#endif
1: stp val, val, [dst, 16]
stp val, val, [dst, 32]
@@ -121,6 +127,7 @@ L(set_long):
stp val, val, [dstend, -16]
ret
+#if DC_ZVA_THRESHOLD
.p2align 3
L(zva_64):
stp val, val, [dst, 16]
@@ -173,6 +180,7 @@ L(zva_64):
1: stp val, val, [dstend, -32]
stp val, val, [dstend, -16]
ret
+#endif
END (MEMSET)
libc_hidden_builtin_def (MEMSET)
diff --git a/sysdeps/aarch64/multiarch/memset_emag.S b/sysdeps/aarch64/multiarch/memset_emag.S
index 1c1fabc..c2aed62 100644
--- a/sysdeps/aarch64/multiarch/memset_emag.S
+++ b/sysdeps/aarch64/multiarch/memset_emag.S
@@ -21,12 +21,14 @@
# define MEMSET __memset_emag
/*
- * Using dc zva to zero memory does not produce better performance if
+ * Using DC ZVA to zero memory does not produce better performance if
* memory size is not very large, especially when there are multiple
- * processes/threads contending memory/cache. Here we use a somewhat
- * large threshold to trigger usage of dc zva.
-*/
-# define DC_ZVA_THRESHOLD 1024
+ * processes/threads contending memory/cache. Here we set threshold to
+ * zero to disable using DC ZVA, which is good for multi-process/thread
+ * workloads.
+ */
+
+# define DC_ZVA_THRESHOLD 0
# include "./memset_base64.S"
#endif
OK
Wilco