This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch master updated. glibc-2.27.9000-176-g30a81da
- From: siddhesh at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 6 Mar 2018 13:54:38 -0000
- Subject: GNU C Library master sources branch master updated. glibc-2.27.9000-176-g30a81da
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master has been updated
via 30a81dae5b752f8aa5f96e7f7c341ec57cba3585 (commit)
from adc95fb06a22264349de21507de1a7e652a4052d (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=30a81dae5b752f8aa5f96e7f7c341ec57cba3585
commit 30a81dae5b752f8aa5f96e7f7c341ec57cba3585
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Tue Mar 6 19:22:39 2018 +0530
aarch64: Optimized memcmp for medium to large sizes
This improved memcmp provides a fast path for compares up to 16 bytes
and then compares 16 bytes at a time, thus optimizing loads from both
sources. The glibc memcmp microbenchmark retains performance (with an
error of ~1ns) for smaller compare sizes and reduces up to 31% of
execution time for compares up to 4K on the APM Mustang. On Qualcomm
Falkor this improves to almost 48%, i.e. it is almost 2x improvement
for sizes of 2K and above.
* sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a
time.
diff --git a/ChangeLog b/ChangeLog
index d57baf6..23609b8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
2018-03-06 Siddhesh Poyarekar <siddhesh@sourceware.org>
+ * sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a
+ time.
+
* benchtests/bench-strncmp.c (test_main): Remove 0 length tests.
(do_test_limit): Likewise.
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index ecd1206..8325d04 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -34,9 +34,12 @@
/* Internal variables. */
#define data1 x3
#define data1w w3
-#define data2 x4
-#define data2w w4
-#define tmp1 x5
+#define data1h x4
+#define data2 x5
+#define data2w w5
+#define data2h x6
+#define tmp1 x7
+#define tmp2 x8
ENTRY_ALIGN (memcmp, 6)
DELOUSE (0)
@@ -46,39 +49,70 @@ ENTRY_ALIGN (memcmp, 6)
subs limit, limit, 8
b.lo L(less8)
- /* Limit >= 8, so check first 8 bytes using unaligned loads. */
ldr data1, [src1], 8
ldr data2, [src2], 8
- and tmp1, src1, 7
- add limit, limit, tmp1
+ cmp data1, data2
+ b.ne L(return)
+
+ subs limit, limit, 8
+ b.gt L(more16)
+
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ b L(return)
+
+L(more16):
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
cmp data1, data2
bne L(return)
+ /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+ strings. */
+ subs limit, limit, 16
+ b.ls L(last_bytes)
+
+ /* We overlap loads between 0-32 bytes at either side of SRC1 when we
+ try to align, so limit it only to strings larger than 128 bytes. */
+ cmp limit, 96
+ b.ls L(loop8)
+
/* Align src1 and adjust src2 with bytes not yet done. */
+ and tmp1, src1, 15
+ add limit, limit, tmp1
sub src1, src1, tmp1
sub src2, src2, tmp1
- subs limit, limit, 8
- b.ls L(last_bytes)
-
- /* Loop performing 8 bytes per iteration using aligned src1.
- Limit is pre-decremented by 8 and must be larger than zero.
- Exit if <= 8 bytes left to do or if the data is not equal. */
+ /* Loop performing 16 bytes per iteration using aligned src1.
+ Limit is pre-decremented by 16 and must be larger than zero.
+ Exit if <= 16 bytes left to do or if the data is not equal. */
.p2align 4
-L(loop8):
- ldr data1, [src1], 8
- ldr data2, [src2], 8
- subs limit, limit, 8
- ccmp data1, data2, 0, hi /* NZCV = 0b0000. */
- b.eq L(loop8)
+L(loop16):
+ ldp data1, data1h, [src1], 16
+ ldp data2, data2h, [src2], 16
+ subs limit, limit, 16
+ ccmp data1, data2, 0, hi
+ ccmp data1h, data2h, 0, eq
+ b.eq L(loop16)
cmp data1, data2
bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+ bne L(return)
- /* Compare last 1-8 bytes using unaligned access. */
+ /* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
- ldr data1, [src1, limit]
- ldr data2, [src2, limit]
+ add src1, src1, limit
+ add src2, src2, limit
+ ldp data1, data1h, [src1]
+ ldp data2, data2h, [src2]
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
/* Compare data bytes and set return value to 0, -1 or 1. */
L(return):
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 3 ++
sysdeps/aarch64/memcmp.S | 76 +++++++++++++++++++++++++++++++++-------------
2 files changed, 58 insertions(+), 21 deletions(-)
hooks/post-receive
--
GNU C Library master sources