This is the mail archive of the
newlib-cvs@sourceware.org
mailing list for the newlib project.
[newlib-cygwin] aarch64: optimize the unaligned case of memcmp
- From: Corinna Vinschen <corinna at sourceware dot org>
- To: newlib-cvs at sourceware dot org
- Date: 26 Jun 2017 08:22:57 -0000
- Subject: [newlib-cygwin] aarch64: optimize the unaligned case of memcmp
https://sourceware.org/git/gitweb.cgi?p=newlib-cygwin.git;h=9938a64ca9ef00a31720ea9ca48f6d84e0f7e98c
commit 9938a64ca9ef00a31720ea9ca48f6d84e0f7e98c
Author: Sebastian Pop <s.pop@samsung.com>
Date: Fri Jun 23 15:23:09 2017 -0500
aarch64: optimize the unaligned case of memcmp
This brings to newlib a performance improvement that we developed in Bionic
libc. That change has been submitted for review to Bionic libc:
https://android-review.googlesource.com/418279
A similar patch has been submitted for review in glibc:
https://sourceware.org/ml/libc-alpha/2017-06/msg01143.html
Patch written by Vikas Sinha and Sebastian Pop.
Performance was measured with bionic-benchmarks on a HiKey (aarch64, 8x Cortex-A53)
board. The existing memcmp benchmarks show no performance change,
while the new benchmark that exercises memcmp on unaligned buffers
shows an improvement. The new benchmark has been submitted for
review at https://android-review.googlesource.com/414860
On the unaligned benchmark, performance improves by about 18% for the smallest
data set (8 bytes) and by about 4.5x (52 MB/s to 235 MB/s) for the largest data set (64k).
The baseline run uses the libc from /system/lib64. The bionic libc
built with this patch is installed in /data.
hikey:/data # export LD_LIBRARY_PATH=/system/lib64
hikey:/data # ./bionic-benchmarks --benchmark_filter='BM_string_memcmp*'
Run on (8 X 2.4 MHz CPU s)
Benchmark Time CPU Iterations
----------------------------------------------------------------------
BM_string_memcmp/8 30 ns 30 ns 22955680 251.07MB/s
BM_string_memcmp/64 57 ns 57 ns 12349184 1076.99MB/s
BM_string_memcmp/512 305 ns 305 ns 2297163 1.56496GB/s
BM_string_memcmp/1024 571 ns 571 ns 1225211 1.66912GB/s
BM_string_memcmp/8k 4307 ns 4306 ns 162562 1.77177GB/s
BM_string_memcmp/16k 8676 ns 8675 ns 80676 1.75887GB/s
BM_string_memcmp/32k 19233 ns 19230 ns 36394 1.58695GB/s
BM_string_memcmp/64k 36986 ns 36984 ns 18952 1.65029GB/s
BM_string_memcmp_aligned/8 199 ns 199 ns 3519166 38.3336MB/s
BM_string_memcmp_aligned/64 386 ns 386 ns 1810734 158.073MB/s
BM_string_memcmp_aligned/512 1735 ns 1734 ns 403981 281.525MB/s
BM_string_memcmp_aligned/1024 3200 ns 3200 ns 218838 305.151MB/s
BM_string_memcmp_aligned/8k 25084 ns 25080 ns 28180 311.507MB/s
BM_string_memcmp_aligned/16k 51730 ns 51729 ns 13521 302.057MB/s
BM_string_memcmp_aligned/32k 103228 ns 103228 ns 6782 302.727MB/s
BM_string_memcmp_aligned/64k 207117 ns 207087 ns 3450 301.806MB/s
BM_string_memcmp_unaligned/8 339 ns 339 ns 2070998 22.5302MB/s
BM_string_memcmp_unaligned/64 1392 ns 1392 ns 502796 43.8454MB/s
BM_string_memcmp_unaligned/512 9194 ns 9194 ns 76133 53.1104MB/s
BM_string_memcmp_unaligned/1024 18325 ns 18323 ns 38206 53.2963MB/s
BM_string_memcmp_unaligned/8k 148579 ns 148574 ns 4713 52.5831MB/s
BM_string_memcmp_unaligned/16k 298169 ns 298120 ns 2344 52.4118MB/s
BM_string_memcmp_unaligned/32k 598813 ns 598797 ns 1085 52.188MB/s
BM_string_memcmp_unaligned/64k 1196079 ns 1196083 ns 540 52.2539MB/s
hikey:/data # export LD_LIBRARY_PATH=/data
hikey:/data # ./bionic-benchmarks --benchmark_filter='BM_string_memcmp*'
Run on (8 X 2.4 MHz CPU s)
Benchmark Time CPU Iterations
----------------------------------------------------------------------
BM_string_memcmp/8 30 ns 30 ns 23209918 252.802MB/s
BM_string_memcmp/64 57 ns 57 ns 12348447 1076.95MB/s
BM_string_memcmp/512 305 ns 305 ns 2296878 1.56471GB/s
BM_string_memcmp/1024 572 ns 571 ns 1224426 1.6689GB/s
BM_string_memcmp/8k 4309 ns 4308 ns 162491 1.77109GB/s
BM_string_memcmp/16k 9348 ns 9345 ns 74894 1.63285GB/s
BM_string_memcmp/32k 18329 ns 18322 ns 38249 1.6656GB/s
BM_string_memcmp/64k 36992 ns 36981 ns 18952 1.65045GB/s
BM_string_memcmp_aligned/8 199 ns 199 ns 3513925 38.3162MB/s
BM_string_memcmp_aligned/64 386 ns 386 ns 1814038 158.192MB/s
BM_string_memcmp_aligned/512 1735 ns 1735 ns 402279 281.502MB/s
BM_string_memcmp_aligned/1024 3204 ns 3202 ns 218761 304.941MB/s
BM_string_memcmp_aligned/8k 25577 ns 25569 ns 27406 305.548MB/s
BM_string_memcmp_aligned/16k 52143 ns 52123 ns 13522 299.769MB/s
BM_string_memcmp_aligned/32k 105169 ns 105127 ns 6637 297.26MB/s
BM_string_memcmp_aligned/64k 206508 ns 206383 ns 3417 302.835MB/s
BM_string_memcmp_unaligned/8 282 ns 282 ns 2482953 27.062MB/s
BM_string_memcmp_unaligned/64 542 ns 541 ns 1298317 112.77MB/s
BM_string_memcmp_unaligned/512 2152 ns 2152 ns 325267 226.915MB/s
BM_string_memcmp_unaligned/1024 4025 ns 4025 ns 173904 242.622MB/s
BM_string_memcmp_unaligned/8k 32276 ns 32271 ns 21818 242.09MB/s
BM_string_memcmp_unaligned/16k 65970 ns 65970 ns 10554 236.851MB/s
BM_string_memcmp_unaligned/32k 131241 ns 131242 ns 5129 238.11MB/s
BM_string_memcmp_unaligned/64k 266159 ns 266160 ns 2661 234.821MB/s
Diff:
---
newlib/libc/machine/aarch64/memcmp.S | 57 ++++++++++++++++++++++++++++++++++--
1 file changed, 55 insertions(+), 2 deletions(-)
diff --git a/newlib/libc/machine/aarch64/memcmp.S b/newlib/libc/machine/aarch64/memcmp.S
index 860384c..09be4c3 100644
--- a/newlib/libc/machine/aarch64/memcmp.S
+++ b/newlib/libc/machine/aarch64/memcmp.S
@@ -1,6 +1,7 @@
/* memcmp - compare memory
Copyright (c) 2013, Linaro Limited
+ Copyright (c) 2017, Samsung Austin R&D Center
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -152,9 +153,61 @@ def_fn memcmp p2align=6
.p2align 6
.Lmisaligned8:
+
+ cmp limit, #8
+ b.lo .LmisalignedLt8
+
+.LunalignedGe8 :
+
+ /* Load the first dword with both src potentially unaligned. */
+ ldr data1, [src1]
+ ldr data2, [src2]
+
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ cbnz diff, .Lnot_limit
+
+ /* Sources are not aligned: align one of the sources. */
+
+ and tmp1, src1, #0x7
+ orr tmp3, xzr, #0x8
+ sub pos, tmp3, tmp1
+
+ /* Increment SRC pointers by POS so SRC1 is dword-aligned (8 bytes). */
+ add src1, src1, pos
+ add src2, src2, pos
+
+ sub limit, limit, pos
+ lsr limit_wd, limit, #3
+
+ cmp limit_wd, #0
+
+ /* Keep only the leftover byte count (limit % 8) and set POS to the negative
+ offset (leftover - 8) at which a final 8-byte load ends at the buffer end. */
+ and limit, limit, #7
+ sub pos, limit, #8
+
+ b .Lstart_part_realigned
+
+ .p2align 5
+.Lloop_part_aligned:
+ ldr data1, [src1], #8
+ ldr data2, [src2], #8
+ subs limit_wd, limit_wd, #1
+.Lstart_part_realigned:
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ cbnz diff, .Lnot_limit
+ b.ne .Lloop_part_aligned
+
+ /* Process leftover bytes: load the last 8 bytes of each buffer at the
+ negative offset POS, re-reading bytes that were already compared. */
+ ldr data1, [src1, pos]
+ ldr data2, [src2, pos]
+ eor diff, data1, data2 /* Non-zero if differences found. */
+ b .Lnot_limit
+
+.LmisalignedLt8:
sub limit, limit, #1
1:
- /* Perhaps we can do better than this. */
ldrb data1w, [src1], #1
ldrb data2w, [src2], #1
subs limit, limit, #1
@@ -164,4 +217,4 @@ def_fn memcmp p2align=6
ret
.size memcmp, . - memcmp
-#endif
\ No newline at end of file
+#endif