This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH] aarch64: Optimized memcmp for Kunpeng processor.
- From: Xuelei Zhang <zhangxuelei4 at huawei dot com>
- To: <libc-alpha at sourceware dot org>, <siddhesh at gotplt dot org>, <Szabolcs dot Nagy at arm dot com>, <Wilco dot Dijkstra at arm dot com>, <jiangyikun at huawei dot com>, <yikunkero at gmail dot com>
- Date: Thu, 17 Oct 2019 22:47:50 +0800
- Subject: [PATCH] aarch64: Optimized memcmp for Kunpeng processor.
The loop body is expanded from a 16-byte comparison to a 64-byte
comparison, and the ldp instructions are changed from post-index
addressing to base-plus-offset addressing. As a result, comparisons
of more than 128 bytes are around 18% faster overall.
Here is the result.
Function: memcmp
Variant: default
simple_memcmp __memcmp_kunpeng __memcmp_generic
========================================================================================================================
length=1, align1=0, align2=0: 14.06 ( -9.76%) 12.81 ( 0.00%) 12.81
length=1, align1=0, align2=0: 13.59 (-10.13%) 12.66 ( -2.53%) 12.34
length=1, align1=0, align2=0: 13.12 ( -7.69%) 12.50 ( -2.56%) 12.19
length=2, align1=0, align2=0: 14.84 ( -6.74%) 14.06 ( -1.12%) 13.91
length=2, align1=0, align2=0: 14.69 ( -6.82%) 12.97 ( 5.68%) 13.75
length=2, align1=0, align2=0: 14.38 ( -5.75%) 13.28 ( 2.30%) 13.59
length=3, align1=0, align2=0: 15.47 ( 0.00%) 14.22 ( 8.08%) 15.47
length=3, align1=0, align2=0: 15.62 ( -8.70%) 13.91 ( 3.26%) 14.38
length=3, align1=0, align2=0: 15.47 ( -7.61%) 13.91 ( 3.26%) 14.38
length=4, align1=0, align2=0: 16.72 (-24.42%) 13.12 ( 2.33%) 13.44
length=4, align1=0, align2=0: 17.19 (-35.80%) 13.12 ( -3.70%) 12.66
length=4, align1=0, align2=0: 16.56 (-29.27%) 12.19 ( 4.88%) 12.81
length=5, align1=0, align2=0: 16.88 (-28.57%) 12.81 ( 2.38%) 13.12
length=5, align1=0, align2=0: 17.97 (-49.35%) 12.34 ( -2.60%) 12.03
length=5, align1=0, align2=0: 17.19 (-39.24%) 11.88 ( 3.80%) 12.34
length=6, align1=0, align2=0: 18.75 (-29.03%) 12.34 ( 15.05%) 14.53
length=6, align1=0, align2=0: 18.44 (-35.63%) 12.34 ( 9.20%) 13.59
length=6, align1=0, align2=0: 17.81 (-28.09%) 12.50 ( 10.11%) 13.91
length=7, align1=0, align2=0: 20.62 (-37.50%) 12.66 ( 15.63%) 15.00
length=7, align1=0, align2=0: 18.75 (-27.66%) 12.34 ( 15.96%) 14.69
length=7, align1=0, align2=0: 18.28 (-27.17%) 12.50 ( 13.04%) 14.38
length=8, align1=0, align2=0: 19.84 (-64.94%) 12.81 ( -6.49%) 12.03
length=8, align1=0, align2=0: 20.00 (-66.23%) 11.56 ( 3.90%) 12.03
length=8, align1=0, align2=0: 18.91 (-59.21%) 11.41 ( 3.95%) 11.88
length=9, align1=0, align2=0: 20.31 (-66.67%) 12.66 ( -3.85%) 12.19
length=9, align1=0, align2=0: 19.84 (-71.62%) 11.88 ( -2.70%) 11.56
length=9, align1=0, align2=0: 20.00 (-62.02%) 11.88 ( 3.80%) 12.34
length=10, align1=0, align2=0: 21.72 (-82.90%) 12.34 ( -3.95%) 11.88
length=10, align1=0, align2=0: 21.56 (-81.58%) 12.03 ( -1.32%) 11.88
length=10, align1=0, align2=0: 20.94 (-71.79%) 11.72 ( 3.85%) 12.19
length=11, align1=0, align2=0: 21.41 (-75.64%) 12.03 ( 1.28%) 12.19
length=11, align1=0, align2=0: 22.81 (-87.18%) 12.19 ( 0.00%) 12.19
length=11, align1=0, align2=0: 21.41 (-77.92%) 12.03 ( 0.00%) 12.03
length=12, align1=0, align2=0: 22.50 (-89.47%) 11.88 ( 0.00%) 11.88
length=12, align1=0, align2=0: 22.97 (-93.42%) 12.66 ( -6.58%) 11.88
length=12, align1=0, align2=0: 21.72 (-80.52%) 11.72 ( 2.60%) 12.03
length=13, align1=0, align2=0: 23.28 (-101.35%) 12.50 ( -8.11%) 11.56
length=13, align1=0, align2=0: 23.28 (-93.51%) 12.34 ( -2.60%) 12.03
length=13, align1=0, align2=0: 23.12 (-92.21%) 12.19 ( -1.30%) 12.03
length=14, align1=0, align2=0: 26.56 (-117.95%) 12.03 ( 1.28%) 12.19
length=14, align1=0, align2=0: 24.06 (-94.94%) 12.19 ( 1.27%) 12.34
length=14, align1=0, align2=0: 23.59 (-98.68%) 11.88 ( 0.00%) 11.88
length=15, align1=0, align2=0: 24.69 (-100.00%) 12.19 ( 1.27%) 12.34
length=15, align1=0, align2=0: 24.53 (-101.28%) 11.88 ( 2.56%) 12.19
length=15, align1=0, align2=0: 24.22 (-101.30%) 11.41 ( 5.19%) 12.03
length=4, align1=0, align2=0: 16.09 (-27.16%) 13.12 ( -3.70%) 12.66
length=4, align1=0, align2=0: 16.09 (-27.16%) 12.66 ( 0.00%) 12.66
length=4, align1=0, align2=0: 15.62 (-29.87%) 12.50 ( -3.90%) 12.03
length=32, align1=0, align2=0: 37.81 (-181.40%) 13.12 ( 2.33%) 13.44
length=32, align1=7, align2=2: 37.66 (-197.53%) 12.03 ( 4.94%) 12.66
length=32, align1=0, align2=0: 37.97 (-189.29%) 12.34 ( 5.95%) 13.12
length=32, align1=0, align2=0: 37.19 (-190.24%) 11.72 ( 8.54%) 12.81
length=8, align1=0, align2=0: 20.00 (-62.02%) 12.34 ( 0.00%) 12.34
length=8, align1=0, align2=0: 19.38 (-51.22%) 11.25 ( 12.20%) 12.81
length=8, align1=0, align2=0: 19.22 (-64.00%) 11.72 ( 0.00%) 11.72
length=64, align1=0, align2=0: 62.97 (-97.55%) 15.62 ( 50.98%) 31.88
length=64, align1=6, align2=4: 62.34 (-74.24%) 15.94 ( 55.46%) 35.78
length=64, align1=0, align2=0: 70.16 (-138.83%) 14.06 ( 52.13%) 29.38
length=64, align1=0, align2=0: 69.53 (-130.57%) 14.53 ( 51.81%) 30.16
length=16, align1=0, align2=0: 25.31 (-105.06%) 11.72 ( 5.06%) 12.34
length=16, align1=0, align2=0: 25.78 (-114.29%) 12.97 ( -7.79%) 12.03
length=16, align1=0, align2=0: 25.16 (-111.84%) 11.88 ( 0.00%) 11.88
length=128, align1=0, align2=0: 119.22 (-515.32%) 20.00 ( -3.23%) 19.38
length=128, align1=5, align2=6: 121.25 (-496.92%) 19.38 ( 4.62%) 20.31
length=128, align1=0, align2=0: 119.38 (-542.02%) 18.75 ( -0.84%) 18.59
length=128, align1=0, align2=0: 119.53 (-542.85%) 18.28 ( 1.68%) 18.59
length=32, align1=0, align2=0: 37.66 (-186.90%) 12.03 ( 8.33%) 13.12
length=32, align1=0, align2=0: 37.50 (-192.68%) 11.72 ( 8.54%) 12.81
length=32, align1=0, align2=0: 37.19 (-190.24%) 11.72 ( 8.54%) 12.81
length=256, align1=0, align2=0: 218.44 (-685.40%) 26.88 ( 3.37%) 27.81
length=256, align1=4, align2=8: 218.12 (-512.28%) 29.69 ( 16.67%) 35.62
length=256, align1=0, align2=0: 219.84 (-713.30%) 24.53 ( 9.25%) 27.03
length=256, align1=0, align2=0: 217.97 (-711.05%) 23.91 ( 11.05%) 26.88
length=64, align1=0, align2=0: 62.81 (-101.00%) 14.38 ( 54.00%) 31.25
length=64, align1=0, align2=0: 71.09 (-144.62%) 14.06 ( 51.61%) 29.06
length=64, align1=0, align2=0: 70.47 (-145.11%) 13.75 ( 52.17%) 28.75
length=512, align1=0, align2=0: 416.25 (-861.73%) 38.59 ( 10.83%) 43.28
length=512, align1=3, align2=10: 416.25 (-627.87%) 44.84 ( 21.58%) 57.19
length=512, align1=0, align2=0: 414.69 (-844.49%) 37.66 ( 14.23%) 43.91
length=512, align1=0, align2=0: 414.84 (-883.33%) 35.94 ( 14.81%) 42.19
length=128, align1=0, align2=0: 119.22 (-478.03%) 19.22 ( 6.82%) 20.62
length=128, align1=0, align2=0: 119.38 (-521.14%) 18.28 ( 4.88%) 19.22
length=128, align1=0, align2=0: 119.38 (-536.67%) 17.97 ( 4.17%) 18.75
length=1024, align1=0, align2=0: 809.53 (-995.35%) 61.09 ( 17.34%) 73.91
length=1024, align1=2, align2=12: 810.16 (-839.31%) 73.44 ( 14.86%) 86.25
length=1024, align1=0, align2=0: 808.59 (-998.73%) 60.78 ( 17.41%) 73.59
length=1024, align1=0, align2=0: 808.28 (-1007.71%) 60.47 ( 17.13%) 72.97
length=256, align1=0, align2=0: 217.19 (-689.77%) 25.62 ( 6.82%) 27.50
length=256, align1=0, align2=0: 217.81 (-715.20%) 23.91 ( 10.53%) 26.72
length=256, align1=0, align2=0: 217.66 (-700.57%) 24.22 ( 10.92%) 27.19
length=2048, align1=0, align2=0: 1597.50 (-954.02%) 110.16 ( 27.32%) 151.56
length=2048, align1=1, align2=14: 1597.19 (-960.38%) 130.78 ( 13.17%) 150.62
length=2048, align1=0, align2=0: 1596.09 (-1083.66%) 110.00 ( 18.42%) 134.84
length=2048, align1=0, align2=0: 1601.25 (-1091.63%) 113.12 ( 15.81%) 134.38
length=512, align1=0, align2=0: 414.38 (-857.40%) 37.03 ( 14.44%) 43.28
length=512, align1=0, align2=0: 419.53 (-890.77%) 35.94 ( 15.13%) 42.34
length=512, align1=0, align2=0: 414.69 (-890.30%) 36.09 ( 13.81%) 41.88
length=4096, align1=0, align2=0: 3172.66 (-1067.63%) 217.66 ( 19.90%) 271.72
length=4096, align1=0, align2=16: 3172.81 (-1084.02%) 209.22 ( 21.92%) 267.97
length=4096, align1=0, align2=0: 3172.34 (-1062.16%) 209.22 ( 23.35%) 272.97
length=4096, align1=0, align2=0: 3172.34 (-1082.47%) 208.59 ( 22.25%) 268.28
length=1024, align1=0, align2=0: 810.00 (-1007.69%) 60.78 ( 16.88%) 73.12
length=1024, align1=0, align2=0: 813.75 (-1015.20%) 60.78 ( 16.70%) 72.97
length=1024, align1=0, align2=0: 808.12 (-1000.43%) 60.62 ( 17.45%) 73.44
length=16, align1=1, align2=2: 27.19 (-104.71%) 12.03 ( 9.41%) 13.28
length=16, align1=1, align2=2: 25.47 (-103.75%) 12.03 ( 3.75%) 12.50
length=16, align1=1, align2=2: 24.84 (-101.27%) 11.88 ( 3.80%) 12.34
length=32, align1=2, align2=4: 38.12 (-183.72%) 12.19 ( 9.30%) 13.44
length=32, align1=2, align2=4: 38.12 (-193.97%) 12.19 ( 6.02%) 12.97
length=32, align1=2, align2=4: 37.34 (-191.46%) 12.34 ( 3.66%) 12.81
length=64, align1=3, align2=6: 72.66 (-121.43%) 15.47 ( 52.86%) 32.81
length=64, align1=3, align2=6: 73.44 (-162.57%) 15.00 ( 46.37%) 27.97
length=64, align1=3, align2=6: 72.97 (-171.51%) 14.69 ( 45.35%) 26.88
length=128, align1=4, align2=8: 118.91 (-428.47%) 20.00 ( 11.11%) 22.50
length=128, align1=4, align2=8: 119.53 (-470.89%) 19.53 ( 6.72%) 20.94
length=128, align1=4, align2=8: 119.69 (-484.73%) 19.53 ( 4.58%) 20.47
length=256, align1=5, align2=10: 221.09 (-567.45%) 30.78 ( 7.08%) 33.12
length=256, align1=5, align2=10: 217.81 (-593.53%) 29.53 ( 5.97%) 31.41
length=256, align1=5, align2=10: 217.19 (-584.73%) 29.38 ( 7.39%) 31.72
length=512, align1=6, align2=12: 416.56 (-733.12%) 43.75 ( 12.50%) 50.00
length=512, align1=6, align2=12: 413.91 (-740.95%) 44.22 ( 10.16%) 49.22
length=512, align1=6, align2=12: 414.22 (-749.68%) 43.75 ( 10.26%) 48.75
length=1024, align1=7, align2=14: 809.53 (-836.89%) 75.94 ( 12.12%) 86.41
length=1024, align1=7, align2=14: 807.81 (-843.43%) 75.94 ( 11.31%) 85.62
length=1024, align1=7, align2=14: 807.66 (-859.00%) 75.62 ( 10.20%) 84.22
---
sysdeps/aarch64/memcmp.S | 12 +-
sysdeps/aarch64/multiarch/Makefile | 1 +
sysdeps/aarch64/multiarch/ifunc-impl-list.c | 3 +
sysdeps/aarch64/multiarch/memcmp.c | 42 +++++++
sysdeps/aarch64/multiarch/memcmp_generic.S | 35 ++++++
sysdeps/aarch64/multiarch/memcmp_kunpeng.S | 187 ++++++++++++++++++++++++++++
6 files changed, 276 insertions(+), 4 deletions(-)
create mode 100644 sysdeps/aarch64/multiarch/memcmp.c
create mode 100644 sysdeps/aarch64/multiarch/memcmp_generic.S
create mode 100644 sysdeps/aarch64/multiarch/memcmp_kunpeng.S
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index f330154c7a..40ecbddb94 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -25,6 +25,10 @@
* ARMv8-a, AArch64, unaligned accesses.
*/
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+
/* Parameters and result. */
#define src1 x0
#define src2 x1
@@ -41,7 +45,7 @@
#define tmp1 x7
#define tmp2 x8
-ENTRY_ALIGN (memcmp, 6)
+ENTRY_ALIGN (MEMCMP, 6)
DELOUSE (0)
DELOUSE (1)
DELOUSE (2)
@@ -148,7 +152,7 @@ L(byte_loop):
sub result, data1w, data2w
ret
-END (memcmp)
+END (MEMCMP)
#undef bcmp
-weak_alias (memcmp, bcmp)
-libc_hidden_builtin_def (memcmp)
+weak_alias (MEMCMP, bcmp)
+libc_hidden_builtin_def (MEMCMP)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 4150b89a90..eedb8e486d 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -1,6 +1,7 @@
ifeq ($(subdir),string)
sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
memcpy_falkor memmove_falkor \
+ memcmp_kunpeng memcmp_generic \
memset_generic memset_falkor memset_emag \
memchr_generic memchr_nosimd \
strlen_generic strlen_asimd
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index be13b916e5..fdbc751897 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -57,6 +57,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, memchr,
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_nosimd)
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_generic))
+  IFUNC_IMPL (i, name, memcmp,
+              IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_kunpeng)
+              IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_generic))
IFUNC_IMPL (i, name, strlen,
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
diff --git a/sysdeps/aarch64/multiarch/memcmp.c b/sysdeps/aarch64/multiarch/memcmp.c
new file mode 100644
index 0000000000..276bc1e30e
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcmp.c
@@ -0,0 +1,42 @@
+/* Multiple versions of memcmp. AARCH64 version.
+ Copyright (C) 2017-2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+/* Redefine memcmp so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below. */
+# undef memcmp
+# define memcmp __redirect_memcmp
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memcmp) __libc_memcmp;
+/* Candidate implementations; both share the prototype of memcmp. */
+extern __typeof (__redirect_memcmp) __memcmp_generic attribute_hidden;
+extern __typeof (__redirect_memcmp) __memcmp_kunpeng attribute_hidden;
+/* Use the Kunpeng variant when IS_KUNPENG (midr) holds, else the generic one. */
+libc_ifunc (__libc_memcmp,
+            (IS_KUNPENG (midr)
+             ? __memcmp_kunpeng
+             : __memcmp_generic));
+/* NOTE(review): no file in this patch defines IS_KUNPENG; confirm that
+   init-arch.h (or a header it pulls in) provides it. */
+# undef memcmp
+strong_alias (__libc_memcmp, memcmp);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memcmp_generic.S b/sysdeps/aarch64/multiarch/memcmp_generic.S
new file mode 100644
index 0000000000..88bac46075
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcmp_generic.S
@@ -0,0 +1,35 @@
+/* A Generic Optimized memcmp implementation for AARCH64.
+ Copyright (C) 2018-2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+# define MEMCMP __memcmp_generic /* Entry-point name used by ../memcmp.S.  */
+
+/* Do not hide the generic version of memcmp, we use it internally. */
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal memcmp calls through a PLT. */
+ .globl __GI_memcmp; __GI_memcmp = __memcmp_generic
+# endif
+#endif
+/* Assemble the baseline implementation under the name selected above. */
+#include "../memcmp.S"
diff --git a/sysdeps/aarch64/multiarch/memcmp_kunpeng.S b/sysdeps/aarch64/multiarch/memcmp_kunpeng.S
new file mode 100644
index 0000000000..30c937a18d
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memcmp_kunpeng.S
@@ -0,0 +1,187 @@
+/* Optimized memcmp for Huawei Kunpeng processor.
+
+ Copyright (C) 2013-2019 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Byte-wise comparison of `limit' bytes at src1 and src2: result is 0 when
+ * they match, otherwise its sign follows the first differing byte pair,
+ * compared as unsigned values.  Assumes ARMv8-a, AArch64, unaligned accesses.
+ */
+
+/* Parameters and result.  In the comments below, n is limit on entry. */
+#define src1 x0 /* First buffer; advanced as data is consumed. */
+#define src2 x1 /* Second buffer; advanced in step with src1. */
+#define limit x2 /* Byte count, kept biased as noted at each update. */
+#define result w0 /* Return value. */
+
+/* Internal variables. */
+#define data1 x3 /* First 8 bytes of the current src1 chunk. */
+#define data1w w3 /* 32-bit view of data1. */
+#define data1h x4 /* Second 8 bytes of the current src1 chunk. */
+#define data2 x5 /* First 8 bytes of the current src2 chunk. */
+#define data2w w5 /* 32-bit view of data2. */
+#define data2h x6 /* Second 8 bytes of the current src2 chunk. */
+#define tmp1 x7 /* Misalignment of src1 (src1 & 15). */
+#define tmp2 x8 /* NOTE(review): not referenced in this file. */
+
+#if IS_IN (libc)
+#define MEMCMP __memcmp_kunpeng
+
+ENTRY_ALIGN (MEMCMP, 6)
+ DELOUSE (0)
+ DELOUSE (1)
+ DELOUSE (2)
+
+ subs limit, limit, 16 /* limit = n - 16; flags feed the ccmp below. */
+ b.lo L(less16) /* n < 16. */
+
+ ldp data1, data1h, [src1], 16
+ ldp data2, data2h, [src2], 16
+ ccmp data1, data2, 0, ne /* n == 16 forces NE (NZCV = 0). */
+ ccmp data1h, data2h, 0, eq /* Compared only if the first words matched. */
+ b.ne L(return64) /* Mismatch found, or n == 16. */
+
+ subs limit, limit, 16 /* limit = n - 32. */
+ b.ls L(last_bytes) /* n <= 32: the tail load covers the rest. */
+ cmp limit, 112
+ b.lo L(loop16) /* n < 144: skip the 64-byte loop. */
+
+ and tmp1, src1, 15 /* tmp1 = misalignment of src1. */
+ add limit, limit, tmp1 /* Account for the bytes re-read by backing up. */
+ sub src1, src1, tmp1 /* src1 is now 16-byte aligned. */
+ sub src2, src2, tmp1 /* Same step back; src2 may remain unaligned. */
+ subs limit, limit, 48 /* limit = bytes left from src1, minus 64. */
+
+ /* Compare 64 bytes per iteration; src1 is 16-byte aligned here. */
+ .p2align 4
+L(loop64):
+ ldp data1, data1h, [src1] /* Bytes 0-15 of this iteration. */
+ ldp data2, data2h, [src2]
+ cmp data1, data2
+ ccmp data1h, data2h, 0, eq
+ b.ne L(return64)
+
+ ldp data1, data1h, [src1, 16] /* Bytes 16-31. */
+ ldp data2, data2h, [src2, 16]
+ cmp data1, data2
+ ccmp data1h, data2h, 0, eq
+ b.ne L(return64)
+
+ ldp data1, data1h, [src1, 32] /* Bytes 32-47. */
+ ldp data2, data2h, [src2, 32]
+ cmp data1, data2
+ ccmp data1h, data2h, 0, eq
+ b.ne L(return64)
+
+ ldp data1, data1h, [src1, 48] /* Bytes 48-63. */
+ ldp data2, data2h, [src2, 48]
+ cmp data1, data2
+ ccmp data1h, data2h, 0, eq
+ b.ne L(return64)
+
+ subs limit, limit, 64 /* Flags survive the two adds for b.pl. */
+ add src1, src1, 64
+ add src2, src2, 64
+ b.pl L(loop64) /* At least 64 bytes remain. */
+ adds limit, limit, 48 /* limit = bytes left - 16. */
+ b.lo L(last_bytes) /* No carry: fewer than 16 bytes left. */
+
+L(loop16): /* On entry limit = bytes left - 16. */
+ ldp data1, data1h, [src1], 16
+ ldp data2, data2h, [src2], 16
+ cmp data1, data2
+ ccmp data1h, data2h, 0, eq
+ b.ne L(return64)
+
+ subs limit, limit, 16
+ b.hi L(loop16) /* More than 16 bytes remain. */
+ /* limit <= 0 here: step back so the 16-byte load ends at the buffer end. */
+L(last_bytes):
+ add src1, src1, limit
+ add src2, src2, limit
+ ldp data1, data1h, [src1]
+ ldp data2, data2h, [src2]
+
+ /* Pick the first differing 8-byte pair and set result to 0, -1 or 1. */
+L(return64):
+ cmp data1, data2
+ bne L(return) /* First words differ: decide on them. */
+L(return_pre): /* NOTE(review): no branch in this file targets this label. */
+ mov data1, data1h
+ mov data2, data2h
+L(return):
+#ifndef __AARCH64EB__
+ rev data1, data1 /* Make the lowest-addressed byte most significant. */
+ rev data2, data2
+#endif
+ cmp data1, data2
+L(ret_eq): /* NOTE(review): no branch in this file targets this label. */
+ cset result, ne /* 0 if equal, otherwise 1. */
+ cneg result, result, lo /* Negate when data1 < data2 (unsigned). */
+ ret
+
+ .p2align 4
+L(less16):
+ adds limit, limit, 8 /* limit = n - 8. */
+ b.lo L(less8) /* n < 8. */
+ ldr data1, [src1]
+ ldr data2, [src2]
+ /* When n == 8 the ccmp forces NE, so the first load pair decides. */
+ ccmp data1, data2, 0, ne
+ b.ne L(return)
+
+ ldr data1, [src1, limit] /* Last 8 bytes; overlaps the first load. */
+ ldr data2, [src2, limit]
+ b L(return)
+
+ .p2align 4
+L(less8):
+ adds limit, limit, 4 /* limit = n - 4. */
+ b.lo L(less4) /* n < 4. */
+ ldr data1w, [src1]
+ ldr data2w, [src2]
+ ccmp data1, data2, 0, ne /* n == 4 forces NE; upper halves are zero. */
+ b.ne L(return)
+ ldr data1w, [src1, limit] /* Last 4 bytes; overlaps the first load. */
+ ldr data2w, [src2, limit]
+ b L(return)
+
+ .p2align 4
+L(less4):
+ adds limit, limit, 4 /* limit = n. */
+ beq L(ret_0) /* n == 0. */
+
+L(byte_loop):
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ subs limit, limit, 1
+ ccmp data1w, data2w, 0, ne /* Count exhausted forces NE (NZCV = 0b0000). */
+ b.eq L(byte_loop) /* Bytes equal and more remain. */
+ sub result, data1w, data2w /* Difference of the last byte pair read. */
+ ret
+L(ret_0):
+ mov result, 0
+ ret
+
+END (MEMCMP)
+#undef bcmp
+weak_alias (MEMCMP, bcmp) /* NOTE(review): memcmp.S in this patch also aliases bcmp; confirm. */
+libc_hidden_builtin_def (MEMCMP)
+#endif
--
2.14.1.windows.1