This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH v2 3/3] aarch64: Optimized memchr specific to AmpereComputing skylark
- From: Feng Xue <innat_xue at hotmail dot com>
- To: "libc-alpha at sourceware dot org" <libc-alpha at sourceware dot org>
- Cc: "marcus dot shawcroft at linaro dot org" <marcus dot shawcroft at linaro dot org>, "szabolcs dot nagy at arm dot com" <szabolcs dot nagy at arm dot com>, Richard Henderson <rth7680 at gmail dot com>, Feng Xue <feng dot xue at amperecomputing dot com>
- Date: Wed, 17 Oct 2018 08:45:12 +0000
- Subject: [PATCH v2 3/3] aarch64: Optimized memchr specific to AmpereComputing skylark
Although the prefetch load in the previous version can benefit performance, it might cause a segfault. This patch therefore removes it to ensure correct behaviour.
Feng
---
This version uses general-register-based memory instructions to load
data, because the vector-register-based ones are slightly slower on skylark.
Character matching is performed in parallel on one 16-byte memory block
(16 bytes in both size and alignment) per iteration.
* sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR.
[!MEMCHR](MEMCHR): Set to __memchr.
* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
Add memchr_generic and memchr_skylark.
* sysdeps/aarch64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add memchr ifuncs.
* sysdeps/aarch64/multiarch/memchr.c: New file.
* sysdeps/aarch64/multiarch/memchr_generic.S: Likewise.
* sysdeps/aarch64/multiarch/memchr_skylark.S: Likewise.
---
ChangeLog | 12 ++
sysdeps/aarch64/memchr.S | 10 +-
sysdeps/aarch64/multiarch/Makefile | 1 +
sysdeps/aarch64/multiarch/ifunc-impl-list.c | 3 +
sysdeps/aarch64/multiarch/memchr.c | 41 ++++++
sysdeps/aarch64/multiarch/memchr_generic.S | 33 +++++
sysdeps/aarch64/multiarch/memchr_skylark.S | 217 ++++++++++++++++++++++++++++
7 files changed, 314 insertions(+), 3 deletions(-)
create mode 100644 sysdeps/aarch64/multiarch/memchr.c
create mode 100644 sysdeps/aarch64/multiarch/memchr_generic.S
create mode 100644 sysdeps/aarch64/multiarch/memchr_skylark.S
diff --git a/ChangeLog b/ChangeLog
index 28370f9..e64b8b3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2018-10-13 Feng Xue <feng.xue@amperecomputing.com>
+
+ * sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR.
+ [!MEMCHR](MEMCHR): Set to __memchr.
+ * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
+ Add memchr_generic and memchr_skylark.
+ * sysdeps/aarch64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Add memchr ifuncs.
+ * sysdeps/aarch64/multiarch/memchr.c: New file.
+ * sysdeps/aarch64/multiarch/memchr_generic.S: Likewise.
+ * sysdeps/aarch64/multiarch/memchr_skylark.S: Likewise.
+
2018-10-12 Feng Xue <feng.xue@amperecomputing.com>
* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
index e422aef..4afebd3 100644
--- a/sysdeps/aarch64/memchr.S
+++ b/sysdeps/aarch64/memchr.S
@@ -26,6 +26,10 @@
* Neon Available.
*/
+#ifndef MEMCHR
+# define MEMCHR __memchr
+#endif
+
/* Arguments and results. */
#define srcin x0
#define chrin w1
@@ -59,7 +63,7 @@
* identify exactly which byte has matched.
*/
-ENTRY (__memchr)
+ENTRY (MEMCHR)
/* Do not dereference srcin if no bytes to compare. */
cbz cntin, L(zero_length)
/*
@@ -152,6 +156,6 @@ L(tail):
L(zero_length):
mov result, #0
ret
-END (__memchr)
-weak_alias (__memchr, memchr)
+END (MEMCHR)
+weak_alias (MEMCHR, memchr)
libc_hidden_builtin_def (memchr)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 828ce4f..353ece7 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -2,5 +2,6 @@ ifeq ($(subdir),string)
sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
memcpy_falkor memmove_falkor \
memset_generic memset_falkor memset_skylark \
+ memchr_generic memchr_skylark \
strlen_generic strlen_asimd
endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index baf01a0..f5014d2 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -53,6 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_skylark)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
+ IFUNC_IMPL (i, name, memchr,
+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_skylark)
+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_generic))
IFUNC_IMPL (i, name, strlen,
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
diff --git a/sysdeps/aarch64/multiarch/memchr.c b/sysdeps/aarch64/multiarch/memchr.c
new file mode 100644
index 0000000..cbcf8b7
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memchr.c
@@ -0,0 +1,41 @@
+/* Multiple versions of memchr. AARCH64 version.
+ Copyright (C) 2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+
+#if IS_IN (libc)
+/* Redefine memchr so that the compiler won't complain about the type
+ mismatch with the IFUNC selector in strong_alias, below. */
+# undef memchr
+# define memchr __redirect_memchr
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memchr) __memchr;
+
+extern __typeof (__redirect_memchr) __memchr_generic attribute_hidden;
+extern __typeof (__redirect_memchr) __memchr_skylark attribute_hidden;
+
+libc_ifunc (__memchr,
+ ((IS_SKYLARK (midr)
+ ? __memchr_skylark
+ : __memchr_generic)));
+
+# undef memchr
+strong_alias (__memchr, memchr);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memchr_generic.S b/sysdeps/aarch64/multiarch/memchr_generic.S
new file mode 100644
index 0000000..707148b
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memchr_generic.S
@@ -0,0 +1,33 @@
+/* Memchr for aarch64, default version for internal use.
+ Copyright (C) 2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define MEMCHR __memchr_generic
+
+/* Do not hide the generic version of memchr, we use it internally. */
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+
+/* Add a hidden definition for use within libc.so. */
+# ifdef SHARED
+ .globl __GI_memchr; __GI_memchr = __memchr_generic
+# endif
+#endif
+
+# include "../memchr.S"
diff --git a/sysdeps/aarch64/multiarch/memchr_skylark.S b/sysdeps/aarch64/multiarch/memchr_skylark.S
new file mode 100644
index 0000000..f4dbe58
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memchr_skylark.S
@@ -0,0 +1,217 @@
+/* Optimized memchr for AmpereComputing skylark processor.
+
+ Copyright (C) 2018 Free Software Foundation, Inc.
+
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+# define MEMCHR __memchr_skylark
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin x1
+#define cntin x2
+
+#define result x0
+
+#define repchr x1
+
+#define tmp1 x2
+#define tmp2 x3
+#define tmp3 x4
+#define tmp4 x5
+
+#define src x6
+#define srcend x7
+#define srcend16 x8
+
+#define anymore x9
+
+#define zeroones x10
+
+#define data1 x11
+#define data2 x12
+
+#define has_chr1 x13
+#define has_chr2 x14
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
+ENTRY_ALIGN (MEMCHR, 6)
+
+ DELOUSE (0)
+ DELOUSE (2)
+
+ /* Do not dereference srcin if no bytes to compare. */
+ cbz cntin, L(none_chr)
+
+ mov zeroones, REP8_01
+ and repchr, chrin, 255
+ /* Replicate the char into every byte of a qword: |c|c|c|c|c|c|c|c|. */
+ mul repchr, repchr, zeroones
+
+ /* Is the start address 16-byte aligned? (Flags used by b.eq below.) */
+ tst srcin, 15
+ bic src, srcin, 15
+
+ add srcend, srcin, cntin
+ /*
+ * srcend16 is the address of the block following the last block.
+ *
+ * [A block is 16-byte aligned and sized.]
+ */
+ add srcend16, srcend, 15
+ bic srcend16, srcend16, 15
+
+ b.eq L(loop)
+
+ /* Load the first block containing start address. */
+ ldp data1, data2, [src], 16
+
+ lsl tmp1, srcin, 3
+ mov tmp2, ~0
+#ifdef __AARCH64EB__
+ lsr tmp3, tmp2, tmp1
+#else
+ lsl tmp3, tmp2, tmp1
+#endif
+ /* Is the start address in the first or the second qword? */
+ tst srcin, 8
+
+ /*
+ * Transform any byte in the block to zero using XOR operation,
+ * if that byte equals the char to search. In this way, searching
+ * the char becomes detecting zero in the resulting two qwords.
+ */
+ eor data1, data1, repchr
+ eor data2, data2, repchr
+
+ /*
+ * Set the unused bytes (before the start address) to 0xff, so
+ * that they cannot trigger the zero detection.
+ */
+ orn tmp1, data1, tmp3
+ orn tmp2, data2, tmp3
+
+ csinv data1, tmp1, xzr, eq
+ csel data2, data2, tmp2, eq
+
+ /*
+ * When the first and last block are the same, there are two cases:
+ * o. The memory range to search lies within a single block.
+ * (start address - end address) < 0
+ *
+ * o. The memory range is so large that the end address wraps around.
+ * (start address - end address) > 0
+ */
+ cmp srcin, srcend
+ ccmp src, srcend16, 0, mi
+ csetm anymore, ne
+ b L(find_chr)
+
+ .p2align 4
+L(loop):
+ ldp data1, data2, [src], 16
+
+ subs anymore, src, srcend16
+
+ /*
+ * Transform any byte in the block to zero using XOR operation,
+ * if that byte equals the char to search.
+ */
+ eor data1, data1, repchr
+ eor data2, data2, repchr
+
+L(find_chr):
+ /*
+ * Use the following integer test to find out if any byte in a
+ * qword is zero. If the qword contains no zero-valued byte, the
+ * test result is zero.
+ *
+ * (qword - 0x0101010101010101) & ~(qword) & 0x8080808080808080
+ * =
+ * (qword - 0x0101010101010101) & ~(qword | 0x7f7f7f7f7f7f7f7f)
+ *
+ */
+ sub tmp1, data1, zeroones
+ sub tmp2, data2, zeroones
+
+ orr tmp3, data1, REP8_7f
+ orr tmp4, data2, REP8_7f
+
+ bic has_chr1, tmp1, tmp3
+ bic has_chr2, tmp2, tmp4
+
+ orr tmp1, has_chr1, has_chr2
+ ccmp tmp1, 0, 0, ne
+
+ b.eq L(loop)
+
+ cbz has_chr1, 1f
+#ifdef __AARCH64EB__
+ rev data1, data1
+#else
+ rev has_chr1, has_chr1
+#endif
+ sub result, src, 16
+ b L(done)
+
+1: cbz has_chr2, L(none_chr)
+#ifdef __AARCH64EB__
+ rev data1, data2
+#else
+ rev has_chr1, has_chr2
+#endif
+ sub result, src, 8
+
+L(done):
+#ifdef __AARCH64EB__
+ /*
+ * For big-endian, has_chr1/has_chr2 cannot be used directly because
+ * the two qwords have been reversed after loading from memory.
+ * Thus, char detection has to be performed again, this time on the
+ * byte-swapped qword held in data1.
+ */
+ sub tmp1, data1, zeroones
+ orr tmp3, data1, REP8_7f
+ bic has_chr1, tmp1, tmp3
+ rev has_chr1, has_chr1
+#endif
+
+ /*
+ * If the specified char is found in a qword, the top bit of the
+ * corresponding byte in has_chr is set. Only the first such byte
+ * is reliable; borrows may also mark bytes after a real match.
+ */
+ cmp anymore, 0
+ clz tmp1, has_chr1
+ add result, result, tmp1, lsr 3
+ ccmp result, srcend, 8, eq /* NZCV = 0b1000 */
+ csel result, result, xzr, mi
+ ret
+
+L(none_chr):
+ mov result, 0
+ ret
+
+END (MEMCHR)
+libc_hidden_builtin_def (MEMCHR)
+
+#endif
--
1.8.3.1