
[PATCH 3/3] aarch64: Optimized memchr specific to AmpereComputing skylark


This version uses general-register based memory instructions to load
data, because vector-register based loads are slightly slower on skylark.

Character matching is performed in parallel on a 16-byte memory block
(16 bytes in both size and alignment) each iteration, as sketched below.
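
The matching idea can be modeled in C roughly as follows; this is an
illustrative sketch only, not part of the patch, and the helper name
block16_has_char is hypothetical:

    #include <stdint.h>

    /* XOR each 8-byte half of the block with the repeated search
       character; a matching byte becomes zero, and the classic
       (x - 0x01..01) & ~x & 0x80..80 test detects a zero byte.  */
    static inline int
    block16_has_char (uint64_t lo, uint64_t hi, unsigned char c)
    {
      uint64_t rep = (uint64_t) c * 0x0101010101010101ULL; /* |c|...|c| */
      uint64_t x1 = lo ^ rep, x2 = hi ^ rep;
      uint64_t z1 = (x1 - 0x0101010101010101ULL) & ~x1
                    & 0x8080808080808080ULL;
      uint64_t z2 = (x2 - 0x0101010101010101ULL) & ~x2
                    & 0x8080808080808080ULL;
      return (z1 | z2) != 0;
    }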

Another minor optimization is to preload the next 16-byte block at the
beginning of each iteration, which hides some of the memory access
latency; see the loop skeleton below.
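
In C, the pipelined loop has roughly the following shape; again a
hypothetical sketch built on block16_has_char above, assuming p and
end16 point to the first and the last 16-byte aligned block:

    /* Issue the load of the next block before testing the current one,
       so the memory latency overlaps with the match computation.  The
       masking of bytes before the true start, the rev/clz search for
       the exact byte, and the final length check are omitted here.  */
    static const unsigned char *
    scan (const uint64_t *p, const uint64_t *end16, unsigned char c)
    {
      uint64_t lo = p[0], hi = p[1];               /* preloaded block */
      while (p != end16)
        {
          uint64_t next_lo = p[2], next_hi = p[3]; /* preload next */
          if (block16_has_char (lo, hi, c))
            return (const unsigned char *) p;      /* match in block */
          p += 2;
          lo = next_lo;
          hi = next_hi;
        }
      return block16_has_char (lo, hi, c) ? (const unsigned char *) p : 0;
    }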

    * sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR.
    [!MEMCHR](MEMCHR): Set to __memchr.
    * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
    Add memchr_generic and memchr_skylark.
    * sysdeps/aarch64/multiarch/ifunc-impl-list.c
    (__libc_ifunc_impl_list): Add memchr ifuncs.
    * sysdeps/aarch64/multiarch/memchr.c: New file.
    * sysdeps/aarch64/multiarch/memchr_generic.S: Likewise.
    * sysdeps/aarch64/multiarch/memchr_skylark.S: Likewise.
---
 ChangeLog                                   |  12 ++
 sysdeps/aarch64/memchr.S                    |  10 +-
 sysdeps/aarch64/multiarch/Makefile          |   1 +
 sysdeps/aarch64/multiarch/ifunc-impl-list.c |   3 +
 sysdeps/aarch64/multiarch/memchr.c          |  41 +++++
 sysdeps/aarch64/multiarch/memchr_generic.S  |  33 +++++
 sysdeps/aarch64/multiarch/memchr_skylark.S  | 222 ++++++++++++++++++++++++++++
 7 files changed, 319 insertions(+), 3 deletions(-)
 create mode 100644 sysdeps/aarch64/multiarch/memchr.c
 create mode 100644 sysdeps/aarch64/multiarch/memchr_generic.S
 create mode 100644 sysdeps/aarch64/multiarch/memchr_skylark.S

diff --git a/ChangeLog b/ChangeLog
index 28370f9..e64b8b3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2018-10-13  Feng Xue  <feng.xue@amperecomputing.com>
+
+	* sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR.
+	[!MEMCHR](MEMCHR): Set to __memchr.
+	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
+	Add memchr_generic and memchr_skylark.
+	* sysdeps/aarch64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add memchr ifuncs.
+	* sysdeps/aarch64/multiarch/memchr.c: New file.
+	* sysdeps/aarch64/multiarch/memchr_generic.S: Likewise.
+	* sysdeps/aarch64/multiarch/memchr_skylark.S: Likewise.
+
 2018-10-12  Feng Xue  <feng.xue@amperecomputing.com>
 
 	* sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S
index e422aef..4afebd3 100644
--- a/sysdeps/aarch64/memchr.S
+++ b/sysdeps/aarch64/memchr.S
@@ -26,6 +26,10 @@
  * Neon Available.
  */
 
+#ifndef MEMCHR
+# define MEMCHR __memchr
+#endif
+
 /* Arguments and results.  */
 #define srcin		x0
 #define chrin		w1
@@ -59,7 +63,7 @@
  * identify exactly which byte has matched.
  */
 
-ENTRY (__memchr)
+ENTRY (MEMCHR)
 	/* Do not dereference srcin if no bytes to compare.  */
 	cbz	cntin, L(zero_length)
 	/*
@@ -152,6 +156,6 @@ L(tail):
 L(zero_length):
 	mov	result, #0
 	ret
-END (__memchr)
-weak_alias (__memchr, memchr)
+END (MEMCHR)
+weak_alias (MEMCHR, memchr)
 libc_hidden_builtin_def (memchr)
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 828ce4f..353ece7 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -2,5 +2,6 @@ ifeq ($(subdir),string)
 sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
 		   memcpy_falkor memmove_falkor \
 		   memset_generic memset_falkor memset_skylark \
+		   memchr_generic memchr_skylark \
 		   strlen_generic strlen_asimd
 endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index baf01a0..f5014d2 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -53,6 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
 	      IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_skylark)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
+  IFUNC_IMPL (i, name, memchr,
+	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_skylark)
+	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_generic))
 
   IFUNC_IMPL (i, name, strlen,
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
diff --git a/sysdeps/aarch64/multiarch/memchr.c b/sysdeps/aarch64/multiarch/memchr.c
new file mode 100644
index 0000000..cbcf8b7
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memchr.c
@@ -0,0 +1,41 @@
+/* Multiple versions of memchr. AARCH64 version.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+
+#if IS_IN (libc)
+/* Redefine memchr so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+# undef memchr
+# define memchr __redirect_memchr
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__redirect_memchr) __memchr;
+
+extern __typeof (__redirect_memchr) __memchr_generic attribute_hidden;
+extern __typeof (__redirect_memchr) __memchr_skylark attribute_hidden;
+
+libc_ifunc (__memchr,
+	    ((IS_SKYLARK (midr)
+	       ? __memchr_skylark
+	       : __memchr_generic)));
+
+# undef memchr
+strong_alias (__memchr, memchr);
+#endif
diff --git a/sysdeps/aarch64/multiarch/memchr_generic.S b/sysdeps/aarch64/multiarch/memchr_generic.S
new file mode 100644
index 0000000..707148b
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memchr_generic.S
@@ -0,0 +1,33 @@
+/* Memchr for aarch64, default version for internal use.
+   Copyright (C) 2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define MEMCHR __memchr_generic
+
+/* Do not hide the generic version of memchr, we use it internally.  */
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name)
+
+/* Add a hidden definition for use within libc.so.  */
+# ifdef SHARED
+	.globl __GI_memchr; __GI_memchr = __memchr_generic
+# endif
+#endif
+
+# include "../memchr.S"
diff --git a/sysdeps/aarch64/multiarch/memchr_skylark.S b/sysdeps/aarch64/multiarch/memchr_skylark.S
new file mode 100644
index 0000000..9277104
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memchr_skylark.S
@@ -0,0 +1,222 @@
+/* Optimized memchr for AmpereComputing skylark processor.
+
+   Copyright (C) 2018 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+# define MEMCHR __memchr_skylark
+
+/* Arguments and results.  */
+#define srcin		x0
+#define chrin		x1
+#define cntin		x2
+
+#define result		x0
+
+#define repchr		x1
+
+#define tmp1		x2
+#define tmp2		x3
+#define tmp3		x4
+#define tmp4		x5
+
+#define src		x6
+#define srcend		x7
+#define srcend16	x8
+
+#define zeroones	x9
+
+#define data1		x10
+#define data2		x11
+#define data1a		x12
+#define data2a		x13
+
+#define has_chr1	x14
+#define has_chr2	x15
+
+#define REP8_01		0x0101010101010101
+#define REP8_7f		0x7f7f7f7f7f7f7f7f
+
+ENTRY_ALIGN (MEMCHR, 6)
+
+	DELOUSE (0)
+	DELOUSE (2)
+
+	/* Do not dereference srcin if no bytes to compare. */
+	cbz	cntin, L(none_chr)
+
+	bic	src, srcin, 15
+	/*
+	 * Load the first 16-byte aligned block, in which the memory
+	 * to search starts.
+	 */
+	ldp	data1, data2, [src]
+
+	mov	zeroones, REP8_01
+	and	repchr, chrin, 255
+	/* Generate a qword integer as |c|c|c|c|c|c|c|c|. */
+	mul	repchr, repchr, zeroones
+
+	add	srcend, srcin, cntin
+	/* srcend16 is the start address of the last 16-byte aligned block. */
+	sub	srcend, srcend, 1
+	bic	srcend16, srcend, 15
+
+	lsl	tmp1, srcin, 3
+	mov	tmp2, ~0
+#ifdef __AARCH64EB__
+	lsr	tmp3, tmp2, tmp1
+#else
+	lsl	tmp3, tmp2, tmp1
+#endif
+	/* Is the real starting point in the first or the second qword?  */
+	tst	srcin, 8
+
+	/*
+	 * Use XOR to turn any byte in the 16-byte block that equals
+	 * the specified char into zero.  Searching for the char then
+	 * becomes detecting a zero byte in the resulting two qword
+	 * integers.
+	 */
+	eor	data1a, data1, repchr
+	eor	data2a, data2, repchr
+
+	/*
+	 * Set the unused bytes (before the real starting point) to
+	 * 0xff, so that they will not trigger any zero detection.
+	 */
+	orn	tmp1, data1a, tmp3
+	orn	tmp2, data2a, tmp3
+
+	csinv	data1a, tmp1, xzr, eq
+	csel	data2a, data2a, tmp2, eq
+
+	cmp	src, srcend16
+	b.eq	L(last)
+
+L(loop):
+	/*
+	 * Preload the next 16-byte aligned block to hide some memory
+	 * access latency.
+	 */
+	ldp	data1, data2, [src, 16]!
+
+	/*
+	 * Use the following integer test to find out whether any byte
+	 * in a qword is zero.  If the qword contains no zero-valued
+	 * byte, the test result is zero.
+	 *
+	 *  (qword - 0x0101010101010101) & ~(qword) & 0x8080808080808080
+	 * =
+	 *  (qword - 0x0101010101010101) & ~(qword | 0x7f7f7f7f7f7f7f7f)
+	 *
+	 */
+	sub	tmp1, data1a, zeroones
+	sub	tmp2, data2a, zeroones
+
+	orr	tmp3, data1a, REP8_7f
+	orr	tmp4, data2a, REP8_7f
+
+	bics	has_chr1, tmp1, tmp3
+	bic	has_chr2, tmp2, tmp4
+
+	ccmp	has_chr2, 0, 0, eq	/* NZCV = 0000 */
+	ccmp	src, srcend16, 4, eq	/* NZCV = 0100 */
+
+	/*
+	 * Use XOR to turn any byte in the next 16-byte aligned block
+	 * that equals the specified char into zero.
+	 */
+	eor	data1a, data1, repchr
+	eor	data2a, data2, repchr
+
+	/*
+	 * Use an equal-or-not comparison here, instead of less/greater,
+	 * so that it works even in the case of address wrap-around.
+	 */
+	b.ne	L(loop)
+
+	cbz	has_chr1, 1f
+	sub	src, src, 16
+#ifdef __AARCH64EB__
+	/*
+	 * For big-endian, has_chr1/has_chr2 cannot be used directly,
+	 * because the two qwords are byte-reversed when loaded from
+	 * memory.  Instead, recover the two qwords, byte-swap them,
+	 * then perform the char detection on the swapped values.
+	 */
+	add	data2a, tmp1, zeroones
+	b	L(find_chr)
+#else
+	b	L(done)
+#endif
+
+1:	cbz	has_chr2, L(last)
+	sub	src, src, 8
+#ifdef __AARCH64EB__
+	add	data2a, tmp2, zeroones
+	b	L(find_chr)
+#else
+	mov	has_chr1, has_chr2
+	b	L(done)
+#endif
+
+L(last):
+#ifdef __AARCH64EB__
+	rev	data1a, data1a
+#endif
+	sub	tmp1, data1a, zeroones
+	orr	tmp3, data1a, REP8_7f
+	bics	has_chr1, tmp1, tmp3
+	b.ne    L(done)
+
+	add	src, src, 8
+#ifdef __AARCH64EB__
+L(find_chr):
+	rev	data2a, data2a
+#endif
+	sub	tmp1, data2a, zeroones
+	orr	tmp3, data2a, REP8_7f
+	bics	has_chr1, tmp1, tmp3
+	b.eq	L(none_chr)
+
+L(done):
+	/*
+	 * If the specified char is found in a qword, the most significant
+	 * bit of the corresponding byte in has_chr1 is set, though this
+	 * is guaranteed only for the first occurrence, not all of them.
+	 */
+	rev	has_chr1, has_chr1
+	bic	tmp1, src, 15
+	cmp	tmp1, srcend16	/* At the last 16-byte aligned block? */
+	clz	tmp2, has_chr1
+	add	src, src, tmp2, lsr 3
+	ccmp	src, srcend, 0, eq	/* NZCV = 0000 */
+	csel	result, src, xzr, ls
+	ret
+
+L(none_chr):
+	mov	result, 0
+	ret
+
+END (MEMCHR)
+libc_hidden_builtin_def (MEMCHR)
+
+#endif
-- 
1.8.3.1
