[AArch64] Add optimized strchr implementation.
Richard Earnshaw
rearnsha@arm.com
Tue Jun 10 14:08:00 GMT 2014
This patch adds a hand-optimized assembly routine for strchr. It makes
heavy use of Neon, permitting strings to be searched in chunks of 32
bytes at a time. Benchmarking on Cortex-A57 suggests that this is very
rarely slower than a 'good' C implementation, but can be up to 3.5 times
as fast.
Because we use LD1 for fetching bytes from memory, there's no difference
between big-endian and little-endian code.
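For readers who would rather see the idea in C, here is a rough scalar
model of the syndrome scheme the routine uses. It is purely
illustrative: the names build_syndrome and model_strchr are invented
for this sketch and do not appear in the patch, and like the real code
it assumes that the rest of an aligned 32-byte hunk containing part of
the string can be read safely.

  #include <stddef.h>
  #include <stdint.h>

  /* Build a 64-bit syndrome for a 32-byte hunk: bit 2*i is set if
     hunk[i] matches the target character, bit 2*i + 1 is set if
     hunk[i] is a NUL.  */
  static uint64_t
  build_syndrome (const unsigned char *hunk, unsigned char c)
  {
    uint64_t syn = 0;
    for (int i = 0; i < 32; i++)
      {
        if (hunk[i] == c)
          syn |= (uint64_t) 1 << (2 * i);
        if (hunk[i] == 0)
          syn |= (uint64_t) 1 << (2 * i + 1);
      }
    return syn;
  }

  char *
  model_strchr (const char *s, int c)
  {
    const unsigned char *hunk
      = (const unsigned char *) ((uintptr_t) s & ~(uintptr_t) 31);
    unsigned off = (uintptr_t) s & 31;

    /* The first hunk may start before the string, so clear the two
       syndrome bits belonging to each padding byte.  */
    uint64_t syn = build_syndrome (hunk, (unsigned char) c);
    syn &= ~(uint64_t) 0 << (2 * off);

    while (syn == 0)
      {
        hunk += 32;
        syn = build_syndrome (hunk, (unsigned char) c);
      }

    /* The lowest set bit identifies the first interesting byte: an
       even position means the character was found, an odd one means
       the end of the string came first.  */
    unsigned tz = __builtin_ctzll (syn);
    return (tz & 1) ? NULL : (char *) hunk + (tz >> 1);
  }

The assembly computes the same syndrome with two CMEQs per 16-byte
half, ANDs against the 0x40100401/0x80200802 masks followed by two
pairwise ADDP reductions, and then decodes it with RBIT + CLZ rather
than a count-trailing-zeros builtin.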
2014-06-10 Richard Earnshaw <rearnsha@arm.com>
* libc/machine/aarch64/strchr.S: New file.
* libc/machine/aarch64/strchr-stub.c: New file.
* libc/machine/aarch64/Makefile.am: Add them to the build list.
* libc/machine/aarch64/Makefile.in: Regenerated.
Committed.
R.
Index: Makefile.am
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/aarch64/Makefile.am,v
retrieving revision 1.7
diff -u -r1.7 Makefile.am
--- Makefile.am 10 Jan 2013 13:02:19 -0000 1.7
+++ Makefile.am 10 Jun 2014 13:58:53 -0000
@@ -18,6 +18,8 @@
lib_a_SOURCES += memset-stub.c
lib_a_SOURCES += memset.S
lib_a_SOURCES += setjmp.S
+lib_a_SOURCES += strchr-stub.c
+lib_a_SOURCES += strchr.S
lib_a_SOURCES += strcmp-stub.c
lib_a_SOURCES += strcmp.S
lib_a_SOURCES += strlen-stub.c
Index: Makefile.in
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/aarch64/Makefile.in,v
retrieving revision 1.8
diff -u -r1.8 Makefile.in
--- Makefile.in 10 Jan 2013 13:02:19 -0000 1.8
+++ Makefile.in 10 Jun 2014 13:58:53 -0000
@@ -73,7 +73,8 @@
lib_a-memcpy-stub.$(OBJEXT) lib_a-memcpy.$(OBJEXT) \
lib_a-memmove-stub.$(OBJEXT) lib_a-memmove.$(OBJEXT) \
lib_a-memset-stub.$(OBJEXT) lib_a-memset.$(OBJEXT) \
- lib_a-setjmp.$(OBJEXT) lib_a-strcmp-stub.$(OBJEXT) \
+ lib_a-setjmp.$(OBJEXT) lib_a-strchr-stub.$(OBJEXT) \
+ lib_a-strchr.$(OBJEXT) lib_a-strcmp-stub.$(OBJEXT) \
lib_a-strcmp.$(OBJEXT) lib_a-strlen-stub.$(OBJEXT) \
lib_a-strlen.$(OBJEXT) lib_a-strncmp-stub.$(OBJEXT) \
lib_a-strncmp.$(OBJEXT) lib_a-strnlen-stub.$(OBJEXT) \
@@ -205,8 +206,8 @@
noinst_LIBRARIES = lib.a
lib_a_SOURCES = memcmp-stub.c memcmp.S memcpy-stub.c memcpy.S \
memmove-stub.c memmove.S memset-stub.c memset.S setjmp.S \
- strcmp-stub.c strcmp.S strlen-stub.c strlen.S strncmp-stub.c \
- strncmp.S strnlen-stub.c strnlen.S
+ strchr-stub.c strchr.S strcmp-stub.c strcmp.S strlen-stub.c \
+ strlen.S strncmp-stub.c strncmp.S strnlen-stub.c strnlen.S
lib_a_CCASFLAGS = $(AM_CCASFLAGS)
lib_a_CFLAGS = $(AM_CFLAGS)
ACLOCAL_AMFLAGS = -I ../../.. -I ../../../..
@@ -299,6 +300,12 @@
lib_a-setjmp.obj: setjmp.S
$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-setjmp.obj `if test -f 'setjmp.S'; then $(CYGPATH_W) 'setjmp.S'; else $(CYGPATH_W) '$(srcdir)/setjmp.S'; fi`
+lib_a-strchr.o: strchr.S
+ $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-strchr.o `test -f 'strchr.S' || echo '$(srcdir)/'`strchr.S
+
+lib_a-strchr.obj: strchr.S
+ $(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-strchr.obj `if test -f 'strchr.S'; then $(CYGPATH_W) 'strchr.S'; else $(CYGPATH_W) '$(srcdir)/strchr.S'; fi`
+
lib_a-strcmp.o: strcmp.S
$(CCAS) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(lib_a_CCASFLAGS) $(CCASFLAGS) -c -o lib_a-strcmp.o `test -f 'strcmp.S' || echo '$(srcdir)/'`strcmp.S
@@ -353,6 +360,12 @@
lib_a-memset-stub.obj: memset-stub.c
$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(lib_a_CFLAGS) $(CFLAGS) -c -o lib_a-memset-stub.obj `if test -f 'memset-stub.c'; then $(CYGPATH_W) 'memset-stub.c'; else $(CYGPATH_W) '$(srcdir)/memset-stub.c'; fi`
+lib_a-strchr-stub.o: strchr-stub.c
+ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(lib_a_CFLAGS) $(CFLAGS) -c -o lib_a-strchr-stub.o `test -f 'strchr-stub.c' || echo '$(srcdir)/'`strchr-stub.c
+
+lib_a-strchr-stub.obj: strchr-stub.c
+ $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(lib_a_CFLAGS) $(CFLAGS) -c -o lib_a-strchr-stub.obj `if test -f 'strchr-stub.c'; then $(CYGPATH_W) 'strchr-stub.c'; else $(CYGPATH_W) '$(srcdir)/strchr-stub.c'; fi`
+
lib_a-strcmp-stub.o: strcmp-stub.c
$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(lib_a_CFLAGS) $(CFLAGS) -c -o lib_a-strcmp-stub.o `test -f 'strcmp-stub.c' || echo '$(srcdir)/'`strcmp-stub.c
Index: strchr-stub.c
===================================================================
RCS file: strchr-stub.c
diff -N strchr-stub.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ strchr-stub.c 10 Jun 2014 13:58:53 -0000
@@ -0,0 +1,31 @@
+/* Copyright (c) 2014, ARM Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the company nor the names of its contributors
+ may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
+# include "../../string/strchr.c"
+#else
+/* See strchr.S */
+#endif
Index: strchr.S
===================================================================
RCS file: strchr.S
diff -N strchr.S
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ strchr.S 10 Jun 2014 13:58:53 -0000
@@ -0,0 +1,164 @@
+/*
+ strchr - find a character in a string
+
+ Copyright (c) 2014, ARM Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the company nor the names of its contributors
+ may be used to endorse or promote products derived from this
+ software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
+
+#if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
+/* See strchr-stub.c */
+#else
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64
+ * Neon Available.
+ */
+
+/* Arguments and results. */
+#define srcin x0
+#define chrin w1
+
+#define result x0
+
+#define src x2
+#define tmp1 x3
+#define wtmp2 w4
+#define tmp3 x5
+
+#define vrepchr v0
+#define vdata1 v1
+#define vdata2 v2
+#define vhas_nul1 v3
+#define vhas_nul2 v4
+#define vhas_chr1 v5
+#define vhas_chr2 v6
+#define vrepmask_0 v7
+#define vrepmask_c v16
+#define vend1 v17
+#define vend2 v18
+
+/* Core algorithm.
+
+ For each 32-byte hunk we calculate a 64-bit syndrome value, with
+ two bits per byte (LSB is always in bits 0 and 1, for both big
+ and little-endian systems). For each tuple, bit 0 is set iff
+ the relevant byte matched the requested character; bit 1 is set
+ iff the relevant byte matched the NUL end of string (we trigger
+ off bit 0 for the special case of looking for NUL). Since the bits
+ in the syndrome reflect exactly the order in which things occur
+ in the original string, a count_trailing_zeros() operation will
+ identify exactly which byte is causing the termination, and why. */
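+
+/* For example, if byte 5 of a hunk is the terminating NUL and no
+ earlier byte matches the target character, then bit 11 is the lowest
+ set bit of the syndrome: count_trailing_zeros() returns 11, which is
+ odd, so the search has failed. Had byte 5 matched the target
+ character instead, the count would be the even value 10, and
+ 10 >> 1 == 5 recovers the byte's offset within the hunk. */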
+
+/* Locals and temporaries. */
+
+ .macro def_fn f p2align=0
+ .text
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn strchr
+ /* Magic constant 0x40100401 to allow us to identify which lane
+ matches the requested byte. Magic constant 0x80200802 used
+ similarly for NUL termination. */
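+ /* Viewed bytewise the two constants are {0x01,0x04,0x10,0x40} and
+ {0x02,0x08,0x20,0x80}, so once the compare results have been ANDed
+ with them, two pairwise ADDP reductions can pack the 2-bit tuples
+ for four consecutive bytes into one syndrome byte without any
+ tuple overlapping another. */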
+ mov wtmp2, #0x0401
+ movk wtmp2, #0x4010, lsl #16
+ dup vrepchr.16b, chrin
+ bic src, srcin, #31 /* Work with aligned 32-byte hunks. */
+ dup vrepmask_c.4s, wtmp2
+ ands tmp1, srcin, #31
+ add vrepmask_0.4s, vrepmask_c.4s, vrepmask_c.4s /* equiv: lsl #1 */
+ b.eq .Lloop
+
+ /* Input string is not 32-byte aligned. Rather than forcing
+ the padding bytes to a safe value, we calculate the syndrome
+ for all the bytes, but then mask off those bits of the
+ syndrome that are related to the padding. */
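+ /* The mask is formed as ~0 >> (64 - 2 * (srcin & 31)): one set bit
+ for each syndrome bit that belongs to a byte before the start of
+ the string; the BIC below then clears exactly those bits. */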
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ neg tmp1, tmp1
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ lsl tmp1, tmp1, #1
+ addp vend1.16b, vend1.16b, vend2.16b // 256->128
+ mov tmp3, #~0
+ addp vend1.16b, vend1.16b, vend2.16b // 128->64
+ lsr tmp1, tmp3, tmp1
+
+ mov tmp3, vend1.2d[0]
+ bic tmp1, tmp3, tmp1 // Mask padding bits.
+ cbnz tmp1, .Ltail
+
+.Lloop:
+ ld1 {vdata1.16b, vdata2.16b}, [src], #32
+ cmeq vhas_nul1.16b, vdata1.16b, #0
+ cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
+ cmeq vhas_nul2.16b, vdata2.16b, #0
+ cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
+ /* Use a fast check for the termination condition. */
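+ /* The full syndrome is only needed once a match or a NUL has been
+ seen, so inside the loop the four compare results are simply ORed
+ together and reduced to 64 bits to test for any non-zero byte. */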
+ orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ orr vend1.16b, vend1.16b, vend2.16b
+ addp vend1.2d, vend1.2d, vend1.2d
+ mov tmp1, vend1.2d[0]
+ cbz tmp1, .Lloop
+
+ /* Termination condition found. Now need to establish exactly why
+ we terminated. */
+ and vhas_nul1.16b, vhas_nul1.16b, vrepmask_0.16b
+ and vhas_nul2.16b, vhas_nul2.16b, vrepmask_0.16b
+ and vhas_chr1.16b, vhas_chr1.16b, vrepmask_c.16b
+ and vhas_chr2.16b, vhas_chr2.16b, vrepmask_c.16b
+ orr vend1.16b, vhas_nul1.16b, vhas_chr1.16b
+ orr vend2.16b, vhas_nul2.16b, vhas_chr2.16b
+ addp vend1.16b, vend1.16b, vend2.16b // 256->128
+ addp vend1.16b, vend1.16b, vend2.16b // 128->64
+
+ mov tmp1, vend1.2d[0]
+.Ltail:
+ /* Count the trailing zeros, by bit reversing... */
+ rbit tmp1, tmp1
+ /* Re-bias source. */
+ sub src, src, #32
+ clz tmp1, tmp1 /* And counting the leading zeros. */
+ /* tmp1 is even if the target character was found first. Otherwise
+ we've found the end of string and we weren't looking for NUL. */
+ tst tmp1, #1
+ add result, src, tmp1, lsr #1
+ csel result, result, xzr, eq
+ ret
+
+ .size strchr, . - strchr
+#endif