This is the mail archive of the
newlib@sourceware.org
mailing list for the newlib project.
Re: [Patch] Replace MIPS strcmp.c with assembly language version.
- From: Jeff Johnston <jjohnstn at redhat dot com>
- To: Steve Ellcey <sellcey at mips dot com>
- Cc: newlib at sourceware dot org, c at mips dot com
- Date: Wed, 8 Oct 2014 17:36:26 -0400 (EDT)
- Subject: Re: [Patch] Replace MIPS strcmp.c with assembly language version.
- Authentication-results: sourceware.org; auth=none
- References: <201410022049 dot s92KnoOb016486 at mipsswvm001 dot mips dot com>
I noticed that the glibc version has an LGPL license. Where did you get this
version with the BSD-style license?
I also assume this is noticeably better
than the much smaller NetBSD version of MIPS strcmp.S I found on the net:
http://cvsweb.netbsd.org/bsdweb.cgi/src/common/lib/libc/arch/mips/string/strcmp.S?rev=1.2&content-type=text/x-cvsweb-markup&only_with_tag=MAIN
-- Jeff J.
----- Original Message -----
From: "Steve Ellcey" <sellcey@mips.com>
To: newlib@sourceware.org
Cc: c@mips.com
Sent: Thursday, October 2, 2014 4:49:50 PM
Subject: [Patch] Replace MIPS strcmp.c with assembly language version.
I would like to replace the MIPS C strcmp.c with an assembly language
version that is a bit faster. Both versions load 4 bytes at a time on
aligned strings, but the assembly version gains its speed from being
scheduled by hand and from some loop unrolling.
This version is already checked in to glibc. It has been tested by
hand and by running the GCC testsuite.
OK for checkin?
Steve Ellcey
sellcey@mips.com
2014-10-02 Steve Ellcey <sellcey@mips.com>
* newlib/libc/machine/mips/strcmp.c: Remove.
* newlib/libc/machine/mips/strcmp.S: New.
* newlib/libc/machine/mips/Makefile.am (lib_a_SOURCES):
Replace strcmp.c with strcmp.S
* newlib/libc/machine/mips/Makefile.in: Regenerate.
diff --git a/newlib/libc/machine/mips/Makefile.am b/newlib/libc/machine/mips/Makefile.am
index 46b4cc3..1695b18 100644
--- a/newlib/libc/machine/mips/Makefile.am
+++ b/newlib/libc/machine/mips/Makefile.am
@@ -8,7 +8,7 @@ AM_CCASFLAGS = $(INCLUDES)
noinst_LIBRARIES = lib.a
-lib_a_SOURCES = setjmp.S strlen.c strcmp.c strncpy.c memset.S memcpy.S
+lib_a_SOURCES = setjmp.S strlen.c strcmp.S strncpy.c memset.S memcpy.S
lib_a_CCASFLAGS=$(AM_CCASFLAGS) -D_COMPILING_NEWLIB
lib_a_CFLAGS=$(AM_CFLAGS) -D_COMPILING_NEWLIB
diff --git a/newlib/libc/machine/mips/strcmp.S b/newlib/libc/machine/mips/strcmp.S
new file mode 100644
index 0000000..b1d757e
--- /dev/null
+++ b/newlib/libc/machine/mips/strcmp.S
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2014
+ * Imagination Technologies Limited.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY IMAGINATION TECHNOLOGIES LIMITED ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL IMAGINATION TECHNOLOGIES LIMITED BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifdef ANDROID_CHANGES
+# include "machine/asm.h"
+# include "machine/regdef.h"
+#elif _LIBC
+# include <sysdep.h>
+# include <regdef.h>
+# include <sys/asm.h>
+#elif _COMPILING_NEWLIB
+# include "machine/asm.h"
+# include "machine/regdef.h"
+#else
+# include <regdef.h>
+# include <sys/asm.h>
+#endif
+
+/* Technically strcmp should not read past the end of the strings being
+ compared. We will read a full word that may contain excess bytes beyond
+ the NUL string terminator but unless ENABLE_READAHEAD is set, we will not
+ read the next word after the end of string. Setting ENABLE_READAHEAD will
+ improve performance but is technically illegal based on the definition of
+ strcmp. */
+#ifdef ENABLE_READAHEAD
+# define DELAY_READ /* empty: the first load of the next STRCMP32 lands in the branch delay slot */
+#else
+# define DELAY_READ nop /* nop fills the delay slot, so the next word is loaded only after the zero-byte branch falls through */
+#endif
+
+/* Testing on a little endian machine showed using CLZ was a
+ performance loss, so we are not turning it on by default. */
+#if defined(ENABLE_CLZ) && (__mips_isa_rev > 1)
+# define USE_CLZ /* selects the clz based L(worddiff) sequence, which also uses rotrv (and wsbh/rotr on little endian) */
+#endif
+
+/* Some asm.h files do not have the L macro definition. */
+#ifndef L
+# if _MIPS_SIM == _ABIO32
+# define L(label) $L ## label /* O32: labels are spelled $L<label> */
+# else
+# define L(label) .L ## label /* any other ABI: labels are spelled .L<label> */
+# endif
+#endif
+
+/* Some asm.h files do not have the PTR_ADDIU macro definition. */
+#ifndef PTR_ADDIU
+# ifdef USE_DOUBLE /* NOTE(review): USE_DOUBLE is not defined in the visible part of this file; confirm who sets it, otherwise this fallback always selects addiu */
+# define PTR_ADDIU daddiu
+# else
+# define PTR_ADDIU addiu
+# endif
+#endif
+
+/* Allow the routine to be named something else if desired. */
+#ifndef STRCMP_NAME
+# define STRCMP_NAME strcmp /* default symbol name passed to LEAF and END */
+#endif
+
+#ifdef ANDROID_CHANGES /* entry point: a0 and a1 hold the two string pointers, the result is returned in v0 */
+LEAF(STRCMP_NAME, 0)
+#else
+LEAF(STRCMP_NAME)
+#endif
+ .set nomips16
+ .set noreorder /* delay slots are scheduled by hand: the instruction after each branch or jump always executes */
+
+ or t0, a0, a1 /* t0 = a0 | a1 */
+ andi t0,0x3 /* keep the two low address bits of either pointer */
+ bne t0, zero, L(byteloop) /* a low bit is set: not both 4 byte aligned, compare byte by byte */
+
+/* Both strings are 4 byte aligned at this point. */
+
+ lui t8, 0x0101 /* sits in the delay slot of the bne above, so it also runs on the byte path */
+ ori t8, t8, 0x0101 /* t8 = 0x01010101 */
+ lui t9, 0x7f7f
+ ori t9, 0x7f7f /* t9 = 0x7f7f7f7f */
+
+#define STRCMP32(OFFSET) /* compare one word pair at OFFSET; ends on a branch, so the following instruction is its delay slot */ \
+ lw v0, OFFSET(a0); /* v0 = word from the first string */ \
+ lw v1, OFFSET(a1); /* v1 = word from the second string */ \
+ subu t0, v0, t8; /* t0 = v0 - 0x01010101 */ \
+ bne v0, v1, L(worddiff); /* words differ: go locate the byte */ \
+ nor t1, v0, t9; /* delay slot: t1 = ~(v0 | 0x7f7f7f7f) */ \
+ and t0, t0, t1; /* nonzero when v0 contains a zero byte */ \
+ bne t0, zero, L(returnzero) /* equal words that contain a zero byte: the strings are equal */
+
+L(wordloop): /* unrolled: eight word pairs (32 bytes) are compared per iteration */
+ STRCMP32(0)
+ DELAY_READ /* delay slot position of the preceding branch: a nop, or empty when ENABLE_READAHEAD is defined */
+ STRCMP32(4)
+ DELAY_READ
+ STRCMP32(8)
+ DELAY_READ
+ STRCMP32(12)
+ DELAY_READ
+ STRCMP32(16)
+ DELAY_READ
+ STRCMP32(20)
+ DELAY_READ
+ STRCMP32(24)
+ DELAY_READ
+ STRCMP32(28)
+ PTR_ADDIU a0, a0, 32 /* delay slot of the last STRCMP32 branch: step the first pointer past the 32 bytes */
+ b L(wordloop)
+ PTR_ADDIU a1, a1, 32 /* delay slot of the loop branch: step the second pointer */
+
+L(returnzero): /* reached when two equal words contain a zero byte */
+ j ra
+ move v0, zero /* delay slot: return 0 */
+
+L(worddiff): /* v0 != v1 here: find, in string order, the first byte pair that differs or where v0 has a zero byte */
+#ifdef USE_CLZ
+ and t0, v0, t9 /* build an exact zero-byte mask; a borrow based (v0 - 0x01010101) mask would also flag a 0x01 byte just above a zero byte */
+ addu t0, t0, t9 /* per byte: bit 7 set when the low seven bits are nonzero (0x7f + 0x7f cannot carry into the next byte) */
+ or t0, t0, v0 /* bit 7 now set for every nonzero byte of v0 */
+ nor t1, t0, t9 /* t1 = 0x80 in exactly the zero bytes of v0 */
+ xor t0, v0, v1 /* bits where the two words differ */
+ or t0, t0, t1 /* marks every differing byte and every zero byte */
+# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ wsbh t0, t0
+ rotr t0, t0, 16 /* with wsbh: reverse the bytes so the lowest addressed byte is most significant */
+# endif
+ clz t1, t0
+ and t1, 0xf8 /* t1 = 0, 8, 16 or 24: bit offset of the first marked byte, counted from the top */
+# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ xori t1, t1, 24 /* t1 = 24 - t1, exact for the values 0, 8, 16 and 24 */
+# endif
+ rotrv v0, v0, t1
+ rotrv v1, v1, t1 /* rotate the selected byte of each word into bits 7..0 */
+ and v0, v0, 0xff
+ and v1, v1, 0xff
+ j ra
+ subu v0, v0, v1 /* delay slot: result = difference of the two selected bytes */
+#else /* USE_CLZ */
+# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* walk the bytes from least to most significant */
+ andi t0, v0, 0xff
+ beq t0, zero, L(wexit01) /* a zero byte in the first word ends the comparison */
+ andi t1, v1, 0xff /* delay slot */
+ bne t0, t1, L(wexit01)
+
+ srl t8, v0, 8 /* delay slot; t8/t9 are reused as scratch from here on */
+ srl t9, v1, 8
+ andi t8, t8, 0xff
+ beq t8, zero, L(wexit89)
+ andi t9, t9, 0xff /* delay slot */
+ bne t8, t9, L(wexit89)
+
+ srl t0, v0, 16 /* delay slot */
+ srl t1, v1, 16
+ andi t0, t0, 0xff
+ beq t0, zero, L(wexit01)
+ andi t1, t1, 0xff /* delay slot */
+ bne t0, t1, L(wexit01)
+
+ srl t8, v0, 24 /* delay slot; the last byte pair falls through to L(wexit89) */
+ srl t9, v1, 24
+# else /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */ /* otherwise walk the bytes from most to least significant */
+ srl t0, v0, 24
+ beq t0, zero, L(wexit01) /* a zero byte in the first word ends the comparison */
+ srl t1, v1, 24 /* delay slot */
+ bne t0, t1, L(wexit01)
+
+ srl t8, v0, 16 /* delay slot; t8/t9 are reused as scratch from here on */
+ srl t9, v1, 16
+ andi t8, t8, 0xff
+ beq t8, zero, L(wexit89)
+ andi t9, t9, 0xff /* delay slot */
+ bne t8, t9, L(wexit89)
+
+ srl t0, v0, 8 /* delay slot */
+ srl t1, v1, 8
+ andi t0, t0, 0xff
+ beq t0, zero, L(wexit01)
+ andi t1, t1, 0xff /* delay slot */
+ bne t0, t1, L(wexit01)
+
+ andi t8, v0, 0xff /* delay slot; the last byte pair falls through to L(wexit89) */
+ andi t9, v1, 0xff
+# endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */
+
+L(wexit89):
+ j ra
+ subu v0, t8, t9 /* delay slot: result from the t8/t9 byte pair */
+L(wexit01):
+ j ra
+ subu v0, t0, t1 /* delay slot: result from the t0/t1 byte pair */
+#endif /* USE_CLZ */
+
+/* It might seem better to do the 'beq' instruction between the two 'lbu'
+ instructions so that the nop is not needed but testing showed that this
+ code is actually faster (based on glibc strcmp test). */
+#define BYTECMP01(OFFSET) /* compare one byte pair at OFFSET using v0/v1 */ \
+ lbu v0, OFFSET(a0); \
+ lbu v1, OFFSET(a1); \
+ beq v0, zero, L(bexit01); /* zero byte in the first string */ \
+ nop; /* delay slot of the beq */ \
+ bne v0, v1, L(bexit01) /* bytes differ; the instruction that follows the macro is this branch's delay slot */
+
+#define BYTECMP89(OFFSET) /* same test on t8/t9: its first load runs in the delay slot of a preceding BYTECMP01 branch and must leave v0/v1 intact */ \
+ lbu t8, OFFSET(a0); \
+ lbu t9, OFFSET(a1); \
+ beq t8, zero, L(bexit89); /* zero byte in the first string */ \
+ nop; /* delay slot of the beq */ \
+ bne t8, t9, L(bexit89) /* bytes differ; the instruction that follows the macro is this branch's delay slot */
+
+L(byteloop): /* unrolled: eight byte pairs per iteration, alternating the v0/v1 and t8/t9 register pairs */
+ BYTECMP01(0)
+ BYTECMP89(1)
+ BYTECMP01(2)
+ BYTECMP89(3)
+ BYTECMP01(4)
+ BYTECMP89(5)
+ BYTECMP01(6)
+ BYTECMP89(7)
+ PTR_ADDIU a0, a0, 8 /* delay slot of the last BYTECMP89 branch: step the first pointer */
+ b L(byteloop)
+ PTR_ADDIU a1, a1, 8 /* delay slot of the loop branch: step the second pointer */
+
+L(bexit01):
+ j ra
+ subu v0, v0, v1 /* delay slot: result = difference of the bytes loaded by BYTECMP01 */
+L(bexit89):
+ j ra
+ subu v0, t8, t9 /* delay slot: result = difference of the bytes loaded by BYTECMP89 */
+
+ .set at /* NOTE(review): no matching .set noat is visible in this file */
+ .set reorder /* undoes the .set noreorder issued after LEAF */
+
+END(STRCMP_NAME)
+#ifndef ANDROID_CHANGES
+# ifdef _LIBC
+libc_hidden_builtin_def (STRCMP_NAME) /* emitted only when _LIBC is defined and ANDROID_CHANGES is not */
+# endif
+#endif