[PATCH, ARM] Improved strcmp for armv6/v7-a

Richard Earnshaw <rearnsha@arm.com>
Tue Apr 22 12:12:00 GMT 2014


This patch adds improved strcmp implementations for ARMv6 and ARMv7-A.
It also restructures the code for the various variants into separate
files.  The code is now sufficiently complex that having it all in one
file was becoming very confusing.

Benchmarking such a change is notoriously difficult, given the number
of variables that can affect performance and the number of CPU
implementations the code may run on, but across a range of tests the
improvements are often in the range of 10-50%, and only very rarely is
there a regression of more than 1%.
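
For context: the new ARMv6 and ARMv7-A inner loops compare a whole
word (or, via LDRD, two words) per iteration, using UADD8 and SEL to
fold the end-of-string test and the difference test into a single
"syndrome" value.  The core test, essentially as it appears in
strcmp-armv6.S and strcmp-armv7.S below (const_m1 holds 0xffffffff):

	uadd8	syndrome, data1, const_m1	/* Sets GE bit for each non-zero byte of data1.  */
	eor	syndrome, data1, data2		/* Byte-wise difference.  */
	sel	syndrome, syndrome, const_m1	/* Diff where GE set, 0xff where data1 byte is NUL.  */

A non-zero syndrome therefore means either a mismatch or the end of
the first string, so one compare-and-branch guards each iteration.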

Another feature of the new code is an optional (build-time) pre-check
on the strings.  If strcmp is likely to be used with a lot of random
strings, then checking the first character of each string can save a
lot of overhead, at the minor expense of having to do that work a
second time.  For some use cases, however, that check is rarely of
benefit (e.g. if you're normally using strcmp to validate a match in a
sparse hash table), so it is possible to disable it at compile time by
defining STRCMP_NO_PRECHECK.  The pre-check is enabled by default
since it costs little and in general use is more likely to be
beneficial than not.
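
Concretely, the pre-check is just a few instructions ahead of the
register saves (shown here as in the ARMv7-A version below):

	ldrb	r2, [src1]		/* First byte of each string.  */
	ldrb	r3, [src2]
	cmp	r2, #1			/* Carry clear iff r2 is NUL.  */
	it	cs
	cmpcs	r2, r3			/* Compare the bytes only if r2 is non-NUL.  */
	bne	.Lfastpath_exit		/* NUL or mismatch: return r2 - r3.  */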

A final minor change is that the ARMv7-A code now avoids any
ARMv8-deprecated IT sequences.
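
(For reference: ARMv8 broadly deprecates any IT block that covers more
than a single 16-bit instruction.  The ARMv7-M code below, which is
not affected, still uses multi-instruction blocks such as

	itttt	eq
	biceq	syndrome, syndrome, data1
	tsteq	syndrome, #0x80808080
	ldreq	data1, [src1], #4
	ldreq	data2, [src2], #4

whereas the ARMv7-A code keeps to single 16-bit conditional
instructions and uses CBZ/CBNZ or plain branches everywhere else, e.g.

	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, .Ldiff_in_a

Both snippets are taken from the files in this patch.)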

	* libc/machine/arm/strcmp-arm-tiny.S: New file.
	* libc/machine/arm/strcmp-armv4.S: New file.
	* libc/machine/arm/strcmp-armv4t.S: New file.
	* libc/machine/arm/strcmp-armv6.S: New file.
	* libc/machine/arm/strcmp-armv7.S: New file.
	* libc/machine/arm/strcmp-armv7m.S: New file.
	* libc/machine/arm/strcmp.S: Replace with wrapper for various
	implementations.
	* libc/machine/arm/Makefile.am (strcmp.o, strcmp.obj): Add
	dependencies.
	* libc/machine/arm/Makefile.in: Regenerated.
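
(As the strcmp.S entry implies, the wrapper selects an implementation
by #include, which is why the extra Makefile dependencies are needed.
A schematic sketch follows; the selection conditions shown here are
illustrative only, the real architecture tests are in the full patch:

	#include "arm_asm.h"
	/* ... endian helpers (S2LO, S2HI, MSB, LSB) and the def_fn macro ... */
	#if defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED)
	# include "strcmp-arm-tiny.S"
	#elif defined (__ARM_ARCH_7A__)	/* Illustrative condition only.  */
	# include "strcmp-armv7.S"
	#else
	# include "strcmp-armv4.S"
	#endif

Each variant file then supplies the body of strcmp in terms of the
macros the wrapper defines.)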

I'll commit this momentarily.

R.
-------------- next part --------------
Index: Makefile.am
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/arm/Makefile.am,v
retrieving revision 1.11
diff -u -r1.11 Makefile.am
--- Makefile.am	3 Jun 2013 14:02:10 -0000	1.11
+++ Makefile.am	22 Apr 2014 11:52:56 -0000
@@ -18,7 +18,13 @@
 CONFIG_STATUS_DEPENDENCIES = $(newlib_basedir)/configure.host
 
 MEMCPY_DEP=memcpy-armv7a.S memcpy-armv7m.S
+STRCMP_DEP=strcmp-arm-tiny.S strcmp-armv4.S strcmp-armv4t.S strcmp-armv6.S \
+	strcmp-armv7.S strcmp-armv7m.S
 
 $(lpfx)memcpy.o: $(MEMCPY_DEP)
 
 $(lpfx)memcpy.obj: $(MEMCPY_DEP)
+
+$(lpfx)strcmp.o: $(STRCMP_DEP)
+
+$(lpfx)strcmp.obj: $(STRCMP_DEP)
Index: Makefile.in
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/arm/Makefile.in,v
retrieving revision 1.25
diff -u -r1.25 Makefile.in
--- Makefile.in	3 Jun 2013 14:02:10 -0000	1.25
+++ Makefile.in	22 Apr 2014 11:52:56 -0000
@@ -209,6 +209,9 @@
 ACLOCAL_AMFLAGS = -I ../../.. -I ../../../..
 CONFIG_STATUS_DEPENDENCIES = $(newlib_basedir)/configure.host
 MEMCPY_DEP = memcpy-armv7a.S memcpy-armv7m.S
+STRCMP_DEP = strcmp-arm-tiny.S strcmp-armv4.S strcmp-armv4t.S strcmp-armv6.S \
+	strcmp-armv7.S strcmp-armv7m.S
+
 all: all-am
 
 .SUFFIXES:
@@ -508,6 +511,10 @@
 
 $(lpfx)memcpy.obj: $(MEMCPY_DEP)
 
+$(lpfx)strcmp.o: $(STRCMP_DEP)
+
+$(lpfx)strcmp.obj: $(STRCMP_DEP)
+
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
 .NOEXPORT:
Index: strcmp-arm-tiny.S
===================================================================
RCS file: strcmp-arm-tiny.S
diff -N strcmp-arm-tiny.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ strcmp-arm-tiny.S	22 Apr 2014 11:52:56 -0000
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2012-2014 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Tiny version of strcmp in ARM state.  Used only when optimizing
+   for size.  Also supports Thumb-2.  */
+
+	.syntax unified
+def_fn strcmp
+	.cfi_startproc
+1:
+	ldrb	r2, [r0], #1
+	ldrb	r3, [r1], #1
+	cmp	r2, #1
+	it	cs
+	cmpcs	r2, r3
+	beq	1b
+2:
+	subs	r0, r2, r3
+	RETURN
+	.cfi_endproc
+	.size	strcmp, . - strcmp
Index: strcmp-armv4.S
===================================================================
RCS file: strcmp-armv4.S
diff -N strcmp-armv4.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ strcmp-armv4.S	22 Apr 2014 11:52:56 -0000
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2012-2014 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+	/* Basic ARM implementation.  This should run on anything except
+	   for ARMv6-M, but there are better implementations for later
+	   revisions of the architecture.  This version can support ARMv4T
+	   ARM/Thumb interworking.  */
+/* Parameters and result.  */
+#define src1		r0
+#define src2		r1
+#define result		r0	/* Overlaps src1.  */
+
+/* Internal variables.  */
+#define data1		r2
+#define data2		r3
+#define magic1		r4
+#define tmp2		r5
+#define tmp1		r12
+#define syndrome	r12	/* Overlaps tmp1 */
+
+	.arm
+def_fn strcmp
+	.cfi_startproc
+	eor	tmp1, src1, src2
+	tst	tmp1, #3
+	/* Strings not at same byte offset from a word boundary.  */
+	bne	.Lstrcmp_unaligned
+	ands	tmp1, src1, #3
+	bic	src1, src1, #3
+	bic	src2, src2, #3
+	ldr	data1, [src1], #4
+	ldreq	data2, [src2], #4
+	beq	1f
+	/* Although s1 and s2 have identical initial alignment, they are
+	   not currently word aligned.	Rather than comparing bytes,
+	   make sure that any bytes fetched from before the addressed
+	   bytes are forced to 0xff.  Then they will always compare
+	   equal.  */
+	eor	tmp1, tmp1, #3
+	mvn	data2, #MSB
+	lsl	tmp1, tmp1, #3
+	S2LO	tmp1, data2, tmp1
+	ldr	data2, [src2], #4
+	orr	data1, data1, tmp1
+	orr	data2, data2, tmp1
+1:
+	/* Load the 'magic' constant 0x01010101.	*/
+	str	r4, [sp, #-4]!
+	.cfi_def_cfa_offset 4
+	.cfi_offset 4, -4
+	mov	magic1, #1
+	orr	magic1, magic1, magic1, lsl #8
+	orr	magic1, magic1, magic1, lsl #16
+	.p2align	2
+4:
+	sub	syndrome, data1, magic1
+	cmp	data1, data2
+	/* check for any zero bytes in first word */
+	biceq	syndrome, syndrome, data1
+	tsteq	syndrome, magic1, lsl #7
+	ldreq	data1, [src1], #4
+	ldreq	data2, [src2], #4
+	beq	4b
+2:
+	/* There's a zero or a different byte in the word */
+	S2HI	result, data1, #24
+	S2LO	data1, data1, #8
+	cmp	result, #1
+	cmpcs	result, data2, S2HI #24
+	S2LOEQ	data2, data2, #8
+	beq	2b
+	/* On a big-endian machine, RESULT contains the desired byte in bits
+	   0-7; on a little-endian machine they are in bits 24-31.  In
+	   both cases the other bits in RESULT are all zero.  For DATA2 the
+	   interesting byte is at the other end of the word, but the
+	   other bits are not necessarily zero.	 We need a signed result
+	   representing the difference in the unsigned bytes, so for the
+	   little-endian case we can't just shift the interesting bits
+	   up.	*/
+#ifdef __ARM_BIG_ENDIAN
+	sub	result, result, data2, lsr #24
+#else
+	and	data2, data2, #255
+	rsb	result, data2, result, lsr #24
+#endif
+	ldr	r4, [sp], #4
+	.cfi_restore 4
+	.cfi_def_cfa_offset 0
+	RETURN
+
+
+#if 0
+	/* The assembly code below is based on the following algorithm.  */
+#ifdef __ARM_BIG_ENDIAN
+#define RSHIFT <<
+#define LSHIFT >>
+#else
+#define RSHIFT >>
+#define LSHIFT <<
+#endif
+
+#define body(shift)							\
+  mask = 0xffffffffU RSHIFT shift;					\
+  data1 = *src1++;							\
+  data2 = *src2++;							\
+  do									\
+    {									\
+      tmp2 = data1 & mask;						\
+      if (__builtin_expect(tmp2 != data2 RSHIFT shift, 0))		\
+	{								\
+	  data2 RSHIFT= shift;						\
+	  break;							\
+	}								\
+      if (__builtin_expect(((data1 - b1) & ~data1) & (b1 << 7), 0))	\
+	{								\
+	  /* See comment in assembler below re syndrome on big-endian */\
+	  if ((((data1 - b1) & ~data1) & (b1 << 7)) & mask)		\
+	    data2 RSHIFT= shift;					\
+	  else								\
+	    {								\
+	      data2 = *src2;						\
+	      tmp2 = data1 RSHIFT (32 - shift);				\
+	      data2 = (data2 LSHIFT (32 - shift)) RSHIFT (32 - shift);	\
+	    }								\
+	  break;							\
+	}								\
+      data2 = *src2++;							\
+      tmp2 ^= data1;							\
+      if (__builtin_expect(tmp2 != data2 LSHIFT (32 - shift), 0))	\
+	{								\
+	  tmp2 = data1 >> (32 - shift);					\
+	  data2 = (data2 << (32 - shift)) RSHIFT (32 - shift);		\
+	  break;							\
+	}								\
+      data1 = *src1++;							\
+    } while (1)
+
+  const unsigned* src1;
+  const unsigned* src2;
+  unsigned data1, data2;
+  unsigned mask;
+  unsigned shift;
+  unsigned b1 = 0x01010101;
+  char c1, c2;
+  unsigned tmp2;
+
+  while (((unsigned) s1) & 3)
+    {
+      c1 = *s1++;
+      c2 = *s2++;
+      if (c1 == 0 || c1 != c2)
+	return c1 - (int)c2;
+    }
+  src1 = (unsigned*) (((unsigned)s1) & ~3);
+  src2 = (unsigned*) (((unsigned)s2) & ~3);
+  tmp2 = ((unsigned) s2) & 3;
+  if (tmp2 == 1)
+    {
+      body(8);
+    }
+  else if (tmp2 == 2)
+    {
+      body(16);
+    }
+  else
+    {
+      body (24);
+    }
+
+  do
+    {
+#ifdef __ARM_BIG_ENDIAN
+      c1 = (char) (tmp2 >> 24);
+      c2 = (char) (data2 >> 24);
+#else /* not  __ARM_BIG_ENDIAN */
+      c1 = (char) tmp2;
+      c2 = (char) data2;
+#endif /* not  __ARM_BIG_ENDIAN */
+      tmp2 RSHIFT= 8;
+      data2 RSHIFT= 8;
+    } while (c1 != 0 && c1 == c2);
+  return c1 - c2;
+#endif /* 0 */
+
+
+	/* First of all, compare bytes until src1 is word-aligned.  */
+.Lstrcmp_unaligned:
+	tst	src1, #3
+	beq	2f
+	ldrb	data1, [src1], #1
+	ldrb	data2, [src2], #1
+	cmp	data1, #1
+	cmpcs	data1, data2
+	beq	.Lstrcmp_unaligned
+	sub	result, data1, data2
+	RETURN
+
+2:
+	stmfd	sp!, {r4, r5}
+	.cfi_def_cfa_offset 8
+	.cfi_offset 4, -8
+	.cfi_offset 5, -4
+	mov	magic1, #1
+	orr	magic1, magic1, magic1, lsl #8
+	orr	magic1, magic1, magic1, lsl #16
+
+	ldr	data1, [src1], #4
+	and	tmp2, src2, #3
+	bic	src2, src2, #3
+	ldr	data2, [src2], #4
+	cmp	tmp2, #2
+	beq	.Loverlap2
+	bhi	.Loverlap1
+
+	/* Critical inner Loop: Block with 3 bytes initial overlap */
+	.p2align	2
+.Loverlap3:
+	bic	tmp2, data1, #MSB
+	cmp	tmp2, data2, S2LO #8
+	sub	syndrome, data1, magic1
+	bic	syndrome, syndrome, data1
+	bne	4f
+	ands	syndrome, syndrome, magic1, lsl #7
+	ldreq	data2, [src2], #4
+	bne	5f
+	eor	tmp2, tmp2, data1
+	cmp	tmp2, data2, S2HI #24
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap3
+4:
+	S2LO	data2, data2, #8
+	b	.Lstrcmp_tail
+
+5:
+#ifdef __ARM_BIG_ENDIAN
+	/* The syndrome value may contain false ones if the string ends
+	with the bytes 0x01 0x00.  */
+	tst	data1, #0xff000000
+	tstne	data1, #0x00ff0000
+	tstne	data1, #0x0000ff00
+	beq	.Lstrcmp_done_equal
+#else
+	bics	syndrome, syndrome, #0xff000000
+	bne	.Lstrcmp_done_equal
+#endif
+	ldrb	data2, [src2]
+	S2LO	tmp2, data1, #24
+#ifdef __ARM_BIG_ENDIAN
+	lsl	data2, data2, #24
+#endif
+	b	.Lstrcmp_tail
+
+6:
+	S2LO	tmp2, data1, #24
+	and	data2, data2, #LSB
+	b	.Lstrcmp_tail
+
+	/* Critical inner Loop: Block with 2 bytes initial overlap.  */
+	.p2align	2
+.Loverlap2:
+	S2HI	tmp2, data1, #16
+	sub	syndrome, data1, magic1
+	S2LO	tmp2, tmp2, #16
+	bic	syndrome, syndrome, data1
+	cmp	tmp2, data2, S2LO #16
+	bne	4f
+	ands	syndrome, syndrome, magic1, lsl #7
+	ldreq	data2, [src2], #4
+	bne	5f
+	eor	tmp2, tmp2, data1
+	cmp	tmp2, data2, S2HI #16
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap2
+
+5:
+#ifdef __ARM_BIG_ENDIAN
+	/* The syndrome value may contain false ones if the string ends
+	with the bytes 0x01 0x00 */
+	tst	data1, #0xff000000
+	tstne	data1, #0x00ff0000
+	beq	.Lstrcmp_done_equal
+#else
+	lsls	syndrome, syndrome, #16
+	bne	.Lstrcmp_done_equal
+#endif
+	ldrh	data2, [src2]
+	S2LO	tmp2, data1, #16
+#ifdef __ARM_BIG_ENDIAN
+	lsl	data2, data2, #16
+#endif
+	b	.Lstrcmp_tail
+
+6:
+	S2HI	data2, data2, #16
+	S2LO	tmp2, data1, #16
+4:
+	S2LO	data2, data2, #16
+	b	.Lstrcmp_tail
+
+	/* Critical inner Loop: Block with 1 byte initial overlap.  */
+	.p2align	2
+.Loverlap1:
+	and	tmp2, data1, #LSB
+	cmp	tmp2, data2, S2LO #24
+	sub	syndrome, data1, magic1
+	bic	syndrome, syndrome, data1
+	bne	4f
+	ands	syndrome, syndrome, magic1, lsl #7
+	ldreq	data2, [src2], #4
+	bne	5f
+	eor	tmp2, tmp2, data1
+	cmp	tmp2, data2, S2HI #8
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap1
+4:
+	S2LO	data2, data2, #24
+	b	.Lstrcmp_tail
+5:
+	/* The syndrome value may contain false ones if the string ends
+	   with the bytes 0x01 0x00.  */
+	tst	data1, #LSB
+	beq	.Lstrcmp_done_equal
+	ldr	data2, [src2], #4
+6:
+	S2LO	tmp2, data1, #8
+	bic	data2, data2, #MSB
+	b	.Lstrcmp_tail
+.Lstrcmp_done_equal:
+	mov	result, #0
+	.cfi_remember_state
+	ldmfd	sp!, {r4, r5}
+	.cfi_restore 4
+	.cfi_restore 5
+	.cfi_def_cfa_offset 0
+	RETURN
+
+.Lstrcmp_tail:
+	.cfi_restore_state
+	and	r2, tmp2, #LSB
+	and	result, data2, #LSB
+	cmp	result, #1
+	cmpcs	result, r2
+	S2LOEQ	tmp2, tmp2, #8
+	S2LOEQ	data2, data2, #8
+	beq	.Lstrcmp_tail
+	sub	result, r2, result
+	ldmfd	sp!, {r4, r5}
+	.cfi_restore 4
+	.cfi_restore 5
+	.cfi_def_cfa_offset 0
+	RETURN
+	.cfi_endproc
+	.size strcmp, . - strcmp
Index: strcmp-armv4t.S
===================================================================
RCS file: strcmp-armv4t.S
diff -N strcmp-armv4t.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ strcmp-armv4t.S	22 Apr 2014 11:52:56 -0000
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2012-2014 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+	/* This version is only used when we want a very basic Thumb1
+	   implementation or for size, otherwise we use the base ARMv4
+	   version.  This is also suitable for ARMv6-M.  */
+
+	.thumb
+	.syntax unified
+	.arch	armv4t
+	.eabi_attribute Tag_also_compatible_with, "\006\013" /* ARMv6-M.  */
+	.eabi_attribute Tag_ARM_ISA_use, 0
+def_fn strcmp
+	.cfi_startproc
+1:
+	ldrb	r2, [r0]
+	ldrb	r3, [r1]
+	cmp	r2, #0
+	beq	2f
+	adds	r0, r0, #1
+	adds	r1, r1, #1
+	cmp	r2, r3
+	beq	1b
+2:
+	subs	r0, r2, r3
+	bx	lr
+	.cfi_endproc
+	.size	strcmp, . - strcmp
Index: strcmp-armv6.S
===================================================================
RCS file: strcmp-armv6.S
diff -N strcmp-armv6.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ strcmp-armv6.S	22 Apr 2014 11:52:56 -0000
@@ -0,0 +1,469 @@
+/*
+ * Copyright (c) 2012-2014 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+	/* Implementation of strcmp for ARMv6.  Use ldrd to support wider
+	   loads, provided the data is sufficiently aligned.  Use
+	   saturating arithmetic to optimize the compares.  */
+
+	/* Build Options:
+	   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
+	   byte in the string.  If comparing completely random strings
+	   the pre-check will save time, since there is a very high
+	   probability of a mismatch in the first character: we save
+	   significant overhead if this is the common case.  However,
+	   if strings are likely to be identical (eg because we're
+	   verifying a hit in a hash table), then this check is largely
+	   redundant.  */
+
+	.arm
+
+/* Parameters and result.  */
+#define src1		r0
+#define src2		r1
+#define result		r0	/* Overlaps src1.  */
+
+/* Internal variables.  */
+#define tmp1		r4
+#define tmp2		r5
+#define const_m1	r12
+
+/* Additional internal variables for 64-bit aligned data.  */
+#define data1a		r2
+#define data1b		r3
+#define data2a		r6
+#define data2b		r7
+#define syndrome_a	tmp1
+#define syndrome_b	tmp2
+
+/* Additional internal variables for 32-bit aligned data.  */
+#define data1		r2
+#define data2		r3
+#define syndrome	tmp2
+
+
+	/* Macro to compute and return the result value for word-aligned
+	   cases.  */
+	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
+#ifdef __ARM_BIG_ENDIAN
+	/* If data1 contains a zero byte, then syndrome will contain a 1 in
+	   bit 7 of that byte.  Otherwise, the highest set bit in the
+	   syndrome will highlight the first different bit.  It is therefore
+	   sufficient to extract the eight bits starting with the syndrome
+	   bit.  */
+	clz	tmp1, \synd
+	lsl	r1, \d2, tmp1
+	.if \restore_r6
+	ldrd	r6, r7, [sp, #8]
+	.endif
+	.cfi_restore 6
+	.cfi_restore 7
+	lsl	\d1, \d1, tmp1
+	.cfi_remember_state
+	lsr	result, \d1, #24
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	sub	result, result, r1, lsr #24
+	bx	lr
+#else
+	/* To use the big-endian trick we'd have to reverse all three words.
+	   that's slower than this approach.  */
+	rev	\synd, \synd
+	clz	tmp1, \synd
+	bic	tmp1, tmp1, #7
+	lsr	r1, \d2, tmp1
+	.cfi_remember_state
+	.if \restore_r6
+	ldrd	r6, r7, [sp, #8]
+	.endif
+	.cfi_restore 6
+	.cfi_restore 7
+	lsr	\d1, \d1, tmp1
+	and	result, \d1, #255
+	and	r1, r1, #255
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	sub	result, result, r1
+
+	bx	lr
+#endif
+	.endm
+
+	.text
+	.p2align	5
+.Lstrcmp_start_addr:
+#ifndef STRCMP_NO_PRECHECK
+.Lfastpath_exit:
+	sub	r0, r2, r3
+	bx	lr
+#endif
+def_fn	strcmp
+#ifndef STRCMP_NO_PRECHECK
+	ldrb	r2, [src1]
+	ldrb	r3, [src2]
+	cmp	r2, #1
+	cmpcs	r2, r3
+	bne	.Lfastpath_exit
+#endif
+	.cfi_startproc
+	strd	r4, r5, [sp, #-16]!
+	.cfi_def_cfa_offset 16
+	.cfi_offset 4, -16
+	.cfi_offset 5, -12
+	orr	tmp1, src1, src2
+	strd	r6, r7, [sp, #8]
+	.cfi_offset 6, -8
+	.cfi_offset 7, -4
+	mvn	const_m1, #0
+	tst	tmp1, #7
+	beq	.Lloop_aligned8
+
+.Lnot_aligned:
+	eor	tmp1, src1, src2
+	tst	tmp1, #7
+	bne	.Lmisaligned8
+
+	/* Deal with mutual misalignment by aligning downwards and then
+	   masking off the unwanted loaded data to prevent a difference.  */
+	and	tmp1, src1, #7
+	bic	src1, src1, #7
+	and	tmp2, tmp1, #3
+	bic	src2, src2, #7
+	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
+	ldrd	data1a, data1b, [src1], #16
+	tst	tmp1, #4
+	ldrd	data2a, data2b, [src2], #16
+	/* In ARM code we can't use ORN, but we do have MVN with a
+	   register shift.  */
+	mvn	tmp1, const_m1, S2HI tmp2
+	orr	data1a, data1a, tmp1
+	orr	data2a, data2a, tmp1
+	beq	.Lstart_realigned8
+	orr	data1b, data1b, tmp1
+	mov	data1a, const_m1
+	orr	data2b, data2b, tmp1
+	mov	data2a, const_m1
+	b	.Lstart_realigned8
+
+	/* Unwind the inner loop by a factor of 2, giving 16 bytes per
+	   pass.  */
+	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
+	.p2align 2	/* Always word aligned.  */
+.Lloop_aligned8:
+	ldrd	data1a, data1b, [src1], #16
+	ldrd	data2a, data2b, [src2], #16
+.Lstart_realigned8:
+	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
+	eor	syndrome_a, data1a, data2a
+	sel	syndrome_a, syndrome_a, const_m1
+	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
+	eor	syndrome_b, data1b, data2b
+	sel	syndrome_b, syndrome_b, const_m1
+	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
+	bne	.Ldiff_found
+
+	ldrd	data1a, data1b, [src1, #-8]
+	ldrd	data2a, data2b, [src2, #-8]
+	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
+	eor	syndrome_a, data1a, data2a
+	sel	syndrome_a, syndrome_a, const_m1
+	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
+	eor	syndrome_b, data1b, data2b
+	sel	syndrome_b, syndrome_b, const_m1
+	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
+	beq	.Lloop_aligned8
+
+.Ldiff_found:
+	cmp	syndrome_a, #0
+	bne	.Ldiff_in_a
+
+.Ldiff_in_b:
+	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
+
+.Ldiff_in_a:
+	.cfi_restore_state
+	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
+
+	.cfi_restore_state
+.Lmisaligned8:
+	tst	tmp1, #3
+	bne	.Lmisaligned4
+	ands	tmp1, src1, #3
+	bne	.Lmutual_align4
+
+	/* Unrolled by a factor of 2, to reduce the number of post-increment
+	   operations.  */
+.Lloop_aligned4:
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+.Lstart_realigned4:
+	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
+	eor	syndrome, data1, data2
+	sel	syndrome, syndrome, const_m1
+	cmp	syndrome, #0
+	bne	.Laligned4_done
+
+	ldr	data1, [src1, #-4]
+	ldr	data2, [src2, #-4]
+	uadd8	syndrome, data1, const_m1
+	eor	syndrome, data1, data2
+	sel	syndrome, syndrome, const_m1
+	cmp	syndrome, #0
+	beq	.Lloop_aligned4
+
+.Laligned4_done:
+	strcmp_epilogue_aligned syndrome, data1, data2, 0
+
+.Lmutual_align4:
+	.cfi_restore_state
+	/* Deal with mutual misalignment by aligning downwards and then
+	   masking off the unwanted loaded data to prevent a difference.  */
+	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
+	bic	src1, src1, #3
+	ldr	data1, [src1], #8
+	bic	src2, src2, #3
+	ldr	data2, [src2], #8
+
+	/* In ARM code we can't use ORN, but we do have MVN with a
+	   register shift.  */
+	mvn	tmp1, const_m1, S2HI tmp1
+	orr	data1, data1, tmp1
+	orr	data2, data2, tmp1
+	b	.Lstart_realigned4
+
+.Lmisaligned4:
+	ands	tmp1, src1, #3
+	beq	.Lsrc1_aligned
+	sub	src2, src2, tmp1
+	bic	src1, src1, #3
+	lsls	tmp1, tmp1, #31
+	ldr	data1, [src1], #4
+	beq	.Laligned_m2
+	bcs	.Laligned_m1
+
+#ifdef STRCMP_NO_PRECHECK
+	ldrb	data2, [src2, #1]
+	uxtb	tmp1, data1, ror #BYTE1_OFFSET
+	cmp	tmp1, #1
+	cmpcs	tmp1, data2
+	bne	.Lmisaligned_exit
+
+.Laligned_m2:
+	ldrb	data2, [src2, #2]
+	uxtb	tmp1, data1, ror #BYTE2_OFFSET
+	cmp	tmp1, #1
+	cmpcs	tmp1, data2
+	bne	.Lmisaligned_exit
+
+.Laligned_m1:
+	ldrb	data2, [src2, #3]
+	uxtb	tmp1, data1, ror #BYTE3_OFFSET
+	cmp	tmp1, #1
+	cmpcs	tmp1, data2
+	beq	.Lsrc1_aligned
+
+#else  /* STRCMP_NO_PRECHECK */
+	/* If we've done the pre-check, then we don't need to check the
+	   first byte again here.  */
+	ldrb	data2, [src2, #2]
+	uxtb	tmp1, data1, ror #BYTE2_OFFSET
+	cmp	tmp1, #1
+	cmpcs	tmp1, data2
+	bne	.Lmisaligned_exit
+
+.Laligned_m2:
+	ldrb	data2, [src2, #3]
+	uxtb	tmp1, data1, ror #BYTE3_OFFSET
+	cmp	tmp1, #1
+	cmpcs	tmp1, data2
+	beq	.Laligned_m1
+#endif
+
+.Lmisaligned_exit:
+	.cfi_remember_state
+	sub	result, tmp1, data2
+	ldr	r4, [sp], #16
+	.cfi_restore 4
+	bx	lr
+
+#ifndef STRCMP_NO_PRECHECK
+.Laligned_m1:
+	add	src2, src2, #4
+#endif
+.Lsrc1_aligned:
+	.cfi_restore_state
+	/* src1 is word aligned, but src2 has no common alignment
+	   with it.  */
+	ldr	data1, [src1], #4
+	lsls	tmp1, src2, #31		/* C=src2[1], Z=!src2[0].  */
+
+	bic	src2, src2, #3
+	ldr	data2, [src2], #4
+	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
+	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */
+
+	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
+.Loverlap3:
+	bic	tmp1, data1, #MSB
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #8
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cmp	syndrome, #0
+	ldreq	data2, [src2], #4
+	bne	5f
+
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #24
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap3
+4:
+	S2LO	data2, data2, #8
+	b	.Lstrcmp_tail
+
+5:
+	bics	syndrome, syndrome, #MSB
+	bne	.Lstrcmp_done_equal
+
+	/* We can only get here if the MSB of data1 contains 0, so
+	   fast-path the exit.  */
+	ldrb	result, [src2]
+	.cfi_remember_state
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	/* R6/7 Not used in this sequence.  */
+	.cfi_restore 6
+	.cfi_restore 7
+	neg	result, result
+	bx	lr
+
+6:
+	.cfi_restore_state
+	S2LO	data1, data1, #24
+	and	data2, data2, #LSB
+	b	.Lstrcmp_tail
+
+	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
+.Loverlap2:
+	and	tmp1, data1, const_m1, S2LO #16
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #16
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cmp	syndrome, #0
+	ldreq	data2, [src2], #4
+	bne	5f
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #16
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap2
+4:
+	S2LO	data2, data2, #16
+	b	.Lstrcmp_tail
+5:
+	ands	syndrome, syndrome, const_m1, S2LO #16
+	bne	.Lstrcmp_done_equal
+
+	ldrh	data2, [src2]
+	S2LO	data1, data1, #16
+#ifdef __ARM_BIG_ENDIAN
+	lsl	data2, data2, #16
+#endif
+	b	.Lstrcmp_tail
+
+6:
+	S2LO	data1, data1, #16
+	and	data2, data2, const_m1, S2LO #16
+	b	.Lstrcmp_tail
+
+	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
+.Loverlap1:
+	and	tmp1, data1, #LSB
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #24
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cmp	syndrome, #0
+	ldreq	data2, [src2], #4
+	bne	5f
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #8
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap1
+4:
+	S2LO	data2, data2, #24
+	b	.Lstrcmp_tail
+5:
+	tst	syndrome, #LSB
+	bne	.Lstrcmp_done_equal
+	ldr	data2, [src2]
+6:
+	S2LO	data1, data1, #8
+	bic	data2, data2, #MSB
+	b	.Lstrcmp_tail
+
+.Lstrcmp_done_equal:
+	mov	result, #0
+	.cfi_remember_state
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	/* R6/7 not used in this sequence.  */
+	.cfi_restore 6
+	.cfi_restore 7
+	bx	lr
+
+.Lstrcmp_tail:
+	.cfi_restore_state
+#ifndef __ARM_BIG_ENDIAN
+	rev	data1, data1
+	rev	data2, data2
+	/* Now everything looks big-endian...  */
+#endif
+	uadd8	tmp1, data1, const_m1
+	eor	tmp1, data1, data2
+	sel	syndrome, tmp1, const_m1
+	clz	tmp1, syndrome
+	lsl	data1, data1, tmp1
+	lsl	data2, data2, tmp1
+	lsr	result, data1, #24
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	/* R6/7 not used in this sequence.  */
+	.cfi_restore 6
+	.cfi_restore 7
+	sub	result, result, data2, lsr #24
+	bx	lr
+	.cfi_endproc
+	.size strcmp, . - .Lstrcmp_start_addr
Index: strcmp-armv7.S
===================================================================
RCS file: strcmp-armv7.S
diff -N strcmp-armv7.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ strcmp-armv7.S	22 Apr 2014 11:52:56 -0000
@@ -0,0 +1,468 @@
+/*
+ * Copyright (c) 2012-2014 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+	/* Implementation of strcmp for ARMv7 when DSP instructions are
+	   available.  Use ldrd to support wider loads, provided the data
+	   is sufficiently aligned.  Use saturating arithmetic to optimize
+	   the compares.  */
+
+	/* Build Options:
+	   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
+	   byte in the string.  If comparing completely random strings
+	   the pre-check will save time, since there is a very high
+	   probability of a mismatch in the first character: we save
+	   significant overhead if this is the common case.  However,
+	   if strings are likely to be identical (eg because we're
+	   verifying a hit in a hash table), then this check is largely
+	   redundant.  */
+
+	/* This version uses Thumb-2 code.  */
+	.thumb
+	.syntax unified
+
+/* Parameters and result.  */
+#define src1		r0
+#define src2		r1
+#define result		r0	/* Overlaps src1.  */
+
+/* Internal variables.  */
+#define tmp1		r4
+#define tmp2		r5
+#define const_m1	r12
+
+/* Additional internal variables for 64-bit aligned data.  */
+#define data1a		r2
+#define data1b		r3
+#define data2a		r6
+#define data2b		r7
+#define syndrome_a	tmp1
+#define syndrome_b	tmp2
+
+/* Additional internal variables for 32-bit aligned data.  */
+#define data1		r2
+#define data2		r3
+#define syndrome	tmp2
+
+
+	/* Macro to compute and return the result value for word-aligned
+	   cases.  */
+	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
+#ifdef __ARM_BIG_ENDIAN
+	/* If data1 contains a zero byte, then syndrome will contain a 1 in
+	   bit 7 of that byte.  Otherwise, the highest set bit in the
+	   syndrome will highlight the first different bit.  It is therefore
+	   sufficient to extract the eight bits starting with the syndrome
+	   bit.  */
+	clz	tmp1, \synd
+	lsl	r1, \d2, tmp1
+	.if \restore_r6
+	ldrd	r6, r7, [sp, #8]
+	.endif
+	.cfi_restore 6
+	.cfi_restore 7
+	lsl	\d1, \d1, tmp1
+	.cfi_remember_state
+	lsr	result, \d1, #24
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	sub	result, result, r1, lsr #24
+	bx	lr
+#else
+	/* To use the big-endian trick we'd have to reverse all three words.
+	   that's slower than this approach.  */
+	rev	\synd, \synd
+	clz	tmp1, \synd
+	bic	tmp1, tmp1, #7
+	lsr	r1, \d2, tmp1
+	.cfi_remember_state
+	.if \restore_r6
+	ldrd	r6, r7, [sp, #8]
+	.endif
+	.cfi_restore 6
+	.cfi_restore 7
+	lsr	\d1, \d1, tmp1
+	and	result, \d1, #255
+	and	r1, r1, #255
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	sub	result, result, r1
+
+	bx	lr
+#endif
+	.endm
+
+	.text
+	.p2align	5
+.Lstrcmp_start_addr:
+#ifndef STRCMP_NO_PRECHECK
+.Lfastpath_exit:
+	sub	r0, r2, r3
+	bx	lr
+	nop
+#endif
+def_fn	strcmp
+#ifndef STRCMP_NO_PRECHECK
+	ldrb	r2, [src1]
+	ldrb	r3, [src2]
+	cmp	r2, #1
+	it	cs
+	cmpcs	r2, r3
+	bne	.Lfastpath_exit
+#endif
+	.cfi_startproc
+	strd	r4, r5, [sp, #-16]!
+	.cfi_def_cfa_offset 16
+	.cfi_offset 4, -16
+	.cfi_offset 5, -12
+	orr	tmp1, src1, src2
+	strd	r6, r7, [sp, #8]
+	.cfi_offset 6, -8
+	.cfi_offset 7, -4
+	mvn	const_m1, #0
+	lsl	r2, tmp1, #29
+	cbz	r2, .Lloop_aligned8
+
+.Lnot_aligned:
+	eor	tmp1, src1, src2
+	tst	tmp1, #7
+	bne	.Lmisaligned8
+
+	/* Deal with mutual misalignment by aligning downwards and then
+	   masking off the unwanted loaded data to prevent a difference.  */
+	and	tmp1, src1, #7
+	bic	src1, src1, #7
+	and	tmp2, tmp1, #3
+	bic	src2, src2, #7
+	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
+	ldrd	data1a, data1b, [src1], #16
+	tst	tmp1, #4
+	ldrd	data2a, data2b, [src2], #16
+	/* In thumb code we can't use MVN with a register shift, but
+	   we do have ORN.  */
+	S2HI	tmp1, const_m1, tmp2
+	orn	data1a, data1a, tmp1
+	orn	data2a, data2a, tmp1
+	beq	.Lstart_realigned8
+	orn	data1b, data1b, tmp1
+	mov	data1a, const_m1
+	orn	data2b, data2b, tmp1
+	mov	data2a, const_m1
+	b	.Lstart_realigned8
+
+	/* Unwind the inner loop by a factor of 2, giving 16 bytes per
+	   pass.  */
+	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
+	.p2align 2	/* Always word aligned.  */
+.Lloop_aligned8:
+	ldrd	data1a, data1b, [src1], #16
+	ldrd	data2a, data2b, [src2], #16
+.Lstart_realigned8:
+	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
+	eor	syndrome_a, data1a, data2a
+	sel	syndrome_a, syndrome_a, const_m1
+	cbnz	syndrome_a, .Ldiff_in_a
+	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
+	eor	syndrome_b, data1b, data2b
+	sel	syndrome_b, syndrome_b, const_m1
+	cbnz	syndrome_b, .Ldiff_in_b
+
+	ldrd	data1a, data1b, [src1, #-8]
+	ldrd	data2a, data2b, [src2, #-8]
+	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits,  */
+	eor	syndrome_a, data1a, data2a
+	sel	syndrome_a, syndrome_a, const_m1
+	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
+	eor	syndrome_b, data1b, data2b
+	sel	syndrome_b, syndrome_b, const_m1
+	/* Can't use CBZ for backwards branch.  */
+	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
+	beq	.Lloop_aligned8
+
+.Ldiff_found:
+	cbnz	syndrome_a, .Ldiff_in_a
+
+.Ldiff_in_b:
+	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1
+
+.Ldiff_in_a:
+	.cfi_restore_state
+	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1
+
+	.cfi_restore_state
+.Lmisaligned8:
+	tst	tmp1, #3
+	bne	.Lmisaligned4
+	ands	tmp1, src1, #3
+	bne	.Lmutual_align4
+
+	/* Unrolled by a factor of 2, to reduce the number of post-increment
+	   operations.  */
+.Lloop_aligned4:
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+.Lstart_realigned4:
+	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
+	eor	syndrome, data1, data2
+	sel	syndrome, syndrome, const_m1
+	cbnz	syndrome, .Laligned4_done
+	ldr	data1, [src1, #-4]
+	ldr	data2, [src2, #-4]
+	uadd8	syndrome, data1, const_m1
+	eor	syndrome, data1, data2
+	sel	syndrome, syndrome, const_m1
+	cmp	syndrome, #0
+	beq	.Lloop_aligned4
+
+.Laligned4_done:
+	strcmp_epilogue_aligned syndrome, data1, data2, 0
+
+.Lmutual_align4:
+	.cfi_restore_state
+	/* Deal with mutual misalignment by aligning downwards and then
+	   masking off the unwanted loaded data to prevent a difference.  */
+	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
+	bic	src1, src1, #3
+	ldr	data1, [src1], #8
+	bic	src2, src2, #3
+	ldr	data2, [src2], #8
+
+	/* In thumb code we can't use MVN with a register shift, but
+	   we do have ORN.  */
+	S2HI	tmp1, const_m1, tmp1
+	orn	data1, data1, tmp1
+	orn	data2, data2, tmp1
+	b	.Lstart_realigned4
+
+.Lmisaligned4:
+	ands	tmp1, src1, #3
+	beq	.Lsrc1_aligned
+	sub	src2, src2, tmp1
+	bic	src1, src1, #3
+	lsls	tmp1, tmp1, #31
+	ldr	data1, [src1], #4
+	beq	.Laligned_m2
+	bcs	.Laligned_m1
+
+#ifdef STRCMP_NO_PRECHECK
+	ldrb	data2, [src2, #1]
+	uxtb	tmp1, data1, ror #BYTE1_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	cbz	data2, .Lmisaligned_exit
+
+.Laligned_m2:
+	ldrb	data2, [src2, #2]
+	uxtb	tmp1, data1, ror #BYTE2_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	cbz	data2, .Lmisaligned_exit
+
+.Laligned_m1:
+	ldrb	data2, [src2, #3]
+	uxtb	tmp1, data1, ror #BYTE3_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	add	src2, src2, #4
+	cbnz	data2, .Lsrc1_aligned
+#else  /* STRCMP_NO_PRECHECK */
+	/* If we've done the pre-check, then we don't need to check the
+	   first byte again here.  */
+	ldrb	data2, [src2, #2]
+	uxtb	tmp1, data1, ror #BYTE2_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	cbz	data2, .Lmisaligned_exit
+
+.Laligned_m2:
+	ldrb	data2, [src2, #3]
+	uxtb	tmp1, data1, ror #BYTE3_OFFSET
+	subs	tmp1, tmp1, data2
+	bne	.Lmisaligned_exit
+	cbnz	data2, .Laligned_m1
+#endif
+
+.Lmisaligned_exit:
+	.cfi_remember_state
+	mov	result, tmp1
+	ldr	r4, [sp], #16
+	.cfi_restore 4
+	bx	lr
+
+#ifndef STRCMP_NO_PRECHECK
+.Laligned_m1:
+	add	src2, src2, #4
+#endif
+.Lsrc1_aligned:
+	.cfi_restore_state
+	/* src1 is word aligned, but src2 has no common alignment
+	   with it.  */
+	ldr	data1, [src1], #4
+	lsls	tmp1, src2, #31		/* C=src2[1], Z=!src2[0].  */
+
+	bic	src2, src2, #3
+	ldr	data2, [src2], #4
+	bhi	.Loverlap1		/* C=1, Z=0 => src2[1:0] = 0b11.  */
+	bcs	.Loverlap2		/* C=1, Z=1 => src2[1:0] = 0b10.  */
+
+	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
+.Loverlap3:
+	bic	tmp1, data1, #MSB
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #8
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cbnz	syndrome, 5f
+	ldr	data2, [src2], #4
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #24
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap3
+4:
+	S2LO	data2, data2, #8
+	b	.Lstrcmp_tail
+
+5:
+	bics	syndrome, syndrome, #MSB
+	bne	.Lstrcmp_done_equal
+
+	/* We can only get here if the MSB of data1 contains 0, so
+	   fast-path the exit.  */
+	ldrb	result, [src2]
+	.cfi_remember_state
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	/* R6/7 Not used in this sequence.  */
+	.cfi_restore 6
+	.cfi_restore 7
+	neg	result, result
+	bx	lr
+
+6:
+	.cfi_restore_state
+	S2LO	data1, data1, #24
+	and	data2, data2, #LSB
+	b	.Lstrcmp_tail
+
+	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
+.Loverlap2:
+	and	tmp1, data1, const_m1, S2LO #16
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #16
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cbnz	syndrome, 5f
+	ldr	data2, [src2], #4
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #16
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap2
+4:
+	S2LO	data2, data2, #16
+	b	.Lstrcmp_tail
+5:
+	ands	syndrome, syndrome, const_m1, S2LO #16
+	bne	.Lstrcmp_done_equal
+
+	ldrh	data2, [src2]
+	S2LO	data1, data1, #16
+#ifdef __ARM_BIG_ENDIAN
+	lsl	data2, data2, #16
+#endif
+	b	.Lstrcmp_tail
+
+6:
+	S2LO	data1, data1, #16
+	and	data2, data2, const_m1, S2LO #16
+	b	.Lstrcmp_tail
+
+	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
+.Loverlap1:
+	and	tmp1, data1, #LSB
+	uadd8	syndrome, data1, const_m1
+	eors	syndrome, tmp1, data2, S2LO #24
+	sel	syndrome, syndrome, const_m1
+	bne	4f
+	cbnz	syndrome, 5f
+	ldr	data2, [src2], #4
+	eor	tmp1, tmp1, data1
+	cmp	tmp1, data2, S2HI #8
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap1
+4:
+	S2LO	data2, data2, #24
+	b	.Lstrcmp_tail
+5:
+	tst	syndrome, #LSB
+	bne	.Lstrcmp_done_equal
+	ldr	data2, [src2]
+6:
+	S2LO	data1, data1, #8
+	bic	data2, data2, #MSB
+	b	.Lstrcmp_tail
+
+.Lstrcmp_done_equal:
+	mov	result, #0
+	.cfi_remember_state
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	/* R6/7 not used in this sequence.  */
+	.cfi_restore 6
+	.cfi_restore 7
+	bx	lr
+
+.Lstrcmp_tail:
+	.cfi_restore_state
+#ifndef __ARM_BIG_ENDIAN
+	rev	data1, data1
+	rev	data2, data2
+	/* Now everything looks big-endian...  */
+#endif
+	uadd8	tmp1, data1, const_m1
+	eor	tmp1, data1, data2
+	sel	syndrome, tmp1, const_m1
+	clz	tmp1, syndrome
+	lsl	data1, data1, tmp1
+	lsl	data2, data2, tmp1
+	lsr	result, data1, #24
+	ldrd	r4, r5, [sp], #16
+	.cfi_restore 4
+	.cfi_restore 5
+	/* R6/7 not used in this sequence.  */
+	.cfi_restore 6
+	.cfi_restore 7
+	sub	result, result, data2, lsr #24
+	bx	lr
+	.cfi_endproc
+	.size strcmp, . - .Lstrcmp_start_addr
Index: strcmp-armv7m.S
===================================================================
RCS file: strcmp-armv7m.S
diff -N strcmp-armv7m.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ strcmp-armv7m.S	22 Apr 2014 11:52:56 -0000
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2012-2014 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Very similar to the generic code, but uses Thumb2 as implemented
+   in ARMv7-M.  */
+
+/* Parameters and result.  */
+#define src1		r0
+#define src2		r1
+#define result		r0	/* Overlaps src1.  */
+
+/* Internal variables.  */
+#define data1		r2
+#define data2		r3
+#define tmp2		r5
+#define tmp1		r12
+#define syndrome	r12	/* Overlaps tmp1 */
+
+	.thumb
+	.syntax unified
+def_fn strcmp
+	.cfi_startproc
+	eor	tmp1, src1, src2
+	tst	tmp1, #3
+	/* Strings not at same byte offset from a word boundary.  */
+	bne	.Lstrcmp_unaligned
+	ands	tmp1, src1, #3
+	bic	src1, src1, #3
+	bic	src2, src2, #3
+	ldr	data1, [src1], #4
+	it	eq
+	ldreq	data2, [src2], #4
+	beq	4f
+	/* Although s1 and s2 have identical initial alignment, they are
+	   not currently word aligned.	Rather than comparing bytes,
+	   make sure that any bytes fetched from before the addressed
+	   bytes are forced to 0xff.  Then they will always compare
+	   equal.  */
+	eor	tmp1, tmp1, #3
+	mvn	data2, #MSB
+	lsl	tmp1, tmp1, #3
+	S2LO	tmp1, data2, tmp1
+	ldr	data2, [src2], #4
+	orr	data1, data1, tmp1
+	orr	data2, data2, tmp1
+	.p2align	2
+	/* Critical loop.  */
+4:
+	sub	syndrome, data1, #0x01010101
+	cmp	data1, data2
+	/* check for any zero bytes in first word */
+	itttt	eq
+	biceq	syndrome, syndrome, data1
+	tsteq	syndrome, #0x80808080
+	ldreq	data1, [src1], #4
+	ldreq	data2, [src2], #4
+	beq	4b
+2:
+	/* There's a zero or a different byte in the word */
+	S2HI	result, data1, #24
+	S2LO	data1, data1, #8
+	cmp	result, #1
+	it	cs
+	cmpcs	result, data2, S2HI #24
+	it	eq
+	S2LOEQ	data2, data2, #8
+	beq	2b
+	/* On a big-endian machine, RESULT contains the desired byte in bits
+	   0-7; on a little-endian machine they are in bits 24-31.  In
+	   both cases the other bits in RESULT are all zero.  For DATA2 the
+	   interesting byte is at the other end of the word, but the
+	   other bits are not necessarily zero.	 We need a signed result
+	   representing the difference in the unsigned bytes, so for the
+	   little-endian case we can't just shift the interesting bits
+	   up.	*/
+#ifdef __ARM_BIG_ENDIAN
+	sub	result, result, data2, lsr #24
+#else
+	and	data2, data2, #255
+	lsrs	result, result, #24
+	subs	result, result, data2
+#endif
+	RETURN
+
+
+#if 0
+	/* The assembly code below is based on the following algorithm.  */
+#ifdef __ARM_BIG_ENDIAN
+#define RSHIFT <<
+#define LSHIFT >>
+#else
+#define RSHIFT >>
+#define LSHIFT <<
+#endif
+
+#define body(shift)							\
+  mask = 0xffffffffU RSHIFT shift;					\
+  data1 = *src1++;							\
+  data2 = *src2++;							\
+  do									\
+    {									\
+      tmp2 = data1 & mask;						\
+      if (__builtin_expect(tmp2 != data2 RSHIFT shift, 0))		\
+	{								\
+	  data2 RSHIFT= shift;						\
+	  break;							\
+	}								\
+      if (__builtin_expect(((data1 - b1) & ~data1) & (b1 << 7), 0))	\
+	{								\
+	  /* See comment in assembler below re syndrome on big-endian */\
+	  if ((((data1 - b1) & ~data1) & (b1 << 7)) & mask)		\
+	    data2 RSHIFT= shift;					\
+	  else								\
+	    {								\
+	      data2 = *src2;						\
+	      tmp2 = data1 RSHIFT (32 - shift);				\
+	      data2 = (data2 LSHIFT (32 - shift)) RSHIFT (32 - shift);	\
+	    }								\
+	  break;							\
+	}								\
+      data2 = *src2++;							\
+      tmp2 ^= data1;							\
+      if (__builtin_expect(tmp2 != data2 LSHIFT (32 - shift), 0))	\
+	{								\
+	  tmp2 = data1 >> (32 - shift);					\
+	  data2 = (data2 << (32 - shift)) RSHIFT (32 - shift);		\
+	  break;							\
+	}								\
+      data1 = *src1++;							\
+    } while (1)
+
+  const unsigned* src1;
+  const unsigned* src2;
+  unsigned data1, data2;
+  unsigned mask;
+  unsigned shift;
+  unsigned b1 = 0x01010101;
+  char c1, c2;
+  unsigned tmp2;
+
+  while (((unsigned) s1) & 3)
+    {
+      c1 = *s1++;
+      c2 = *s2++;
+      if (c1 == 0 || c1 != c2)
+	return c1 - (int)c2;
+    }
+  src1 = (unsigned*) (((unsigned)s1) & ~3);
+  src2 = (unsigned*) (((unsigned)s2) & ~3);
+  tmp2 = ((unsigned) s2) & 3;
+  if (tmp2 == 1)
+    {
+      body(8);
+    }
+  else if (tmp2 == 2)
+    {
+      body(16);
+    }
+  else
+    {
+      body (24);
+    }
+
+  do
+    {
+#ifdef __ARM_BIG_ENDIAN
+      c1 = (char) (tmp2 >> 24);
+      c2 = (char) (data2 >> 24);
+#else /* not  __ARM_BIG_ENDIAN */
+      c1 = (char) tmp2;
+      c2 = (char) data2;
+#endif /* not  __ARM_BIG_ENDIAN */
+      tmp2 RSHIFT= 8;
+      data2 RSHIFT= 8;
+    } while (c1 != 0 && c1 == c2);
+  return c1 - c2;
+#endif /* 0 */
+
+
+	/* First of all, compare bytes until src1 is word-aligned.  */
+.Lstrcmp_unaligned:
+	tst	src1, #3
+	beq	2f
+	ldrb	data1, [src1], #1
+	ldrb	data2, [src2], #1
+	cmp	data1, #1
+	it	cs
+	cmpcs	data1, data2
+	beq	.Lstrcmp_unaligned
+	sub	result, data1, data2
+	bx	lr
+
+2:
+	stmfd	sp!, {r5}
+	.cfi_def_cfa_offset 4
+	.cfi_offset 5, -4
+
+	ldr	data1, [src1], #4
+	and	tmp2, src2, #3
+	bic	src2, src2, #3
+	ldr	data2, [src2], #4
+	cmp	tmp2, #2
+	beq	.Loverlap2
+	bhi	.Loverlap1
+
+	/* Critical inner Loop: Block with 3 bytes initial overlap */
+	.p2align	2
+.Loverlap3:
+	bic	tmp2, data1, #MSB
+	cmp	tmp2, data2, S2LO #8
+	sub	syndrome, data1, #0x01010101
+	bic	syndrome, syndrome, data1
+	bne	4f
+	ands	syndrome, syndrome, #0x80808080
+	it	eq
+	ldreq	data2, [src2], #4
+	bne	5f
+	eor	tmp2, tmp2, data1
+	cmp	tmp2, data2, S2HI #24
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap3
+4:
+	S2LO	data2, data2, #8
+	b	.Lstrcmp_tail
+
+5:
+#ifdef __ARM_BIG_ENDIAN
+	/* The syndrome value may contain false ones if the string ends
+	with the bytes 0x01 0x00.  */
+	tst	data1, #0xff000000
+	itt	ne
+	tstne	data1, #0x00ff0000
+	tstne	data1, #0x0000ff00
+	beq	.Lstrcmp_done_equal
+#else
+	bics	syndrome, syndrome, #0xff000000
+	bne	.Lstrcmp_done_equal
+#endif
+	ldrb	data2, [src2]
+	S2LO	tmp2, data1, #24
+#ifdef __ARM_BIG_ENDIAN
+	lsl	data2, data2, #24
+#endif
+	b	.Lstrcmp_tail
+
+6:
+	S2LO	tmp2, data1, #24
+	and	data2, data2, #LSB
+	b	.Lstrcmp_tail
+
+	/* Critical inner Loop: Block with 2 bytes initial overlap.  */
+	.p2align	2
+.Loverlap2:
+	S2HI	tmp2, data1, #16
+	sub	syndrome, data1, #0x01010101
+	S2LO	tmp2, tmp2, #16
+	bic	syndrome, syndrome, data1
+	cmp	tmp2, data2, S2LO #16
+	bne	4f
+	ands	syndrome, syndrome, #0x80808080
+	it	eq
+	ldreq	data2, [src2], #4
+	bne	5f
+	eor	tmp2, tmp2, data1
+	cmp	tmp2, data2, S2HI #16
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap2
+
+5:
+#ifdef __ARM_BIG_ENDIAN
+	/* The syndrome value may contain false ones if the string ends
+	with the bytes 0x01 0x00 */
+	tst	data1, #0xff000000
+	it	ne
+	tstne	data1, #0x00ff0000
+	beq	.Lstrcmp_done_equal
+#else
+	lsls	syndrome, syndrome, #16
+	bne	.Lstrcmp_done_equal
+#endif
+	ldrh	data2, [src2]
+	S2LO	tmp2, data1, #16
+#ifdef __ARM_BIG_ENDIAN
+	lsl	data2, data2, #16
+#endif
+	b	.Lstrcmp_tail
+
+6:
+	S2HI	data2, data2, #16
+	S2LO	tmp2, data1, #16
+4:
+	S2LO	data2, data2, #16
+	b	.Lstrcmp_tail
+
+	/* Critical inner Loop: Block with 1 byte initial overlap.  */
+	.p2align	2
+.Loverlap1:
+	and	tmp2, data1, #LSB
+	cmp	tmp2, data2, S2LO #24
+	sub	syndrome, data1, #0x01010101
+	bic	syndrome, syndrome, data1
+	bne	4f
+	ands	syndrome, syndrome, #0x80808080
+	it	eq
+	ldreq	data2, [src2], #4
+	bne	5f
+	eor	tmp2, tmp2, data1
+	cmp	tmp2, data2, S2HI #8
+	bne	6f
+	ldr	data1, [src1], #4
+	b	.Loverlap1
+4:
+	S2LO	data2, data2, #24
+	b	.Lstrcmp_tail
+5:
+	/* The syndrome value may contain false ones if the string ends
+	   with the bytes 0x01 0x00.  */
+	tst	data1, #LSB
+	beq	.Lstrcmp_done_equal
+	ldr	data2, [src2], #4
+6:
+	S2LO	tmp2, data1, #8
+	bic	data2, data2, #MSB
+	b	.Lstrcmp_tail
+.Lstrcmp_done_equal:
+	mov	result, #0
+	.cfi_remember_state
+	ldmfd	sp!, {r5}
+	.cfi_restore 5
+	.cfi_def_cfa_offset 0
+	RETURN
+
+.Lstrcmp_tail:
+	.cfi_restore_state
+	and	r2, tmp2, #LSB
+	and	result, data2, #LSB
+	cmp	result, #1
+	it	cs
+	cmpcs	result, r2
+	itt	eq
+	S2LOEQ	tmp2, tmp2, #8
+	S2LOEQ	data2, data2, #8
+	beq	.Lstrcmp_tail
+	sub	result, r2, result
+	ldmfd	sp!, {r5}
+	.cfi_restore 5
+	.cfi_def_cfa_offset 0
+	RETURN
+	.cfi_endproc
+	.size strcmp, . - strcmp
Index: strcmp.S
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/arm/strcmp.S,v
retrieving revision 1.4
diff -u -r1.4 strcmp.S
--- strcmp.S	5 Jun 2013 09:41:21 -0000	1.4
+++ strcmp.S	22 Apr 2014 11:52:56 -0000
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012 ARM Ltd
+ * Copyright (c) 2012-2014 ARM Ltd
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -26,769 +26,64 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+/* Wrapper for the various implementations of strcmp.  */
+
 #include "arm_asm.h"
 
-#ifdef __ARMEB__
-#define S2LOMEM lsl
-#define S2LOMEMEQ lsleq
-#define S2HIMEM lsr
+#ifdef __ARM_BIG_ENDIAN
+#define S2LO lsl
+#define S2LOEQ lsleq
+#define S2HI lsr
 #define MSB 0x000000ff
 #define LSB 0xff000000
 #define BYTE0_OFFSET 24
 #define BYTE1_OFFSET 16
 #define BYTE2_OFFSET 8
 #define BYTE3_OFFSET 0
-#else /* not  __ARMEB__ */
-#define S2LOMEM lsr
-#define S2LOMEMEQ lsreq
-#define S2HIMEM lsl
+#else /* not  __ARM_BIG_ENDIAN */
+#define S2LO lsr
+#define S2LOEQ lsreq
+#define S2HI lsl
 #define BYTE0_OFFSET 0
 #define BYTE1_OFFSET 8
 #define BYTE2_OFFSET 16
 #define BYTE3_OFFSET 24
 #define MSB 0xff000000
 #define LSB 0x000000ff
-#endif /* not  __ARMEB__ */
+#endif /* not  __ARM_BIG_ENDIAN */
 
-.syntax         unified
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+\f:
+	.endm
 
-#if defined (__thumb__)
-        .thumb
-        .thumb_func
-#if !defined (__thumb2__)
-	/* If we have thumb1 only, we need to explictly mark the
-	   compatibility.  */
-	.arch armv4t
-	.eabi_attribute Tag_also_compatible_with, "\006\013"  /* v6-M.  */
-	.eabi_attribute Tag_ARM_ISA_use, 0
-#endif
-#endif
-        .global strcmp
-        .type   strcmp, %function
-strcmp:
-
-#if (defined (__thumb__) && !defined (__thumb2__))
-1:
-        ldrb    r2, [r0]
-        ldrb    r3, [r1]
-        adds    r0, r0, #1
-        adds    r1, r1, #1
-        cmp     r2, #0
-        beq     2f
-        cmp     r2, r3
-        beq     1b
-2:
-        subs    r0, r2, r3
-        bx      lr
-#elif (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED))
-1:
-        ldrb    r2, [r0], #1
-        ldrb    r3, [r1], #1
-        cmp     r2, #1
-        it      cs
-        cmpcs   r2, r3
-        beq     1b
-        subs    r0, r2, r3
-        RETURN
-
-
-#elif (defined (_ISA_THUMB_2) || defined (_ISA_ARM_6))
-      /* Use LDRD whenever possible.  */
-
-/* The main thing to look out for when comparing large blocks is that
-   the loads do not cross a page boundary when loading past the index
-   of the byte with the first difference or the first string-terminator.
-
-   For example, if the strings are identical and the string-terminator
-   is at index k, byte by byte comparison will not load beyond address
-   s1+k and s2+k; word by word comparison may load up to 3 bytes beyond
-   k; double word - up to 7 bytes.  If the load of these bytes crosses
-   a page boundary, it might cause a memory fault (if the page is not mapped)
-   that would not have happened in byte by byte comparison.
-
-   If an address is (double) word aligned, then a load of a (double) word
-   from that address will not cross a page boundary.
-   Therefore, the algorithm below considers word and double-word alignment
-   of strings separately.  */
-
-/* High-level description of the algorithm.
-
-   * The fast path: if both strings are double-word aligned,
-     use LDRD to load two words from each string in every loop iteration.
-   * If the strings have the same offset from a word boundary,
-     use LDRB to load and compare byte by byte until
-     the first string is aligned to a word boundary (at most 3 bytes).
-     This is optimized for quick return on short unaligned strings.
-   * If the strings have the same offset from a double-word boundary,
-     use LDRD to load two words from each string in every loop iteration, as in the fast path.
-   * If the strings do not have the same offset from a double-word boundary,
-     load a word from the second string before the loop to initialize the queue.
-     Use LDRD to load two words from every string in every loop iteration.
-     Inside the loop, load the second word from the second string only after comparing
-     the first word, using the queued value, to guarantee safety across page boundaries.
-   * If the strings do not have the same offset from a word boundary,
-     use LDR and a shift queue.  The order of loads and comparisons
-     matters, as in the previous case.
-
-   * Use UADD8 and SEL to compare words, and use REV and CLZ to compute the return value.
-   * The only difference between ARM and Thumb modes is the use of the CBZ instruction.
-   * The only difference between big and little endian is the use of REV in little endian
-     to compute the return value, instead of MOV.
-   * No preload. [TODO.]
-*/
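For reference, the same word-at-a-time idea can be written portably in
C.  This is an illustrative sketch, not the shipped assembly:
strcmp_sketch and byte_cmp are invented names, and only the
same-word-alignment fast path is modelled.

#include <stdint.h>
#include <string.h>

static int byte_cmp(const unsigned char *p1, const unsigned char *p2)
{
    while (*p1 != 0 && *p1 == *p2)
        p1++, p2++;
    return *p1 - *p2;
}

int strcmp_sketch(const char *s1, const char *s2)
{
    const unsigned char *p1 = (const unsigned char *) s1;
    const unsigned char *p2 = (const unsigned char *) s2;

    /* Compare byte by byte until p1 is word-aligned; short unaligned
       strings return early from here.  */
    while (((uintptr_t) p1 & 3) != 0) {
        if (*p1 == 0 || *p1 != *p2)
            return *p1 - *p2;
        p1++, p2++;
    }

    /* Take the fast path only if p2 is now word-aligned too.  */
    if (((uintptr_t) p2 & 3) != 0)
        return byte_cmp(p1, p2);

    for (;;) {
        uint32_t w1, w2;
        memcpy(&w1, p1, 4);  /* aligned loads: may read at most three */
        memcpy(&w2, p2, 4);  /* bytes past the NUL, which is safe     */

        /* Nonzero iff some byte of w1 is zero (the magic-constant
           test described below).  */
        if (w1 != w2 || ((w1 - 0x01010101u) & ~w1 & 0x80808080u))
            return byte_cmp(p1, p2);  /* settle within these 4 bytes */

        p1 += 4;
        p2 += 4;
    }
}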
-
-        .macro m_cbz reg label
-#ifdef __thumb2__
-        cbz     \reg, \label
-#else   /* not defined __thumb2__ */
-        cmp     \reg, #0
-        beq     \label
-#endif /* not defined __thumb2__ */
-        .endm /* m_cbz */
-
-        .macro m_cbnz reg label
-#ifdef __thumb2__
-        cbnz    \reg, \label
-#else   /* not defined __thumb2__ */
-        cmp     \reg, #0
-        bne     \label
-#endif /* not defined __thumb2__ */
-        .endm /* m_cbnz */
-
-        .macro  init
-        /* Macro to save temporary registers and prepare magic values.  */
-        subs    sp, sp, #16
-        strd    r4, r5, [sp, #8]
-        strd    r6, r7, [sp]
-        mvn     r6, #0  /* all F */
-        mov     r7, #0  /* all 0 */
-        .endm   /* init */
-
-        .macro  magic_compare_and_branch w1 w2 label
-        /* Macro to compare registers w1 and w2 and conditionally branch to label.  */
-        cmp     \w1, \w2        /* Are w1 and w2 the same?  */
-        magic_find_zero_bytes \w1
-        it      eq
-        cmpeq   ip, #0          /* Is there a zero byte in w1?  */
-        bne     \label
-        .endm /* magic_compare_and_branch */
-
-        .macro  magic_find_zero_bytes w1
-        /* Macro to find all-zero bytes in w1, result is in ip.  */
-#if (defined (__ARM_FEATURE_DSP))
-        uadd8   ip, \w1, r6
-        sel     ip, r7, r6
-#else /* not defined (__ARM_FEATURE_DSP) */
-        /* __ARM_FEATURE_DSP is not defined for some Cortex-M processors.
-        Coincidentally, these processors only have Thumb-2 mode, where we can use
-        the (large) magic constant available directly as an immediate in instructions.
-        Note that we cannot use the magic constant in ARM mode, where we need
-        to create the constant in a register.  */
-        sub     ip, \w1, #0x01010101
-        bic     ip, ip, \w1
-        and     ip, ip, #0x80808080
-#endif /* not defined (__ARM_FEATURE_DSP) */
-        .endm /* magic_find_zero_bytes */
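The fallback sequence is the classic zero-in-word test; as a C model
(find_zero_bytes is an illustrative name, not a library function):

#include <stdint.h>

/* Nonzero iff some byte of W1 is zero.  The 0x80 marker in the least
   significant zero byte is always set; markers in higher bytes can be
   false positives once borrows propagate (compare the note further
   down about strings ending in the bytes 0x01 0x00), so the syndrome
   is only trusted up to the first zero byte.  */
static uint32_t find_zero_bytes(uint32_t w1)
{
    return (w1 - 0x01010101u) & ~w1 & 0x80808080u;
}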
-
-        .macro  setup_return w1 w2
-#ifdef __ARMEB__
-        mov     r1, \w1
-        mov     r2, \w2
-#else /* not  __ARMEB__ */
-        rev     r1, \w1
-        rev     r2, \w2
-#endif /* not  __ARMEB__ */
-        .endm /* setup_return */
-
-        /*
-        optpld r0, #0
-        optpld r1, #0
-        */
-
-        /* Are both strings double-word aligned?  */
-        orr     ip, r0, r1
-        tst     ip, #7
-        bne     .Ldo_align
-
-        /* Fast path.  */
-        init
-
-.Ldoubleword_aligned:
-
-        /* Get here when the strings to compare are double-word aligned.  */
-        /* Compare two words in every iteration.  */
-        .p2align        2
-2:
-        /*
-        optpld r0, #16
-        optpld r1, #16
-        */
-
-        /* Load the next double-word from each string.  */
-        ldrd    r2, r3, [r0], #8
-        ldrd    r4, r5, [r1], #8
-
-        magic_compare_and_branch w1=r2, w2=r4, label=.Lreturn_24
-        magic_compare_and_branch w1=r3, w2=r5, label=.Lreturn_35
-        b       2b
-
-.Ldo_align:
-        /* Is the first string word-aligned?  */
-        ands    ip, r0, #3
-        beq     .Lword_aligned_r0
-
-        /* Fast compare byte by byte until the first string is word-aligned.  */
-        /* The offset of r0 from a word boundary is in ip. Thus, the number of bytes
-        to read until the next word boundary is 4-ip.  */
-        bic     r0, r0, #3
-        ldr     r2, [r0], #4
-        lsls    ip, ip, #31
-        beq     .Lbyte2
-        bcs     .Lbyte3
-
-.Lbyte1:
-        ldrb    ip, [r1], #1
-        uxtb    r3, r2, ror #BYTE1_OFFSET
-        subs    ip, r3, ip
-        bne     .Lfast_return
-        m_cbz   reg=r3, label=.Lfast_return
-
-.Lbyte2:
-        ldrb    ip, [r1], #1
-        uxtb    r3, r2, ror #BYTE2_OFFSET
-        subs    ip, r3, ip
-        bne     .Lfast_return
-        m_cbz   reg=r3, label=.Lfast_return
-
-.Lbyte3:
-        ldrb    ip, [r1], #1
-        uxtb    r3, r2, ror #BYTE3_OFFSET
-        subs    ip, r3, ip
-        bne     .Lfast_return
-        m_cbnz  reg=r3, label=.Lword_aligned_r0
-
-.Lfast_return:
-        mov     r0, ip
-        bx      lr
-
-.Lword_aligned_r0:
-        init
-        /* The first string is word-aligned.  */
-        /* Is the second string word-aligned?  */
-        ands    ip, r1, #3
-        bne     .Lstrcmp_unaligned
-
-.Lword_aligned:
-        /* The strings are word-aligned. */
-        /* Is the first string double-word aligned?  */
-        tst     r0, #4
-        beq     .Ldoubleword_aligned_r0
-
-        /* If r0 is not double-word aligned yet, align it by loading
-        and comparing the next word from each string.  */
-        ldr     r2, [r0], #4
-        ldr     r4, [r1], #4
-        magic_compare_and_branch w1=r2 w2=r4 label=.Lreturn_24
-
-.Ldoubleword_aligned_r0:
-        /* Get here when r0 is double-word aligned.  */
-        /* Is r1 doubleword_aligned?  */
-        tst     r1, #4
-        beq     .Ldoubleword_aligned
-
-        /* Get here when the strings to compare are word-aligned,
-        r0 is double-word aligned, but r1 is not double-word aligned.  */
-
-        /* Initialize the queue.  */
-        ldr     r5, [r1], #4
-
-        /* Compare two words in every iteration.  */
-        .p2align        2
-3:
-        /*
-        optpld r0, #16
-        optpld r1, #16
-        */
-
-        /* Load the next double-word from each string and compare.  */
-        ldrd    r2, r3, [r0], #8
-        magic_compare_and_branch w1=r2 w2=r5 label=.Lreturn_25
-        ldrd    r4, r5, [r1], #8
-        magic_compare_and_branch w1=r3 w2=r4 label=.Lreturn_34
-        b       3b
-
-        .macro miscmp_word offsetlo offsethi
-        /* Macro to compare misaligned strings.  */
-        /* r0, r1 are word-aligned, and at least one of the strings
-        is not double-word aligned.  */
-        /* Compare one word in every loop iteration.  */
-        /* OFFSETLO is the original bit-offset of r1 from a word-boundary,
-        OFFSETHI is 32 - OFFSETLO (i.e., offset from the next word).  */
-
-        /* Initialize the shift queue.  */
-        ldr     r5, [r1], #4
-
-        /* Compare one word from each string in every loop iteration.  */
-        .p2align        2
-7:
-        ldr     r3, [r0], #4
-        S2LOMEM r5, r5, #\offsetlo
-        magic_find_zero_bytes w1=r3
-        cmp     r7, ip, S2HIMEM #\offsetlo
-        and     r2, r3, r6, S2LOMEM #\offsetlo
-        it      eq
-        cmpeq   r2, r5
-        bne     .Lreturn_25
-        ldr     r5, [r1], #4
-        cmp     ip, #0
-        eor	r3, r2, r3
-        S2HIMEM r2, r5, #\offsethi
-        it      eq
-        cmpeq   r3, r2
-        bne     .Lreturn_32
-        b       7b
-        .endm /* miscmp_word */
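In C terms, the shift queue reassembles the unaligned word of the
second string from two aligned loads.  A little-endian model follows
(queued_word is an illustrative name; on little-endian, S2LOMEM is LSR
and S2HIMEM is LSL):

#include <stdint.h>

/* PREV and NEXT are consecutive aligned words of s2, and OFFSETLO is
   8 times the byte offset of s2 from a word boundary (8, 16 or 24).
   The result is the four bytes of s2 that line up with the current
   aligned word of s1.  NEXT is only consumed after PREV has been
   compared, which is what keeps the loads page-safe.  */
static uint32_t queued_word(uint32_t prev, uint32_t next, unsigned offsetlo)
{
    return (prev >> offsetlo) | (next << (32 - offsetlo));
}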
-
-.Lstrcmp_unaligned:
-        /* r0 is word-aligned, r1 is at offset ip from a word.  */
-        /* Align r1 to the (previous) word-boundary.  */
-        bic     r1, r1, #3
-
-        /* Unaligned comparison word by word using LDRs. */
-        cmp     ip, #2
-        beq     .Lmiscmp_word_16                    /* If ip == 2.  */
-        bge     .Lmiscmp_word_24                    /* If ip == 3.  */
-        miscmp_word offsetlo=8 offsethi=24        /* If ip == 1.  */
-.Lmiscmp_word_16:  miscmp_word offsetlo=16 offsethi=16
-.Lmiscmp_word_24:  miscmp_word offsetlo=24 offsethi=8
-
-
-.Lreturn_32:
-        setup_return w1=r3, w2=r2
-        b       .Ldo_return
-.Lreturn_34:
-        setup_return w1=r3, w2=r4
-        b       .Ldo_return
-.Lreturn_25:
-        setup_return w1=r2, w2=r5
-        b       .Ldo_return
-.Lreturn_35:
-        setup_return w1=r3, w2=r5
-        b       .Ldo_return
-.Lreturn_24:
-        setup_return w1=r2, w2=r4
-
-.Ldo_return:
-
-#ifdef __ARMEB__
-        mov     r0, ip
-#else /* not  __ARMEB__ */
-        rev     r0, ip
-#endif /* not  __ARMEB__ */
-
-        /* Restore temporaries early, before computing the return value.  */
-        ldrd    r6, r7, [sp]
-        ldrd    r4, r5, [sp, #8]
-        adds    sp, sp, #16
-
-        /* There is a zero or a different byte between r1 and r2.  */
-        /* r0 contains a mask of all-zero bytes in r1.  */
-        /* Using r0 and not ip here because CBZ requires a low register.  */
-        m_cbz   reg=r0, label=.Lcompute_return_value
-        clz     r0, r0
-        /* r0 contains the number of bits on the left of the first all-zero byte in r1.  */
-        rsb     r0, r0, #24
-        /* Here, r0 contains the number of bits on the right of the first all-zero byte in r1.  */
-        lsr     r1, r1, r0
-        lsr     r2, r2, r0
-
-.Lcompute_return_value:
-        movs    r0, #1
-        cmp     r1, r2
-        /* The return value is computed as follows.
-        If r1>r2 then (C==1 and Z==0) and LS doesn't hold and r0 is #1 at return.
-        If r1<r2 then (C==0 and Z==0) and we execute SBC with carry_in=0,
-        which means r0:=r0-r0-1 and r0 is #-1 at return.
-        If r1=r2 then (C==1 and Z==1) and we execute SBC with carry_in=1,
-        which means r0:=r0-r0 and r0 is #0 at return.
-        (C==0 and Z==1) cannot happen because the carry bit is "not borrow".  */
-        it      ls
-        sbcls   r0, r0, r0
-        bx      lr
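The flag trick is easier to see in C; the following models the result
only (return_value is an illustrative name), given that SBC computes
r0 - r0 - (carry set ? 0 : 1):

#include <stdint.h>

/* Models: movs r0, #1 ; cmp r1, r2 ; it ls ; sbcls r0, r0, r0.  */
static int return_value(uint32_t r1, uint32_t r2)
{
    if (r1 > r2)   /* HI: the SBC is skipped, the preset 1 survives. */
        return 1;
    if (r1 < r2)   /* C clear: 0 - 1 = -1.  */
        return -1;
    return 0;      /* C and Z set: 0 - 0 = 0.  */
}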
-
-
-#else   /* !(defined (_ISA_THUMB_2) || defined (_ISA_ARM_6) ||
-             defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) ||
-             (defined (__thumb__) && !defined (__thumb2__))) */
-
-        /* Use LDR whenever possible. */
-
-#ifdef __thumb2__
-#define magic1(REG) 0x01010101
-#define magic2(REG) 0x80808080
-#else
-#define magic1(REG) REG
-#define magic2(REG) REG, lsl #7
-#endif
+#if defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) \
+	|| (__ARM_ARCH == 6 && __ARM_ARCH_PROFILE == 'M')
 
-        optpld  r0
-        optpld  r1
-        eor     r2, r0, r1
-        tst     r2, #3
-        /* Strings not at same byte offset from a word boundary.  */
-        bne     .Lstrcmp_unaligned
-        ands    r2, r0, #3
-        bic     r0, r0, #3
-        bic     r1, r1, #3
-        ldr     ip, [r0], #4
-        it      eq
-        ldreq   r3, [r1], #4
-        beq     1f
-        /* Although s1 and s2 have identical initial alignment, they are
-        not currently word aligned.  Rather than comparing bytes,
-	make sure that any bytes fetched from before the addressed
-	bytes are forced to 0xff.  Then they will always compare
-	equal.  */
-        eor     r2, r2, #3
-        lsl     r2, r2, #3
-        mvn     r3, MSB
-        S2LOMEM        r2, r3, r2
-        ldr     r3, [r1], #4
-        orr     ip, ip, r2
-        orr     r3, r3, r2
-1:
-#ifndef __thumb2__
-              /* Load the 'magic' constant 0x01010101.  */
-        str     r4, [sp, #-4]!
-        mov     r4, #1
-        orr     r4, r4, r4, lsl #8
-        orr     r4, r4, r4, lsl #16
-#endif
-        .p2align        2
-4:
-        optpld  r0, #8
-        optpld  r1, #8
-        sub     r2, ip, magic1(r4)
-        cmp     ip, r3
-        itttt   eq
-        /* check for any zero bytes in first word */
-        biceq   r2, r2, ip
-        tsteq   r2, magic2(r4)
-        ldreq   ip, [r0], #4
-        ldreq   r3, [r1], #4
-        beq     4b
-2:
-        /* There's a zero or a different byte in the word */
-        S2HIMEM  r0, ip, #24
-        S2LOMEM  ip, ip, #8
-        cmp     r0, #1
-        it      cs
-        cmpcs   r0, r3, S2HIMEM #24
-        it      eq
-        S2LOMEMEQ r3, r3, #8
-        beq     2b
-        /* On a big-endian machine, r0 contains the desired byte in bits
-        0-7; on a little-endian machine they are in bits 24-31.  In
-        both cases the other bits in r0 are all zero.  For r3 the
-        interesting byte is at the other end of the word, but the
-        other bits are not necessarily zero.  We need a signed result
-        representing the difference in the unsigned bytes, so for the
-        little-endian case we can't just shift the interesting bits
-        up.  */
-#ifdef __ARMEB__
-        sub     r0, r0, r3, lsr #24
-#else
-        and     r3, r3, #255
-#ifdef __thumb2__
-        /* No RSB instruction in Thumb2 */
-        lsr     r0, r0, #24
-        sub     r0, r0, r3
-#else
-        rsb     r0, r3, r0, lsr #24
-#endif
-#endif
-#ifndef __thumb2__
-        ldr     r4, [sp], #4
-#endif
-        RETURN
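On little-endian, the final subtraction is equivalent to this C model
(little_endian_result is an illustrative name; the register roles
follow the comment above):

#include <stdint.h>

/* R0 holds the interesting byte of the first string in bits 24-31
   with all other bits zero; R3 holds the corresponding byte of the
   second string in bits 0-7 with arbitrary bits above.  */
static int little_endian_result(uint32_t r0, uint32_t r3)
{
    return (int) (r0 >> 24) - (int) (r3 & 0xff);
}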
+# if defined (__thumb__) && !defined (__thumb2__)
+/* Thumb1 only variant.  */
+#  include "strcmp-armv4t.S"
+# else
+#  include "strcmp-arm-tiny.S"
+# endif
 
+#elif __ARM_ARCH >= 7
 
-.Lstrcmp_unaligned:
+# ifdef __ARM_FEATURE_SIMD32
+#  include "strcmp-armv7.S"
+# else
+#  include "strcmp-armv7m.S"
+# endif
 
-#if 0
-        /* The assembly code below is based on the following algorithm.  */
-#ifdef __ARMEB__
-#define RSHIFT <<
-#define LSHIFT >>
-#else
-#define RSHIFT >>
-#define LSHIFT <<
-#endif
+#elif __ARM_ARCH >= 6
 
-#define body(shift)							\
-  mask = 0xffffffffU RSHIFT shift;					\
-  w1 = *wp1++;								\
-  w2 = *wp2++;								\
-  do									\
-    {									\
-      t1 = w1 & mask;							\
-      if (__builtin_expect(t1 != w2 RSHIFT shift, 0))			\
-	{								\
-	  w2 RSHIFT= shift;						\
-	  break;							\
-	}								\
-      if (__builtin_expect(((w1 - b1) & ~w1) & (b1 << 7), 0))		\
-	{								\
-	  /* See comment in assembler below re syndrome on big-endian */\
-	  if ((((w1 - b1) & ~w1) & (b1 << 7)) & mask)			\
-	    w2 RSHIFT= shift;						\
-	  else								\
-	    {								\
-	      w2 = *wp2;						\
-	      t1 = w1 RSHIFT (32 - shift);				\
-	      w2 = (w2 LSHIFT (32 - shift)) RSHIFT (32 - shift);	\
-	    }								\
-	  break;							\
-	}								\
-      w2 = *wp2++;							\
-      t1 ^= w1;								\
-      if (__builtin_expect(t1 != w2 LSHIFT (32 - shift), 0))		\
-	{								\
-	  t1 = w1 >> (32 - shift);					\
-	  w2 = (w2 << (32 - shift)) RSHIFT (32 - shift);		\
-	  break;							\
-	}								\
-      w1 = *wp1++;							\
-    } while (1)
-
-  const unsigned* wp1;
-  const unsigned* wp2;
-  unsigned w1, w2;
-  unsigned mask;
-  unsigned shift;
-  unsigned b1 = 0x01010101;
-  char c1, c2;
-  unsigned t1;
-
-  while (((unsigned) s1) & 3)
-    {
-      c1 = *s1++;
-      c2 = *s2++;
-      if (c1 == 0 || c1 != c2)
-	return c1 - (int)c2;
-    }
-  wp1 = (unsigned*) (((unsigned)s1) & ~3);
-  wp2 = (unsigned*) (((unsigned)s2) & ~3);
-  t1 = ((unsigned) s2) & 3;
-  if (t1 == 1)
-    {
-      body(8);
-    }
-  else if (t1 == 2)
-    {
-      body(16);
-    }
-  else
-    {
-      body (24);
-    }
-
-  do
-    {
-#ifdef __ARMEB__
-      c1 = (char) (t1 >> 24);
-      c2 = (char) (w2 >> 24);
-#else /* not  __ARMEB__ */
-      c1 = (char) t1;
-      c2 = (char) w2;
-#endif /* not  __ARMEB__ */
-      t1 RSHIFT= 8;
-      w2 RSHIFT= 8;
-    } while (c1 != 0 && c1 == c2);
-  return c1 - c2;
-#endif /* 0 */
-
-
-        wp1 .req r0
-        wp2 .req r1
-        b1  .req r2
-        w1  .req r4
-        w2  .req r5
-        t1  .req ip
-        @ r3 is scratch
-
-        /* First of all, compare bytes until wp1 (i.e. s1) is word-aligned.  */
-1:
-        tst     wp1, #3
-        beq     2f
-        ldrb    r2, [wp1], #1
-        ldrb    r3, [wp2], #1
-        cmp     r2, #1
-        it      cs
-        cmpcs   r2, r3
-        beq     1b
-        sub     r0, r2, r3
-        RETURN
-
-2:
-        str     r5, [sp, #-4]!
-        str     r4, [sp, #-4]!
-        //stmfd   sp!, {r4, r5}
-        mov     b1, #1
-        orr     b1, b1, b1, lsl #8
-        orr     b1, b1, b1, lsl #16
-
-        and     t1, wp2, #3
-        bic     wp2, wp2, #3
-        ldr     w1, [wp1], #4
-        ldr     w2, [wp2], #4
-        cmp     t1, #2
-        beq     2f
-        bhi     3f
-
-        /* Critical inner Loop: Block with 3 bytes initial overlap */
-        .p2align        2
-1:
-        bic     t1, w1, MSB
-        cmp     t1, w2, S2LOMEM #8
-        sub     r3, w1, b1
-        bic     r3, r3, w1
-        bne     4f
-        ands    r3, r3, b1, lsl #7
-        it      eq
-        ldreq   w2, [wp2], #4
-        bne     5f
-        eor     t1, t1, w1
-        cmp     t1, w2, S2HIMEM #24
-        bne     6f
-        ldr     w1, [wp1], #4
-        b       1b
-4:
-        S2LOMEM        w2, w2, #8
-        b       8f
-
-5:
-#ifdef __ARMEB__
-        /* The syndrome value may contain false ones if the string ends
-        with the bytes 0x01 0x00 */
-        tst     w1, #0xff000000
-        itt     ne
-        tstne   w1, #0x00ff0000
-        tstne   w1, #0x0000ff00
-        beq     7f
-#else
-        bics    r3, r3, #0xff000000
-        bne     7f
-#endif
-        ldrb    w2, [wp2]
-        S2LOMEM  t1, w1, #24
-#ifdef __ARMEB__
-        lsl     w2, w2, #24
-#endif
-        b       8f
+# include "strcmp-armv6.S"
 
-6:
-        S2LOMEM  t1, w1, #24
-        and     w2, w2, LSB
-        b       8f
-
-        /* Critical inner Loop: Block with 2 bytes initial overlap */
-        .p2align        2
-2:
-        S2HIMEM  t1, w1, #16
-        sub     r3, w1, b1
-        S2LOMEM  t1, t1, #16
-        bic     r3, r3, w1
-        cmp     t1, w2, S2LOMEM #16
-        bne     4f
-        ands    r3, r3, b1, lsl #7
-        it      eq
-        ldreq   w2, [wp2], #4
-        bne     5f
-        eor     t1, t1, w1
-        cmp     t1, w2, S2HIMEM #16
-        bne     6f
-        ldr     w1, [wp1], #4
-        b       2b
-
-5:
-#ifdef __ARMEB__
-        /* The syndrome value may contain false ones if the string ends
-        with the bytes 0x01 0x00 */
-        tst     w1, #0xff000000
-        it      ne
-        tstne   w1, #0x00ff0000
-        beq     7f
 #else
-        lsls    r3, r3, #16
-        bne     7f
-#endif
-        ldrh    w2, [wp2]
-        S2LOMEM  t1, w1, #16
-#ifdef __ARMEB__
-        lsl     w2, w2, #16
-#endif
-        b       8f
 
-6:
-        S2HIMEM  w2, w2, #16
-        S2LOMEM  t1, w1, #16
-4:
-        S2LOMEM  w2, w2, #16
-        b       8f
-
-        /* Critical inner Loop: Block with 1 byte initial overlap */
-        .p2align        2
-3:
-        and     t1, w1, LSB
-        cmp     t1, w2, S2LOMEM #24
-        sub     r3, w1, b1
-        bic     r3, r3, w1
-        bne     4f
-        ands    r3, r3, b1, lsl #7
-        it      eq
-        ldreq   w2, [wp2], #4
-        bne     5f
-        eor     t1, t1, w1
-        cmp     t1, w2, S2HIMEM #8
-        bne     6f
-        ldr     w1, [wp1], #4
-        b       3b
-4:
-        S2LOMEM  w2, w2, #24
-        b       8f
-5:
-        /* The syndrome value may contain false ones if the string ends
-        with the bytes 0x01 0x00 */
-        tst     w1, LSB
-        beq     7f
-        ldr     w2, [wp2], #4
-6:
-        S2LOMEM  t1, w1, #8
-        bic     w2, w2, MSB
-        b       8f
-7:
-        mov     r0, #0
-        //ldmfd   sp!, {r4, r5}
-        ldr     r4, [sp], #4
-        ldr     r5, [sp], #4
-        RETURN
-8:
-        and     r2, t1, LSB
-        and     r0, w2, LSB
-        cmp     r0, #1
-        it      cs
-        cmpcs   r0, r2
-        itt     eq
-        S2LOMEMEQ        t1, t1, #8
-        S2LOMEMEQ        w2, w2, #8
-        beq     8b
-        sub     r0, r2, r0
-        //ldmfd   sp!, {r4, r5}
-        ldr     r4, [sp], #4
-        ldr     r5, [sp], #4
-        RETURN
-
-#endif /* !(defined (_ISA_THUMB_2) || defined (_ISA_ARM_6) ||
-            defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) ||
-            (defined (__thumb__) && !defined (__thumb2__))) */
+# include "strcmp-armv4.S"
+
+#endif

