This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.



Optimized ARM string routines


I've recently written the following hand-optimized versions of the
string functions strlen, strcpy and strcmp for ARM (including support
for thumb1 and thumb2).  These are attached for inclusion in newlib.

They should perform significantly better than the C-coded versions,
since they invariably use fewer call-saved registers (saving and
restoring those is a major overhead on short strings).  They also handle
more cases than the C versions before falling back to simple byte loops.
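
For reference, the word-at-a-time NUL detection that the ARM and thumb2
fast paths use is built around the constants 0x01010101 and 0x80808080
(the reference algorithm also appears under #if 0 in strcmp.c).  In C it
amounts to something like the sketch below; the helper name is mine and
the code is purely illustrative:

/* Non-zero whenever the 32-bit word W contains a NUL byte.  Because of
   the XOR form it can also fire on some words without one (a byte equal
   to 0x80), so the assembly only treats a hit as a signal to leave the
   word loop and finish off outside it.  */
static int
word_may_have_nul (unsigned int w)
{
  return (((w - 0x01010101u) ^ w) & 0x80808080u) != 0;
}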

The thumb1 variants are all optimized for space.  With the restricted
instruction set the inner loops become very complex, and a
speed-optimized version would be unlikely to give any significant
performance benefit.
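
In C terms the thumb1 strcmp, for instance, reduces to the byte loop
below (an illustrative equivalent only, not part of the patch):

/* Rough C equivalent of the thumb1 strcmp fallback.  */
int
strcmp_bytewise (const char *s1, const char *s2)
{
  unsigned char c1, c2;

  do
    {
      c1 = (unsigned char) *s1++;
      c2 = (unsigned char) *s2++;
    }
  while (c1 != 0 && c1 == c2);
  return c1 - c2;
}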

R.

<date>  Richard Earnshaw  <rearnsha@arm.com>

	* libc/machine/arm/arm_asm.h: New file.
	* libc/machine/arm/strlen.c: New file.
	* libc/machine/arm/strcpy.c: New file.
	* libc/machine/arm/strcmp.c: New file.
	* libc/machine/arm/Makefile.am: Add new string routines.


Index: Makefile.am
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/arm/Makefile.am,v
retrieving revision 1.6
diff -p -r1.6 Makefile.am
*** Makefile.am	24 May 2007 17:33:31 -0000	1.6
--- Makefile.am	20 Jan 2009 10:06:43 -0000
*************** AM_CCASFLAGS = $(INCLUDES)
*** 8,14 ****
  
  noinst_LIBRARIES = lib.a
  
! lib_a_SOURCES = setjmp.S access.c
  lib_a_CCASFLAGS=$(AM_CCASFLAGS)
  lib_a_CFLAGS = $(AM_CFLAGS)
  
--- 8,14 ----
  
  noinst_LIBRARIES = lib.a
  
! lib_a_SOURCES = setjmp.S access.c strlen.c strcmp.c strcpy.c
  lib_a_CCASFLAGS=$(AM_CCASFLAGS)
  lib_a_CFLAGS = $(AM_CFLAGS)
  
Index: strcmp.c
===================================================================
RCS file: strcmp.c
diff -N strcmp.c
*** /dev/null	1 Jan 1970 00:00:00 -0000
--- strcmp.c	20 Jan 2009 10:06:43 -0000
***************
*** 0 ****
--- 1,404 ----
+ /*
+  * Copyright (c) 2008 ARM Ltd
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions
+  * are met:
+  * 1. Redistributions of source code must retain the above copyright
+  *    notice, this list of conditions and the following disclaimer.
+  * 2. Redistributions in binary form must reproduce the above copyright
+  *    notice, this list of conditions and the following disclaimer in the
+  *    documentation and/or other materials provided with the distribution.
+  * 3. The name of the company may not be used to endorse or promote
+  *    products derived from this software without specific prior written
+  *    permission.
+  *
+  * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+  * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  */
+ 
+ #include "arm_asm.h"
+ #include <_ansi.h>
+ #include <string.h>
+ 
+ #ifdef __ARMEB__
+ #define SHFT2LSB "lsl"
+ #define SHFT2MSB "lsr"
+ #define MSB "0x000000ff"
+ #define LSB "0xff000000"
+ #else
+ #define SHFT2LSB "lsr"
+ #define SHFT2MSB "lsl"
+ #define MSB "0xff000000"
+ #define LSB "0x000000ff"
+ #endif
+ 
+ #ifdef __thumb2__
+ #define magic1(REG) "#0x01010101"
+ #define magic2(REG) "#0x80808080"
+ #else
+ #define magic1(REG) #REG
+ #define magic2(REG) #REG ", lsl #7"
+ #endif
+ 
+ int 
+ __attribute__((naked)) strcmp (const char* s1, const char* s2)
+ {
+   asm(
+ #if !(defined(__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
+       (defined (__thumb__) && !defined (__thumb2__)))
+       "optpld	r0\n\t"
+       "optpld	r1\n\t"
+       "eor	r2, r0, r1\n\t"
+       "tst	r2, #3\n\t"
+       /* Strings not at same byte offset from a word boundary.  */
+       "bne	strcmp_unaligned\n\t"
+       "ands	r2, r0, #3\n\t"
+       "bic	r0, r0, #3\n\t"
+       "bic	r1, r1, #3\n\t"
+       "ldr	ip, [r0], #4\n\t"
+       "it	eq\n\t"
+       "ldreq	r3, [r1], #4\n\t"
+       "beq	1f\n\t"
+       /* Although s1 and s2 have identical initial alignment, they are
+ 	 not currently word aligned.  Rather than comparing bytes,
+ 	 make sure that any bytes fetched from before the addressed
+ 	 bytes are forced to 0xff.  Then they will always compare
+ 	 equal.  */
+       "eor	r2, r2, #3\n\t"
+       "lsl	r2, r2, #3\n\t"
+       "mvn	r3, #"MSB"\n\t"
+       SHFT2LSB"	r2, r3, r2\n\t"
+       "ldr	r3, [r1], #4\n\t"
+       "orr	ip, ip, r2\n\t"
+       "orr	r3, r3, r2\n"
+  "1:\n\t"
+ #ifndef __thumb2__
+       /* Load the 'magic' constant 0x01010101.  */
+       "str	r4, [sp, #-4]!\n\t"
+       "mov	r4, #1\n\t"
+       "orr	r4, r4, r4, lsl #8\n\t"
+       "orr	r4, r4, r4, lsl #16\n"
+ #endif
+       ".p2align	2\n"
+  "4:\n\t"
+       "optpld	r0, #8\n\t"
+       "optpld	r1, #8\n\t"
+       "sub	r2, ip, "magic1(r4)"\n\t"
+       "cmp	ip, r3\n\t"
+       "itttt	eq\n\t"
+       /* check for any zero bytes in first word */
+       "eoreq	r2, r2, ip\n\t"
+       "tsteq	r2, "magic2(r4)"\n\t"
+       "ldreq	ip, [r0], #4\n\t"
+       "ldreq	r3, [r1], #4\n\t"
+       "beq	4b\n"
+  "2:\n\t"
+       /* There's a zero or a different byte in the word */
+       SHFT2MSB"	r0, ip, #24\n\t"
+       SHFT2LSB"	ip, ip, #8\n\t"
+       "cmp	r0, #1\n\t"
+       "it	cs\n\t"
+       "cmpcs	r0, r3, "SHFT2MSB" #24\n\t"
+       "it	eq\n\t"
+       SHFT2LSB"eq r3, r3, #8\n\t"
+       "beq	2b\n\t"
+       "sub	r0, r0, r3, "SHFT2MSB" #24\n\t"
+ #ifndef __thumb2__
+       "ldr	r4, [sp], #4\n\t"
+ #endif
+       "RETURN"
+ #elif (defined (__thumb__) && !defined (__thumb2__))
+   "1:\n\t"
+       "ldrb	r2, [r0]\n\t"
+       "ldrb	r3, [r1]\n\t"
+       "add	r0, r0, #1\n\t"
+       "add	r1, r1, #1\n\t"
+       "cmp	r2, #0\n\t"
+       "beq	2f\n\t"
+       "cmp	r2, r3\n\t"
+       "beq	1b\n\t"
+   "2:\n\t"
+       "sub	r0, r2, r3\n\t"
+       "bx	lr"
+ #else
+  "3:\n\t"
+       "ldrb	r2, [r0], #1\n\t"
+       "ldrb	r3, [r1], #1\n\t"
+       "cmp	r2, #1\n\t"
+       "it	cs\n\t"
+       "cmpcs	r2, r3\n\t"
+       "beq	3b\n\t"
+       "sub	r0, r2, r3\n\t"
+       "RETURN"
+ #endif
+       );
+ }
+ 
+ #if !(defined(__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
+       (defined (__thumb__) && !defined (__thumb2__)))
+ static int __attribute__((naked, used)) 
+ strcmp_unaligned(const char* s1, const char* s2)
+ {
+ #if 0
+   /* The assembly code below is based on the following algorithm.  */
+ #ifdef __ARMEB__
+ #define RSHIFT <<
+ #define LSHIFT >>
+ #else
+ #define RSHIFT >>
+ #define LSHIFT <<
+ #endif
+ 
+ #define body(shift)							\
+   mask = 0xffffffffU RSHIFT shift;					\
+   w1 = *wp1++;								\
+   w2 = *wp2++;								\
+   do									\
+     {									\
+       t1 = w1 & mask;							\
+       if (__builtin_expect(t1 != w2 RSHIFT shift, 0))			\
+ 	{								\
+ 	  w2 RSHIFT= shift;						\
+ 	  break;							\
+ 	}								\
+       if (__builtin_expect(((w1 - b1) ^ w1) & (b1 << 7), 0))		\
+ 	{								\
+ 	  if ((((w1 - b1) ^ w1) & (b1 << 7)) & mask)			\
+ 	    w2 RSHIFT= shift;						\
+ 	  else								\
+ 	    {								\
+ 	      w2 = *wp2;						\
+ 	      t1 = w1 RSHIFT (32 - shift);				\
+ 	      w2 = (w2 LSHIFT (32 - shift)) RSHIFT (32 - shift);	\
+ 	    }								\
+ 	  break;							\
+ 	}								\
+       w2 = *wp2++;							\
+       t1 ^= w1;								\
+       if (__builtin_expect(t1 != w2 LSHIFT (32 - shift), 0))		\
+ 	{								\
+ 	  t1 = w1 >> (32 - shift);					\
+ 	  w2 = (w2 << (32 - shift)) RSHIFT (32 - shift);		\
+ 	  break;							\
+ 	}								\
+       w1 = *wp1++;							\
+     } while (1)
+ 
+   const unsigned* wp1;
+   const unsigned* wp2;
+   unsigned w1, w2;
+   unsigned mask;
+   unsigned shift;
+   unsigned b1 = 0x01010101;
+   char c1, c2;
+   unsigned t1;
+ 
+   while (((unsigned) s1) & 3)
+     {
+       c1 = *s1++;
+       c2 = *s2++;
+       if (c1 == 0 || c1 != c2)
+ 	return c1 - (int)c2;
+     }
+   wp1 = (unsigned*) (((unsigned)s1) & ~3);
+   wp2 = (unsigned*) (((unsigned)s2) & ~3);
+   t1 = ((unsigned) s2) & 3;
+   if (t1 == 1)
+     {
+       body(8);
+     }
+   else if (t1 == 2)
+     {
+       body(16);
+     }
+   else
+     {
+       body (24);
+     }
+   
+   do
+     {
+ #ifdef __ARMEB__
+       c1 = (char) (t1 >> 24);
+       c2 = (char) (w2 >> 24);
+ #else
+       c1 = (char) t1;
+       c2 = (char) w2;
+ #endif
+       t1 RSHIFT= 8;
+       w2 RSHIFT= 8;
+     } while (c1 != 0 && c1 == c2);
+   return c1 - c2;
+ #endif
+ 
+   asm("wp1 .req r0\n\t"
+       "wp2 .req r1\n\t"
+       "b1  .req r2\n\t"
+       "w1  .req r4\n\t"
+       "w2  .req r5\n\t"
+       "t1  .req ip\n\t"
+       "@ r3 is scratch\n"
+ 
+       /* First of all, compare bytes until wp1 (s1) is word-aligned. */
+  "1:\n\t"
+       "tst	wp1, #3\n\t"
+       "beq	2f\n\t"
+       "ldrb	r2, [wp1], #1\n\t"
+       "ldrb	r3, [wp2], #1\n\t"
+       "cmp	r2, #1\n\t"
+       "it	cs\n\t"
+       "cmpcs	r2, r3\n\t"
+       "beq	1b\n\t"
+       "sub	r0, r2, r3\n\t"
+       "RETURN\n"
+ 
+  "2:\n\t"
+       "str	r5, [sp, #-4]!\n\t"
+       "str	r4, [sp, #-4]!\n\t"
+       //      "stmfd	sp!, {r4, r5}\n\t"
+       "mov	b1, #1\n\t"
+       "orr	b1, b1, b1, lsl #8\n\t"
+       "orr	b1, b1, b1, lsl #16\n\t"
+ 
+       "and	t1, wp2, #3\n\t"
+       "bic	wp2, wp2, #3\n\t"
+       "ldr	w1, [wp1], #4\n\t"
+       "ldr	w2, [wp2], #4\n\t"
+       "cmp	t1, #2\n\t"
+       "beq	2f\n\t"
+       "bhi	3f\n"
+ 
+       /* Critical inner Loop: Block with 3 bytes initial overlap */
+       ".p2align	2\n"
+  "1:\n\t"
+       "bic	t1, w1, #"MSB"\n\t"
+       "cmp	t1, w2, "SHFT2LSB" #8\n\t"
+       "sub	r3, w1, b1\n\t"
+       "eor	r3, r3, w1\n\t"
+       "bne	4f\n\t"
+       "ands	r3, r3, b1, lsl #7\n\t"
+       "it	eq\n\t"
+       "ldreq	w2, [wp2], #4\n\t"
+       "bne	5f\n\t"
+       "eor	t1, t1, w1\n\t"
+       "cmp	t1, w2, "SHFT2MSB" #24\n\t"
+       "bne	6f\n\t"
+       "ldr	w1, [wp1], #4\n\t"
+       "b	1b\n"
+  "4:\n\t"
+       SHFT2LSB"	w2, w2, #8\n\t"
+       "b	8f\n"
+ 
+  "5:\n\t"
+       "bics	r3, r3, #"MSB"\n\t"
+       "bne	7f\n\t"
+       "ldrb	w2, [wp2]\n\t"
+       SHFT2LSB"	t1, w1, #24\n\t"
+ #ifdef __ARMEB__
+       SHFT2LSB"	w2, w2, #24\n\t"
+ #endif
+       "b	8f\n"
+ 
+  "6:\n\t"
+       SHFT2LSB"	t1, w1, #24\n\t"
+       "and	w2, w2, #"LSB"\n\t"
+       "b	8f\n"
+ 
+       /* Critical inner Loop: Block with 2 bytes initial overlap */
+       ".p2align	2\n"
+  "2:\n\t"
+       SHFT2MSB"	t1, w1, #16\n\t"
+       "sub	r3, w1, b1\n\t"
+       SHFT2LSB"	t1, t1, #16\n\t"
+       "eor	r3, r3, w1\n\t"
+       "cmp	t1, w2, "SHFT2LSB" #16\n\t"
+       "bne	4f\n\t"
+       "ands	r3, r3, b1, lsl #7\n\t"
+       "it	eq\n\t"
+       "ldreq	w2, [wp2], #4\n\t"
+       "bne	5f\n\t"
+       "eor	t1, t1, w1\n\t"
+       "cmp	t1, w2, "SHFT2MSB" #16\n\t"
+       "bne	6f\n\t"
+       "ldr	w1, [wp1], #4\n\t"
+       "b	2b\n"
+ 
+  "5:\n\t"
+       SHFT2MSB"s	r3, r3, #16\n\t"
+       "bne	7f\n\t"
+       "ldrh	w2, [wp2]\n\t"
+       SHFT2LSB"	t1, w1, #16\n\t"
+ #ifdef __ARMEB__
+       SHFT2LSB"	w2, w2, #16\n\t"
+ #endif
+       "b	8f\n"
+ 
+  "6:\n\t"
+       SHFT2MSB"	w2, w2, #16\n\t"
+       SHFT2LSB"	t1, w1, #16\n\t"
+  "4:\n\t"
+       SHFT2LSB"	w2, w2, #16\n\t"
+       "b	8f\n\t"
+ 
+       /* Critical inner Loop: Block with 1 byte initial overlap */
+       ".p2align	2\n"
+  "3:\n\t"
+       "and	t1, w1, #"LSB"\n\t"
+       "cmp	t1, w2, "SHFT2LSB" #24\n\t"
+       "sub	r3, w1, b1\n\t"
+       "eor	r3, r3, w1\n\t"
+       "bne	4f\n\t"
+       "ands	r3, r3, b1, lsl #7\n\t"
+       "it	eq\n\t"
+       "ldreq	w2, [wp2], #4\n\t"
+       "bne	5f\n\t"
+       "eor	t1, t1, w1\n\t"
+       "cmp	t1, w2, "SHFT2MSB" #8\n\t"
+       "bne	6f\n\t"
+       "ldr	w1, [wp1], #4\n\t"
+       "b	3b\n"
+  "4:\n\t"
+       SHFT2LSB"	w2, w2, #24\n\t"
+       "b	8f\n"
+  "5:\n\t"
+       "tst	r3, #128\n\t"
+       "bne	7f\n\t"
+       "ldr	w2, [wp2], #4\n"
+  "6:\n\t"
+       SHFT2LSB"	t1, w1, #8\n\t"
+       "bic	w2, w2, #"MSB"\n\t"
+       "b	8f\n"
+  "7:\n\t"
+       "mov	r0, #0\n\t"
+       //      "ldmfd	sp!, {r4, r5}\n\t"
+       "ldr	r4, [sp], #4\n\t"
+       "ldr	r5, [sp], #4\n\t"
+       "RETURN\n"
+  "8:\n\t"
+       "and	r2, t1, #"LSB"\n\t"
+       "and	r0, w2, #"LSB"\n\t"
+       "cmp	r0, #1\n\t"
+       "it	cs\n\t"
+       "cmpcs	r0, r2\n\t"
+       "itt	eq\n\t"
+       SHFT2LSB"eq	t1, t1, #8\n\t"
+       SHFT2LSB"eq	w2, w2, #8\n\t"
+       "beq	8b\n\t"
+       "sub	r0, r2, r0\n\t"
+       //      "ldmfd	sp!, {r4, r5}\n\t"
+       "ldr	r4, [sp], #4\n\t"
+       "ldr	r5, [sp], #4\n\t"
+       "RETURN");
+ }
+ 
+ #endif
Index: strcpy.c
===================================================================
RCS file: strcpy.c
diff -N strcpy.c
*** /dev/null	1 Jan 1970 00:00:00 -0000
--- strcpy.c	20 Jan 2009 10:06:43 -0000
***************
*** 0 ****
--- 1,170 ----
+ /*
+  * Copyright (c) 2008 ARM Ltd
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions
+  * are met:
+  * 1. Redistributions of source code must retain the above copyright
+  *    notice, this list of conditions and the following disclaimer.
+  * 2. Redistributions in binary form must reproduce the above copyright
+  *    notice, this list of conditions and the following disclaimer in the
+  *    documentation and/or other materials provided with the distribution.
+  * 3. The name of the company may not be used to endorse or promote
+  *    products derived from this software without specific prior written
+  *    permission.
+  *
+  * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+  * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  */
+ 
+ #include "arm_asm.h"
+ #include <_ansi.h>
+ #include <string.h>
+ 
+ #ifdef __thumb2__
+ #define magic1(REG) "#0x01010101"
+ #define magic2(REG) "#0x80808080"
+ #else
+ #define magic1(REG) #REG
+ #define magic2(REG) #REG ", lsl #7"
+ #endif
+ 
+ char* __attribute__((naked))
+ strcpy (char* dst, const char* src)
+ {
+   asm (
+ #if !(defined(__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
+       (defined (__thumb__) && !defined (__thumb2__)))
+        "optpld	r1\n\t"
+        "eor	r2, r0, r1\n\t"
+        "mov	ip, r0\n\t"
+        "tst	r2, #3\n\t"
+        "bne	4f\n\t"
+        "tst	r1, #3\n\t"
+        "bne	3f\n"
+   "5:\n\t"
+ #ifndef __thumb2__
+        "str	r5, [sp, #-4]!\n\t"
+        "mov	r5, #0x01\n\t"
+        "orr	r5, r5, r5, lsl #8\n\t"
+        "orr	r5, r5, r5, lsl #16\n\t"
+ #endif
+ 
+        "str	r4, [sp, #-4]!\n\t"
+        "tst	r1, #4\n\t"
+        "ldr	r4, [r1], #4\n\t"
+        "beq	2f\n\t"
+        "sub	r2, r4, "magic1(r5)"\n\t"
+        "eors	r2, r2, r4\n\t"
+        "tst	r2, "magic2(r5)"\n\t"
+        "it	eq\n\t"
+        "ldreq	r3, [r1], #4\n"
+        "bne	6f\n\t"
+        "str	r4, [ip], #4\n\t"
+        /* Inner loop.  We now know that r1 is 64-bit aligned, so we
+ 	  can safely fetch up to two words.  This allows us to avoid
+ 	  load stalls.  */
+        ".p2align 2\n"
+   "2:\n\t"
+        "optpld	r1, #8\n\t"
+        "ldr	r4, [r1], #4\n\t"
+        "sub	r2, r3, "magic1(r5)"\n\t"
+        "eors	r2, r2, r3\n\t"
+        "tst	r2, "magic2(r5)"\n\t"
+        "sub	r2, r4, "magic1(r5)"\n\t"
+        "bne	1f\n\t"
+        "str	r3, [ip], #4\n\t"
+        "eors	r2, r2, r4\n\t"
+        "tst	r2, "magic2(r5)"\n\t"
+        "itt	eq\n\t"
+        "ldreq	r3, [r1], #4\n\t"
+        "streq	r4, [ip], #4\n\t"
+        "beq	2b\n\t"
+   "6:\n\t"
+        "mov	r3, r4\n"
+   "1:\n\t"
+ #ifdef __ARMEB__
+        "rors	r3, r3 #24\n\t"
+ #endif
+        "strb	r3, [ip], #1\n\t"
+        "tst	r3, #0xff\n\t"
+ #ifdef __ARMEL__
+        "ror	r3, r3, #8\n\t"
+ #endif
+        "bne	1b\n\t"
+        "ldr	r4, [sp], #4\n\t"
+ #ifndef __thumb2__
+        "ldr	r5, [sp], #4\n\t"
+ #endif
+        "RETURN\n"
+ 
+        /* Strings have the same offset from word alignment, but it's
+ 	  not zero.  */
+   "3:\n\t"
+        "tst	r1, #1\n\t"
+        "beq	1f\n\t"
+        "ldrb	r2, [r1], #1\n\t"
+        "strb	r2, [ip], #1\n\t"
+        "cmp	r2, #0\n\t"
+        "it	eq\n"
+        "RETURN	eq\n"
+   "1:\n\t"
+        "tst	r1, #2\n\t"
+        "beq	5b\n\t"
+        "ldrh	r2, [r1], #2\n\t"
+ #ifdef __ARMEB__
+        "tst	r2, #0xff00\n\t"
+        "iteet	ne\n\t"
+        "strneh	r2, [ip], #2\n\t"
+        "lsreq	r2, r2, #8\n\t"
+        "streqb	r2, [ip]\n\t"
+        "tstne	r2, #0xff\n\t"
+ #else
+        "tst	r2, #0xff\n\t"
+        "itet	ne\n\t"
+        "strneh	r2, [ip], #2\n\t"
+        "streqb	r2, [ip]\n\t"
+        "tstne	r2, #0xff00\n\t"
+ #endif
+        "bne	5b\n\t"
+        "RETURN\n"
+ 
+        /* src and dst do not have a common word-alignment.  Fall back to
+ 	  byte copying.  */
+   "4:\n\t"
+        "ldrb	r2, [r1], #1\n\t"
+        "strb	r2, [ip], #1\n\t"
+        "cmp	r2, #0\n\t"
+        "bne	4b\n\t"
+        "RETURN"
+ 
+ #elif !defined (__thumb__) || defined (__thumb2__)
+        "mov	r3, r0\n\t"
+   "1:\n\t"
+        "ldrb	r2, [r1], #1\n\t"
+        "strb	r2, [r3], #1\n\t"
+        "cmp	r2, #0\n\t"
+        "bne	1b\n\t"
+        "RETURN"
+ #else
+        "mov	r3, r0\n\t"
+   "1:\n\t"
+        "ldrb	r2, [r1]\n\t"
+        "add	r1, r1, #1\n\t"
+        "strb	r2, [r3]\n\t"
+        "add	r3, r3, #1\n\t"
+        "cmp	r2, #0\n\t"
+        "bne	1b\n\t"
+        "RETURN"
+ #endif
+        );
+ }
Index: strlen.c
===================================================================
RCS file: strlen.c
diff -N strlen.c
*** /dev/null	1 Jan 1970 00:00:00 -0000
--- strlen.c	20 Jan 2009 10:06:43 -0000
***************
*** 0 ****
--- 1,177 ----
+ /*
+  * Copyright (c) 2008 ARM Ltd
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions
+  * are met:
+  * 1. Redistributions of source code must retain the above copyright
+  *    notice, this list of conditions and the following disclaimer.
+  * 2. Redistributions in binary form must reproduce the above copyright
+  *    notice, this list of conditions and the following disclaimer in the
+  *    documentation and/or other materials provided with the distribution.
+  * 3. The name of the company may not be used to endorse or promote
+  *    products derived from this software without specific prior written
+  *    permission.
+  *
+  * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+  * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  */
+ 
+ #include "arm_asm.h"
+ #include <_ansi.h>
+ #include <string.h>
+ #include <limits.h>
+ 
+ #if defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
+   (defined (__thumb__) && !defined (__thumb2__))
+ 
+ size_t
+ strlen (const char* str)
+ {
+   int scratch;
+ #if defined (__thumb__) && !defined (__thumb2__)
+   size_t len;
+   asm ("mov	%0, #0\n"
+        "1:\n\t"
+        "ldrb	%1, [%2, %0]\n\t"
+        "add 	%0, %0, #1\n\t"
+        "cmp	%1, #0\n\t"
+        "bne	1b"
+        : "=&r" (len), "=&r" (scratch) : "r" (str) : "memory", "cc");
+   return len - 1;
+ #else
+   const char* end;
+   asm ("1:\n\t"
+        "ldrb	%1, [%0], #1\n\t"
+        "cmp	%1, #0\n\t"
+        "bne	1b"
+        : "=&r" (end), "=&r" (scratch) : "0" (str) : "memory", "cc");
+   return end - str - 1;
+ #endif
+ }
+ #else
+ 
+ size_t __attribute__((naked))
+ strlen (const char* str)
+ {
+   asm ("len .req r0\n\t"
+        "data .req r3\n\t"
+        "addr .req r1\n\t"
+ 
+        "optpld r0\n\t"
+        /* Word-align address */
+        "bic	addr, r0, #3\n\t"
+        /* Get adjustment for start ... */
+        "ands	len, r0, #3\n\t"
+        "neg	len, len\n\t"
+        /* First word of data */
+        "ldr	data, [addr], #4\n\t"
+        /* Ensure bytes preceding start ... */
+        "add	ip, len, #4\n\t"
+        "mov	ip, ip, asl #3\n\t"
+        "mvn	r2, #0\n\t"
+        "it	ne\n\t"
+        /* ... are masked out */
+ #ifdef __thumb__
+ # ifdef __ARMEB__
+        "lslne	r2, ip\n\t"
+ # else
+        "lsrne	r2, ip\n\t"
+ # endif
+        "orr	data, data, r2\n\t"
+ #else
+ # ifdef __ARMEB__
+        "orrne	data, data, r2, lsl ip\n\t"
+ # else
+        "orrne	data, data, r2, lsr ip\n\t"
+ # endif
+ #endif
+        /* Magic const 0x01010101 */
+ #ifdef _ISA_ARM_7
+        "movw	ip, #0x101\n\t"
+ #else
+        "mov	ip, #0x1\n\t"
+        "orr	ip, ip, ip, lsl #8\n\t"
+ #endif
+        "orr	ip, ip, ip, lsl #16\n"
+ 
+ 	/* This is the main loop.  We subtract one from each byte in the
+ 	   word: the sign bit changes iff the byte was zero.  */
+        "1:\n\t"
+        /* test (data - 0x01010101)  */
+        "sub	r2, data, ip\n\t"
+        /* ... ^ data */
+        "eor	r2, r2, data\n\t"
+        /* ... & 0x80808080 == 0? */
+        "ands	r2, r2, ip, lsl #7\n\t"
+ #ifdef _ISA_ARM_7
+        /* yes, get more data... */
+        "itt	eq\n\t"
+        "ldreq	data, [addr], #4\n\t"
+        /* and 4 more bytes  */
+        "addeq	len, len, #4\n\t"
+ 	/* If we have PLD, then unroll the loop a bit.  */
+        "optpld addr, #8\n\t"
+        /*  test (data - 0x01010101)  */
+        "ittt	eq\n\t"
+        "subeq	r2, data, ip\n\t"
+        /* ... ^ data */
+        "eoreq	r2, r2, data\n\t"
+        /* ... & 0x80808080 == 0? */
+        "andeqs	r2, r2, ip, lsl #7\n\t"
+ #endif
+        "itt	eq\n\t"
+        /* yes, get more data... */
+        "ldreq	data, [addr], #4\n\t"
+        /* and 4 more bytes  */
+        "addeq	len, len, #4\n\t"
+        "beq	1b\n\t"
+ #ifdef __ARMEB__
+        "tst	data, #0xff000000\n\t"
+        "itttt	ne\n\t"
+        "addne	len, len, #1\n\t"
+        "tstne	data, #0xff0000\n\t"
+        "addne	len, len, #1\n\t"
+        "tstne	data, #0xff00\n\t"
+        "it	ne\n\t"
+        "addne	len, len, #1\n\t"
+ #else
+ # ifdef _ISA_ARM_5
+ 	/* R2 is the residual sign bits from the above test.  All we
+ 	need to do now is establish the position of the first zero
+ 	byte... */
+ 	/* Little-endian is harder: we need the number of trailing
+ 	zeros / 8 */
+ #  ifdef _ISA_ARM_7
+        "rbit	r2, r2\n\t"
+        "clz	r2, r2\n\t"
+ #  else
+        "rsb	r1, r2, #0\n\t"
+        "and	r2, r2, r1\n\t"
+        "clz	r2, r2\n\t"
+        "rsb	r2, r2, #31\n\t"
+ #  endif
+        "add	len, len, r2, lsr #3\n\t"
+ # else  /* No CLZ instruction */
+        "tst	data, #0xff\n\t"
+        "itttt	ne\n\t"
+        "addne	len, len, #1\n\t"
+        "tstne	data, #0xff00\n\t"
+        "addne	len, len, #1\n\t"
+        "tstne	data, #0xff0000\n\t"
+        "it	ne\n\t"
+        "addne	len, len, #1\n\t"
+ # endif
+ #endif
+        "RETURN");
+ }
+ #endif
Index: arm_asm.h
===================================================================
RCS file: arm_asm.h
diff -N arm_asm.h
*** /dev/null	1 Jan 1970 00:00:00 -0000
--- arm_asm.h	20 Jan 2009 10:06:43 -0000
***************
*** 0 ****
--- 1,81 ----
+ /*
+  * Copyright (c) 2009 ARM Ltd
+  * All rights reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions
+  * are met:
+  * 1. Redistributions of source code must retain the above copyright
+  *    notice, this list of conditions and the following disclaimer.
+  * 2. Redistributions in binary form must reproduce the above copyright
+  *    notice, this list of conditions and the following disclaimer in the
+  *    documentation and/or other materials provided with the distribution.
+  * 3. The name of the company may not be used to endorse or promote
+  *    products derived from this software without specific prior written
+  *    permission.
+  *
+  * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+  * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  */
+ 
+ #ifndef ARM_ASM__H
+ #define ARM_ASM__H
+ 
+ /* First define some macros that keep everything else sane.  */
+ #if defined (__ARM_ARCH_7A__) || defined (__ARM_ARCH_7R__)
+ #define _ISA_ARM_7
+ #endif
+ 
+ #if defined (_ISA_ARM_7) || defined (__ARM_ARCH_6__) || \
+     defined (__ARM_ARCH_6J__) || defined (__ARM_ARCH_6T2__) || \
+     defined (__ARM_ARCH_6K__) || defined (__ARM_ARCH_6ZK__) || \
+     defined (__ARM_ARCH_6Z__)
+ #define _ISA_ARM_6
+ #endif
+ 
+ #if defined (_ISA_ARM_6) || defined (__ARM_ARCH_5__) || \
+     defined (__ARM_ARCH_5T__) || defined (__ARM_ARCH_5TE__) || \
+     defined (__ARM_ARCH_5TEJ__)
+ #define _ISA_ARM_5
+ #endif
+ 
+ #if defined (_ISA_ARM_5) || defined (__ARM_ARCH_4T__)
+ #define _ISA_ARM_4T
+ #endif
+ 
+ #if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7__)
+ #define _ISA_THUMB_2
+ #endif
+ 
+ #if defined (_ISA_THUMB_2) || defined (__ARM_ARCH_6M__)
+ #define _ISA_THUMB_1
+ #endif
+ 
+ 
+ /* Now some macros for common instruction sequences.  */
+ 
+ asm(".macro  RETURN	cond=\n\t"
+ #if defined (_ISA_ARM_4T) || defined (_ISA_THUMB_1)
+     "bx\\cond	lr\n\t"
+ #else
+     "mov\\cond	pc, lr\n\t"
+ #endif
+     ".endm"
+     );
+ 
+ asm(".macro optpld	base, offset=#0\n\t"
+ #if defined (_ISA_ARM_7)
+     "pld	[\\base, \\offset]\n\t"
+ #endif
+     ".endm"
+     );
+ 
+ #endif /* ARM_ASM__H */
