strlen-armv7.S: Import latest strlen cortex-strings code.

author Will Newton <will.newton@linaro.org>

Fri, 21 Jun 2013 09:10:37 +0000 (09:10 +0000)

committer Will Newton <will.newton@linaro.org>

Fri, 21 Jun 2013 09:10:37 +0000 (09:10 +0000)
author Will Newton <will.newton@linaro.org>
Fri, 21 Jun 2013 09:10:37 +0000 (09:10 +0000)
committer Will Newton <will.newton@linaro.org>
Fri, 21 Jun 2013 09:10:37 +0000 (09:10 +0000)
diff --git a/newlib/ChangeLog b/newlib/ChangeLog

index 35fd4a0802eabdfae3764c86855dd3d7c4dfc47d..e4ac48eaeac5c2f2982d888966c3be8d4c0a54e3 100644 (file)
--- a/newlib/ChangeLog
+++ b/newlib/ChangeLog
@@ -1,3 +1,8 @@
+2013-06-21  Will Newton  <will.newton@linaro.org>
+
+       * libc/machine/arm/strlen-armv7.S: Import latest strlen
+       code from Linaro cortex-strings.
+
  2013-06-21  Will Newton  <will.newton@linaro.org>
  
         * MAINTAINERS: Add Will Newton to Write After Approval.
diff --git a/newlib/libc/machine/arm/strlen-armv7.S b/newlib/libc/machine/arm/strlen-armv7.S

index d6e2831ff980d3edf5d7d8f321187c532ed80543..1aa51c9fbe235014dcc7d1c4cc895d3dc6de22f8 100644 (file)
--- a/newlib/libc/machine/arm/strlen-armv7.S
+++ b/newlib/libc/machine/arm/strlen-armv7.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2011, Linaro Limited
+/* Copyright (c) 2010-2011,2013 Linaro Limited
     All rights reserved.
  
     Redistribution and use in source and binary forms, with or without
@@ -28,100 +28,130 @@
     (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  
-   Written by Dave Gilbert <david.gilbert@linaro.org>
-
-   This strlen routine is optimised on a Cortex-A9 and should work on
-   all ARMv7 processors.   This routine is reasonably fast for short
-   strings, but is probably slower than a simple implementation if all
-   your strings are very short */
-
-@ 2011-02-08 david.gilbert@linaro.org
-@    Extracted from local git 6848613a
-@ 2011-10-13 david.gilbert@linaro.org
-@    Extracted from cortex-strings bzr rev 63
-@      Integrate to newlib, flip to ldrd
-@      Pull in Endian macro from my memchr
+   Assumes:
+   ARMv6T2, AArch32
+ */
  
  #include "arm_asm.h"
  
-@ NOTE: This ifdef MUST match the ones in arm/strlen.c
-@ We fallback to the one in arm/strlen.c for size optimised or
-@ for older arch's
+/* NOTE: This ifdef MUST match the ones in arm/strlen.c.
+   We fall back to the one in arm/strlen.c for size-optimised builds or
+   for older architectures. */
  #if defined(_ISA_ARM_7) || defined(__ARM_ARCH_6T2__) && \
      !(defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
        (defined (__thumb__) && !defined (__thumb2__)))
  
-@ this lets us check a flag in a 00/ff byte easily in either endianness
+       .macro def_fn f p2align=0               /* f = symbol to define; p2align = log2 of its alignment (default 0).  */
+       .text
+       .p2align \p2align
+       .global \f
+       .type \f, %function
+\f:
+       .endm
+
  #ifdef __ARMEB__
-#define CHARTSTMASK(c) 1<<(31-(c*8))
+#define S2LO           lsl             /* Big-endian: lsl moves bits toward lower-addressed bytes.  */
+#define S2HI           lsr             /* Big-endian: lsr moves bits toward higher-addressed bytes.  */
  #else
-#define CHARTSTMASK(c) 1<<(c*8)
+#define S2LO           lsr             /* Little-endian: lsr moves bits toward lower-addressed bytes.  */
+#define S2HI           lsl             /* Little-endian: lsl moves bits toward higher-addressed bytes.  */
  #endif
  
-@------------------------------------------------------------------------------
+       /* This code requires Thumb.  */
+       .thumb
         .syntax unified
-       .arch armv7-a
-
-       .thumb_func
-       .align 2
-       .p2align 4,,15
-       .global strlen
-       .type strlen,%function
-strlen:
-       @ r0 = string
-       @ returns count of bytes in string not including terminator
-       mov     r1, r0
-       push    { r4,r6 }
-       mvns    r6, #0          @ all F
-       movs    r4, #0
-       tst     r0, #7
-       beq     2f
-
-1:
-       ldrb    r2, [r1], #1
-       tst     r1, #7          @ Hit alignment yet?
-       cbz     r2, 10f         @ Exit if we found the 0
-       bne     1b
-
-       @ So we're now aligned
-2:
-       ldrd    r2,r3,[r1],#8
-       uadd8   r2, r2, r6      @ Par add 0xff - sets the GE bits for bytes!=0
-       sel     r2, r4, r6      @ bytes are 00 for none-00 bytes,
-                               @ or ff for 00 bytes - NOTE INVERSION
-       uadd8   r3, r3, r6      @ Par add 0xff - sets the GE bits for bytes!=0
-       sel     r3, r2, r6      @ chained...bytes are 00 for none-00 bytes,
-                               @ or ff for 00 bytes - NOTE INVERSION
-       cmp     r3, #0
-       beq     2b
-
-strlenendtmp:
-       @ One (or more) of the bytes we loaded was 0 - but which one?
-       @ r2 has the mask corresponding to the first loaded word
-       @ r3 has a combined mask of the two words - but if r2 was all-non 0 
-       @ then it's just the 2nd words
-       cmp     r2, #0
-       itte    eq
-       moveq   r2, r3          @ the end is in the 2nd word
-       subeq   r1,r1,#3
-       subne   r1,r1,#7
-
-       @ r1 currently points to the 2nd byte of the word containing the 0
-       tst     r2, # CHARTSTMASK(0)    @ 1st character
-       bne     10f
-       adds    r1,r1,#1
-       tst     r2, # CHARTSTMASK(1)    @ 2nd character
-       ittt    eq
-       addeq   r1,r1,#1
-       tsteq   r2, # (3<<15)   @ 2nd & 3rd character
-       @ If not the 3rd must be the last one
-       addeq   r1,r1,#1
-
-10:
-       @ r0 is still at the beginning, r1 is pointing 1 byte after the nul
-       sub     r0, r1, r0
-       subs    r0, r0, #1
-       pop     { r4, r6 }
+
+/* Parameters and result: r0 holds the string pointer on entry and the length on return.  */
+#define srcin          r0
+#define result         r0              /* Same register as srcin; srcin is not read after result is first written.  */
+
+/* Internal variables.  */
+#define src            r1              /* Read pointer, kept 8-byte aligned.  */
+#define data1a         r2              /* First word of each ldrd pair, then its NUL mask.  */
+#define data1b         r3              /* Second word of each ldrd pair, then the combined NUL mask.  */
+#define const_m1       r12             /* All ones (0xffffffff).  */
+#define const_0                r4              /* Zero; written only once tmp1 is no longer needed.  */
+#define tmp1           r4              /* Overlaps const_0: offset of srcin within its 8-byte block.  */
+#define tmp2           r5              /* Misaligned path only: bit shift count, then byte mask.  */
+
+def_fn strlen p2align=6
+       pld     [srcin, #0]
+       strd    r4, r5, [sp, #-8]!              /* Save r4/r5; reloaded just before returning.  */
+       bic     src, srcin, #7                  /* src = srcin rounded down to an 8-byte boundary.  */
+       mvn     const_m1, #0                    /* const_m1 = 0xffffffff.  */
+       ands    tmp1, srcin, #7         /* tmp1 = bytes srcin lies past that boundary; Z set if already aligned.  */
+       pld     [src, #32]
+       bne.w   .Lmisaligned8                   /* Unaligned start: neutralise the leading bytes first.  */
+       mov     const_0, #0
+       mov     result, #-8                     /* Cancelled by the add of 8 at the top of the loop.  */
+.Lloop_aligned:
+       /* Bytes 0-7.  After the add, result = (address of this pair) - srcin.  */
+       ldrd    data1a, data1b, [src]
+       pld     [src, #64]
+       add     result, result, #8
+.Lstart_realigned:
+       uadd8   data1a, data1a, const_m1        /* Add 0xff to each byte: GE<n> set iff byte n != 0.  */
+       sel     data1a, const_0, const_m1       /* Per byte: 0x00 where GE set, 0xff where the byte was NUL.  */
+       uadd8   data1b, data1b, const_m1
+       sel     data1b, data1a, const_m1        /* Non-NUL bytes copy data1a's mask; located from only if d1a == 0.  */
+       cbnz    data1b, .Lnull_found            /* Non-zero mask: a NUL lies somewhere in these 8 bytes.  */
+
+       /* Bytes 8-15.  */
+       ldrd    data1a, data1b, [src, #8]
+       uadd8   data1a, data1a, const_m1        /* GE<n> set iff byte n != 0.  */
+       add     result, result, #8
+       sel     data1a, const_0, const_m1       /* Select based on GE<0:3>.  */
+       uadd8   data1b, data1b, const_m1
+       sel     data1b, data1a, const_m1        /* Only used if d1a == 0.  */
+       cbnz    data1b, .Lnull_found
+
+       /* Bytes 16-23.  */
+       ldrd    data1a, data1b, [src, #16]
+       uadd8   data1a, data1a, const_m1        /* GE<n> set iff byte n != 0.  */
+       add     result, result, #8
+       sel     data1a, const_0, const_m1       /* Select based on GE<0:3>.  */
+       uadd8   data1b, data1b, const_m1
+       sel     data1b, data1a, const_m1        /* Only used if d1a == 0.  */
+       cbnz    data1b, .Lnull_found
+
+       /* Bytes 24-31.  */
+       ldrd    data1a, data1b, [src, #24]
+       add     src, src, #32                   /* Step to the next 32-byte group; src is not read at .Lnull_found.  */
+       uadd8   data1a, data1a, const_m1        /* GE<n> set iff byte n != 0.  */
+       add     result, result, #8
+       sel     data1a, const_0, const_m1       /* Select based on GE<0:3>.  */
+       uadd8   data1b, data1b, const_m1
+       sel     data1b, data1a, const_m1        /* Only used if d1a == 0.  */
+       cmp     data1b, #0
+       beq     .Lloop_aligned                  /* No NUL in this group: keep scanning.  */
+
+.Lnull_found:
+       cmp     data1a, #0                      /* Is the NUL in the first word of the pair?  */
+       itt     eq
+       addeq   result, result, #4              /* No: count the whole first word...  */
+       moveq   data1a, data1b                  /* ...and locate the NUL using the second word's mask.  */
+#ifndef __ARMEB__
+       rev     data1a, data1a                  /* Little-endian: move the lowest-addressed byte to the top for clz.  */
+#endif
+       clz     data1a, data1a                  /* Bit position of the first 0xff mask byte.  */
+       ldrd    r4, r5, [sp], #8                /* Restore r4/r5 saved on entry.  */
+       add     result, result, data1a, lsr #3  /* Bits -> Bytes.  */
         bx      lr
  
+.Lmisaligned8:
+       ldrd    data1a, data1b, [src]           /* Load the aligned pair that contains srcin.  */
+       and     tmp2, tmp1, #3                  /* Offset of srcin within its 4-byte word.  */
+       rsb     result, tmp1, #0                /* result = -tmp1: this pair starts tmp1 bytes before srcin.  */
+       lsl     tmp2, tmp2, #3                  /* Bytes -> bits.  */
+       tst     tmp1, #4                        /* NE if srcin lies in the second word of the pair.  */
+       pld     [src, #64]
+       S2HI    tmp2, const_m1, tmp2            /* tmp2 = all ones with the bytes that precede srcin cleared.  */
+       orn     data1a, data1a, tmp2            /* OR with ~tmp2: force those leading bytes non-zero.  */
+       itt     ne                              /* Condition comes from the tst above.  */
+       ornne   data1b, data1b, tmp2            /* Start is in the second word: mask its leading bytes...  */
+       movne   data1a, const_m1                /* ...and make the entire first word non-zero.  */
+       mov     const_0, #0                     /* tmp1 is finished with; r4 now serves as the zero constant.  */
+       b       .Lstart_realigned
+       .size   strlen, . - strlen
+
  #endif
author	Will Newton <will.newton@linaro.org>
	Fri, 21 Jun 2013 09:10:37 +0000 (09:10 +0000)
committer	Will Newton <will.newton@linaro.org>
	Fri, 21 Jun 2013 09:10:37 +0000 (09:10 +0000)
newlib/ChangeLog		patch \| blob \| blame \| history
newlib/libc/machine/arm/strlen-armv7.S		patch \| blob \| blame \| history