This is the mail archive of the mailing list for the newlib project.

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] strlen-armv7.S: Import latest strlen code.

Import the latest version of strlen from the Linaro cortex-strings
package. This version is faster across a variety of block sizes and
alignments on ARMv7.


2013-06-13  Will Newton  <>

	* libc/machine/arm/strlen-armv7.S: Import latest strlen
	code from Linaro cortex-strings.
 newlib/libc/machine/arm/strlen-armv7.S | 196 +++++++++++++++++++--------------
 1 file changed, 113 insertions(+), 83 deletions(-)

diff --git a/newlib/libc/machine/arm/strlen-armv7.S b/newlib/libc/machine/arm/strlen-armv7.S
index d6e2831..1aa51c9 100644
--- a/newlib/libc/machine/arm/strlen-armv7.S
+++ b/newlib/libc/machine/arm/strlen-armv7.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2011, Linaro Limited
+/* Copyright (c) 2010-2011,2013 Linaro Limited
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
@@ -28,100 +28,130 @@

-   Written by Dave Gilbert <>
-   This strlen routine is optimised on a Cortex-A9 and should work on
-   all ARMv7 processors.   This routine is reasonably fast for short
-   strings, but is probably slower than a simple implementation if all
-   your strings are very short */
-@ 2011-02-08
-@    Extracted from local git 6848613a
-@ 2011-10-13
-@    Extracted from cortex-strings bzr rev 63
-@      Integrate to newlib, flip to ldrd
-@      Pull in Endian macro from my memchr
+   Assumes:
+   ARMv6T2, AArch32
+ */

 #include "arm_asm.h"

-@ NOTE: This ifdef MUST match the ones in arm/strlen.c
-@ We fallback to the one in arm/strlen.c for size optimised or
-@ for older arch's
+/* NOTE: This ifdef MUST match the ones in arm/strlen.c
+   We fall back to the one in arm/strlen.c for size optimised or
+   for older architectures. */
 #if defined(_ISA_ARM_7) || defined(__ARM_ARCH_6T2__) && \
     !(defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
       (defined (__thumb__) && !defined (__thumb2__)))

-@ this lets us check a flag in a 00/ff byte easily in either endianness
+	.macro def_fn f p2align=0
+	.text
+	.p2align \p2align
+	.global \f
+	.type \f, %function
+	.endm
 #ifdef __ARMEB__
-#define CHARTSTMASK(c) 1<<(31-(c*8))
+#define S2LO		lsl
+#define S2HI		lsr
-#define CHARTSTMASK(c) 1<<(c*8)
+#define S2LO		lsr
+#define S2HI		lsl

+	/* This code requires Thumb.  */
+	.thumb
 	.syntax unified
-	.arch armv7-a
-	.thumb_func
-	.align 2
-	.p2align 4,,15
-	.global strlen
-	.type strlen,%function
-	@ r0 = string
-	@ returns count of bytes in string not including terminator
-	mov	r1, r0
-	push	{ r4,r6 }
-	mvns	r6, #0		@ all F
-	movs	r4, #0
-	tst	r0, #7
-	beq	2f
-	ldrb	r2, [r1], #1
-	tst	r1, #7		@ Hit alignment yet?
-	cbz	r2, 10f		@ Exit if we found the 0
-	bne	1b
-	@ So we're now aligned
-	ldrd    r2,r3,[r1],#8
-	uadd8	r2, r2, r6	@ Par add 0xff - sets the GE bits for bytes!=0
-	sel	r2, r4, r6	@ bytes are 00 for none-00 bytes,
-				@ or ff for 00 bytes - NOTE INVERSION
-	uadd8	r3, r3, r6	@ Par add 0xff - sets the GE bits for bytes!=0
-	sel	r3, r2, r6	@ chained...bytes are 00 for none-00 bytes,
-				@ or ff for 00 bytes - NOTE INVERSION
-	cmp	r3, #0
-	beq	2b
-	@ One (or more) of the bytes we loaded was 0 - but which one?
-	@ r2 has the mask corresponding to the first loaded word
-	@ r3 has a combined mask of the two words - but if r2 was all-non 0
-	@ then it's just the 2nd words
-	cmp	r2, #0
-	itte	eq
-	moveq	r2, r3		@ the end is in the 2nd word
-	subeq	r1,r1,#3
-	subne	r1,r1,#7
-	@ r1 currently points to the 2nd byte of the word containing the 0
-	tst	r2, # CHARTSTMASK(0)	@ 1st character
-	bne	10f
-	adds	r1,r1,#1
-	tst	r2, # CHARTSTMASK(1)	@ 2nd character
-	ittt	eq
-	addeq	r1,r1,#1
-	tsteq	r2, # (3<<15)	@ 2nd & 3rd character
-	@ If not the 3rd must be the last one
-	addeq	r1,r1,#1
-	@ r0 is still at the beginning, r1 is pointing 1 byte after the nul
-	sub	r0, r1, r0
-	subs	r0, r0, #1
-	pop	{ r4, r6 }
+/* Parameters and result.  */
+#define srcin		r0
+#define result		r0
+/* Internal variables.  */
+#define src		r1
+#define data1a		r2
+#define data1b		r3
+#define const_m1	r12
+#define const_0		r4
+#define tmp1		r4		/* Overlaps const_0  */
+#define tmp2		r5
+def_fn	strlen p2align=6
+	pld	[srcin, #0]
+	strd	r4, r5, [sp, #-8]!
+	bic	src, srcin, #7
+	mvn	const_m1, #0
+	ands	tmp1, srcin, #7		/* (8 - bytes) to alignment.  */
+	pld	[src, #32]
+	bne.w	.Lmisaligned8
+	mov	const_0, #0
+	mov	result, #-8
+	/* Bytes 0-7.  */
+	ldrd	data1a, data1b, [src]
+	pld	[src, #64]
+	add	result, result, #8
+	uadd8	data1a, data1a, const_m1	/* Per-byte add of 0xff: GE<0:3> set for non-zero bytes.  */
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
+	/* Bytes 8-15.  */
+	ldrd	data1a, data1b, [src, #8]
+	uadd8	data1a, data1a, const_m1	/* Per-byte add of 0xff: GE<0:3> set for non-zero bytes.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
+	/* Bytes 16-23.  */
+	ldrd	data1a, data1b, [src, #16]
+	uadd8	data1a, data1a, const_m1	/* Per-byte add of 0xff: GE<0:3> set for non-zero bytes.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cbnz	data1b, .Lnull_found
+	/* Bytes 24-31.  */
+	ldrd	data1a, data1b, [src, #24]
+	add	src, src, #32
+	uadd8	data1a, data1a, const_m1	/* Per-byte add of 0xff: GE<0:3> set for non-zero bytes.  */
+	add	result, result, #8
+	sel	data1a, const_0, const_m1	/* Select based on GE<0:3>.  */
+	uadd8	data1b, data1b, const_m1
+	sel	data1b, data1a, const_m1	/* Only used if d1a == 0.  */
+	cmp	data1b, #0
+	beq	.Lloop_aligned
+	cmp	data1a, #0
+	itt	eq
+	addeq	result, result, #4
+	moveq	data1a, data1b
+#ifndef __ARMEB__
+	rev	data1a, data1a
+	clz	data1a, data1a
+	ldrd	r4, r5, [sp], #8
+	add	result, result, data1a, lsr #3	/* Bits -> Bytes.  */
 	bx	lr

+	ldrd	data1a, data1b, [src]
+	and	tmp2, tmp1, #3
+	rsb	result, tmp1, #0
+	lsl	tmp2, tmp2, #3			/* Bytes -> bits.  */
+	tst	tmp1, #4
+	pld	[src, #64]
+	S2HI	tmp2, const_m1, tmp2
+	orn	data1a, data1a, tmp2
+	itt	ne
+	ornne	data1b, data1b, tmp2
+	movne	data1a, const_m1
+	mov	const_0, #0
+	b	.Lstart_realigned
+	.size	strlen, . - strlen

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]