This is the mail archive of the
libc-ports@sources.redhat.com
mailing list for the libc-ports project.
[PATCH 23/26] arm: Rewrite armv6t2 memchr with uqadd8
- From: Richard Henderson <rth at twiddle dot net>
- To: libc-ports at sourceware dot org
- Cc: Joseph Myers <joseph at codesourcery dot com>
- Date: Tue, 26 Feb 2013 19:16:23 -0800
- Subject: [PATCH 23/26] arm: Rewrite armv6t2 memchr with uqadd8
- References: <1361934986-17018-1-git-send-email-rth@twiddle.net>
Not recently speed tested vs the Linaro version, but having a
common set of algorithms for all the *chr routines is surely
worth something for maintenance. But I do recall it was once
a few percent faster on A8.
---
* sysdeps/arm/armv6t2/memchr.S: Rewrite.
---
ports/sysdeps/arm/armv6t2/memchr.S | 216 +++++++++++++++++--------------------
1 file changed, 101 insertions(+), 115 deletions(-)
diff --git a/ports/sysdeps/arm/armv6t2/memchr.S b/ports/sysdeps/arm/armv6t2/memchr.S
index 6d35f47..1739a4c 100644
--- a/ports/sysdeps/arm/armv6t2/memchr.S
+++ b/ports/sysdeps/arm/armv6t2/memchr.S
@@ -22,142 +22,128 @@
@ and ARMv6T2 processors. It has a fast path for short sizes, and has an
@ optimised path for large data sets; the worst case is finding the match early
@ in a large data set.
-@ Note: The use of cbz/cbnz means it's Thumb only
-
-@ 2011-07-15 david.gilbert@linaro.org
-@ Copy from Cortex strings release 21 and change license
-@ http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/view/head:/src/linaro-a9/memchr.S
-@ Change function declarations/entry/exit
-@ 2011-12-01 david.gilbert@linaro.org
-@ Add some fixes from comments received (including use of ldrd instead ldm)
-@ 2011-12-07 david.gilbert@linaro.org
-@ Removed cbz from align loop - can't be taken
-
-@ this lets us check a flag in a 00/ff byte easily in either endianness
-#ifdef __ARMEB__
-#define CHARTSTMASK(c) 1<<(31-(c*8))
-#else
-#define CHARTSTMASK(c) 1<<(c*8)
-#endif
- .syntax unified
+ .syntax unified
.text
- .thumb
-@ ---------------------------------------------------------------------------
- .thumb_func
- .global memchr
- .type memchr,%function
ENTRY(memchr)
@ r0 = start of memory to scan
@ r1 = character to look for
@ r2 = length
@ returns r0 = pointer to character or NULL if not found
- and r1,r1,#0xff @ Don't think we can trust the caller to actually pass a char
-
- cmp r2,#16 @ If it's short don't bother with anything clever
- blt 20f
- tst r0, #7 @ If it's already aligned skip the next bit
- beq 10f
+ uxtb r1, r1
+ cmp r2, #16 @ Is the buffer too short?
+ blo .Lbuf_small
+ tst r0, #7 @ Is the buffer already aligned?
+ beq .Lbuf_aligned
@ Work up to an aligned point
-5:
- ldrb r3, [r0],#1
- subs r2, r2, #1
- cmp r3, r1
- beq 50f @ If it matches exit found
- tst r0, #7
- bne 5b @ If not aligned yet then do next byte
-
-10:
- @ At this point, we are aligned, we know we have at least 8 bytes to work with
- push {r4,r5,r6,r7}
- cfi_adjust_cfa_offset (16)
+0: ldrb r3, [r0], #1
+ s(sub) r2, r2, #1
+ cmp r3, r1 @ If found, adjust and return.
+ beq .Lfound_minus1
+ tst r0, #7 @ If not yet aligned, loop
+ bne 0b
+
+.Lbuf_aligned:
+ @ Here, we are aligned and we have at least 8 bytes to work with.
+ push { r4, r5 }
+ cfi_adjust_cfa_offset (8)
cfi_rel_offset (r4, 0)
cfi_rel_offset (r5, 4)
- cfi_rel_offset (r6, 8)
- cfi_rel_offset (r7, 12)
- cfi_remember_state
-
- orr r1, r1, r1, lsl #8 @ expand the match word across to all bytes
+ orr r1, r1, r1, lsl #8 @ Replicate C to all bytes
+ movw ip, #0xfefe
orr r1, r1, r1, lsl #16
- bic r4, r2, #7 @ Number of double words to work with * 8
- mvns r7, #0 @ all F's
- movs r3, #0
-
-15:
- ldrd r5,r6, [r0],#8
- subs r4, r4, #8
- eor r5,r5, r1 @ Get it so that r5,r6 have 00's where the bytes match the target
- eor r6,r6, r1
- uadd8 r5, r5, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
- sel r5, r3, r7 @ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
- uadd8 r6, r6, r7 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
- sel r6, r5, r7 @ chained....bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION
- cbnz r6, 60f
- bne 15b @ (Flags from the subs above) If not run out of bytes then go around again
-
- pop {r4,r5,r6,r7}
- cfi_adjust_cfa_offset (-16)
+ movt ip, #0xfefe
+
+1: ldrd r4, r5, [r0], #8
+ s(eor) r4, r4, r1 @ Convert C's to zeros
+ s(eor) r5, r5, r1
+ @ Adding (unsigned saturating) 0xfe means result of 0xfe for any byte
+ @ that was originally zero and 0xff otherwise. Therefore we consider
+ @ the lsb of each byte the "found" bit, with 0 for a match.
+ uqadd8 r4, r4, ip
+ uqadd8 r5, r5, ip
+ cmp r2, #8 @ Have we still a full 8 bytes left?
+ blo .Lbuf_aligned_finish
+ s(and) r5, r4, r4 @ Combine found bits between words.
+ mvns r5, r5 @ Match within either word?
+ beq 1b
+
+ @ Here, we've found a match. Disambiguate 1st or 2nd word.
+ mvns r4, r4
+ itee ne
+ subne r0, r0, #8 @ Dec pointer to 1st word
+ subeq r0, r0, #4 @ Dec pointer to 2nd word
+ moveq r4, r5 @ Copy found bits from 2nd word.
+
+.Lfind_word:
+ @ Here we've found a match.
+ @ r0 = pointer to word containing the match
+ @ r4 = found bits for the word.
+#ifdef __ARMEL__
+ rbit r4, r4 @ For LE we need count-trailing-zeros
+#endif
+ clz r4, r4 @ Find the bit offset of the match.
+ add r0, r0, r4, lsr #3 @ Adjust the pointer to the found byte
+
+ pop { r4, r5 }
+ cfi_remember_state
+ cfi_adjust_cfa_offset (-8)
cfi_restore (r4)
cfi_restore (r5)
- cfi_restore (r6)
- cfi_restore (r7)
-
- and r1,r1,#0xff @ Get r1 back to a single character from the expansion above
- and r2,r2,#7 @ Leave the count remaining as the number after the double words have been done
-
-20:
- cbz r2, 40f @ 0 length or hit the end already then not found
-
-21: @ Post aligned section, or just a short call
- ldrb r3,[r0],#1
- subs r2,r2,#1
- eor r3,r3,r1 @ r3 = 0 if match - doesn't break flags from sub
- cbz r3, 50f
- bne 21b @ on r2 flags
-
-40:
- movs r0,#0 @ not found
- DO_RET(lr)
+ bx lr
-50:
- subs r0,r0,#1 @ found
- DO_RET(lr)
-
-60: @ We're here because the fast path found a hit - now we have to track down exactly which word it was
- @ r0 points to the start of the double word after the one that was tested
- @ r5 has the 00/ff pattern for the first word, r6 has the chained value
cfi_restore_state
- cmp r5, #0
- itte eq
- moveq r5, r6 @ the end is in the 2nd word
- subeq r0,r0,#3 @ Points to 2nd byte of 2nd word
- subne r0,r0,#7 @ or 2nd byte of 1st word
-
- @ r0 currently points to the 2nd byte of the word containing the hit
- tst r5, # CHARTSTMASK(0) @ 1st character
- bne 61f
- adds r0,r0,#1
- tst r5, # CHARTSTMASK(1) @ 2nd character
- ittt eq
- addeq r0,r0,#1
- tsteq r5, # (3<<15) @ 2nd & 3rd character
- @ If not the 3rd must be the last one
- addeq r0,r0,#1
-
-61:
- pop {r4,r5,r6,r7}
- cfi_adjust_cfa_offset (-16)
+.Lbuf_aligned_finish:
+ @ Here we've read and computed found bits for 8 bytes, but not
+ @ all of those bytes are within the buffer. Determine which
+ @ found bytes are really valid.
+ s(sub) r0, r0, #8 @ Dec pointer to the 1st word
+ cmp r2, #4 @ Do we have at least 4 bytes left?
+ blo 1f
+ mvns r4, r4 @ Match within the 1st word?
+ bne .Lfind_word
+ s(add) r0, r0, #4 @ Inc pointer to the 2nd word
+ s(mvn) r4, r5 @ Copy found bits from 2nd word
+ s(sub) r2, r2, #4 @ Bytes remaining in 2nd word
+1:
+ lsls r2, r2, #3 @ Convert remaining to bits
+ bne 2f @ No bytes remaining?
+ mvn r3, #0
+#ifdef __ARMEL__
+ s(lsl) r3, r3, r2 @ Mask with 1s covering invalid bytes
+#else
+ s(lsr) r3, r3, r2
+#endif
+ bics r4, r4, r3 @ Clear found past end of buffer
+ bne .Lfind_word
+2:
+ s(mov) r0, #0 @ No found
+ pop { r4, r5 }
+ cfi_adjust_cfa_offset (-8)
cfi_restore (r4)
cfi_restore (r5)
- cfi_restore (r6)
- cfi_restore (r7)
-
- subs r0,r0,#1
- DO_RET(lr)
+ bx lr
+
+.Lbuf_small:
+ @ Here we've a small buffer to be searched a byte at a time.
+0: ldrb r3, [r0], #1
+ cmp r3, r1 @ If found, adjust and return.
+ beq .Lfound_minus1
+ subs r2, r2, #1 @ Any bytes left?
+ bne 0b
+
+ s(mov) r0, #0 @ Not found
+ bx lr
+
+.Lfound_minus1:
+ @ Here we've found a match in a byte loop
+ @ r0 = pointer, post-incremented past the byte
+ s(sub) r0, r0, #1
+ bx lr
END(memchr)
libc_hidden_builtin_def (memchr)
--
1.8.1.2