rawmemchr
Eric Blake
ebb9@byu.net
Thu May 22 00:04:00 GMT 2008
Eric Blake <ebb9@byu.net> writes:
> > Maybe you could show a few timing measurements from realistic testcases at
> > this point in the discussion? I haven't felt comfortable trying to directly
> > infer from instruction counts to real-time clock cycles any time in the past
> > two decades myself; there are so many confounding factors these days that
> > I'm not even confident of being able to validly reason about it any more.
>
> Here's my test program:
I modified my test program to also measure strlen (in mystrchr, replace
the !c branch with "return (char *) str + strlen (str);"; a sketch of the
modified function follows the numbers below). With strlen's current assembly
implementation of 'repnz scasb' as the inner loop, modern x86 machines have
HIDEOUS performance:
$ time ./foo 1000000 1 0 1000 0 1
real 0m2.859s
user 0m2.874s
sys 0m0.015s
$ time ./foo 1000000 1 1 1000 0 1
real 0m2.875s
user 0m2.874s
sys 0m0.030s
That is 3x worse than the current strchr on unaligned data, but at least
alignment no longer mattered.
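For reference, here is a minimal sketch of the modified mystrchr; the real
test harness was posted earlier in the thread, so everything around the
strlen call here is a hypothetical stand-in:

  #include <string.h>
  #include <stddef.h>

  /* Sketch: the !c branch now defers to strlen, so the timing loop
     exercises strlen's inner loop instead of a byte-wise NUL scan.  */
  char *
  mystrchr (const char *str, int c)
  {
    if (!c)
      return (char *) str + strlen (str);
    while (*str && *str != (char) c)
      str++;
    return *str == (char) c ? (char *) str : NULL;
  }

  int
  main (void)
  {
    char buf[] = "hello";
    return mystrchr (buf, 'l') == buf + 2 ? 0 : 1;
  }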
Finally, I implemented my assembly changes (renamed as strchr1 in my testing,
so I could still compare to the original strchr), and got much more promising
results:
$ time ./foo 1000000 1 0 1000 2 2
real 0m0.578s
user 0m0.593s
sys 0m0.015s
$ time ./foo 1000000 1 1 1000 2 2
real 0m0.562s
user 0m0.577s
sys 0m0.015s
All right - aligned searches are not noticeably worse than in the original
implementation, and unaligned searches are now comparable in time to aligned
searches, rather than 40% slower.
$ time ./foo 1000000 1 0 1000 0 0
real 0m0.329s
user 0m0.327s
sys 0m0.015s
$ time ./foo 1000000 1 1 1000 0 0
real 0m0.328s
user 0m0.343s
sys 0m0.015s
And the special case of searching for NUL is 40% faster!
Code size difference:
original strchr: 111 bytes
my strchr: 250 bytes
So, OK to apply this patch?
2008-05-21 Eric Blake <ebb9@byu.net>
Optimize strchr for x86.
* libc/machine/i386/strchr.S (strchr): Pre-align so unaligned
searches aren't penalized. Special-case searching for 0.
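In C terms, those two changes amount to roughly the following sketch (the
masks are the 32-bit constants from the assembly; names like strchr_sketch
are mine, and the patch below remains the authoritative version):

  #include <assert.h>
  #include <stddef.h>
  #include <stdint.h>

  char *
  strchr_sketch (const char *s, int c_in)
  {
    unsigned char c = (unsigned char) c_in;
    uint32_t ones = 0x01010101u;    /* 16843009 in the assembly */
    uint32_t highs = 0x80808080u;   /* -2139062144 as a signed value */
    uint32_t mask = c * ones;       /* desired byte repeated 4 times */

    /* Pre-align: byte-wise checks until s is 4-byte aligned, so
       unaligned searches are not penalized.  */
    while ((uintptr_t) s & 3)
      {
        if (*s == (char) c)
          return (char *) s;
        if (*s == 0)
          return NULL;
        s++;
      }

    /* Word-wise loop: stop once the word holds a NUL byte or c.  The
       aligned over-read past the terminator mirrors what the assembly
       does.  (When c == 0 the two tests collapse into one, which is
       the separate special-cased loop in the assembly.)  */
    for (;; s += 4)
      {
        uint32_t w = *(const uint32_t *) s;
        if ((w - ones) & ~w & highs)
          break;                    /* some byte is 0 */
        if (((w ^ mask) - ones) & ~(w ^ mask) & highs)
          break;                    /* some byte is c */
      }

    /* Finish byte-wise over the final word, as the L9/L15 loop does.  */
    while (*s && *s != (char) c)
      s++;
    return *s == (char) c ? (char *) s : NULL;
  }

  int
  main (void)
  {
    char buf[] = "xhello";
    assert (strchr_sketch (buf + 1, 'l') == buf + 3);  /* unaligned start */
    assert (strchr_sketch (buf + 1, 0) == buf + 6);    /* NUL special case */
    assert (strchr_sketch (buf + 1, 'z') == NULL);
    return 0;
  }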
Index: libc/machine/i386/strchr.S
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/i386/strchr.S,v
retrieving revision 1.3
diff -u -p -r1.3 strchr.S
--- libc/machine/i386/strchr.S 20 Dec 2002 21:31:19 -0000 1.3
+++ libc/machine/i386/strchr.S 21 May 2008 17:46:22 -0000
@@ -1,6 +1,6 @@
/*
* ====================================================
- * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
+ * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
*
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
@@ -9,7 +9,7 @@
*/
#include "i386mach.h"
-
+
.global SYM (strchr)
SOTYPE_FUNCTION(strchr)
@@ -21,14 +21,45 @@ SYM (strchr):
pushl ebx
xorl ebx,ebx
movl 8(ebp),edi
- movb 12(ebp),bl
+ addb 12(ebp),bl
+
+#ifndef __OPTIMIZE_SIZE__
+/* Special case strchr(p,0). */
+ je L25
-#ifndef __OPTIMIZE_SIZE__
-/* check if string is aligned, if not do check one byte at a time */
+/* Do byte-wise checks until string is aligned. */
test $3,edi
- jne L9
+ je L5
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L14
+ cmpb bl,cl
+ je L19
+ incl edi
+
+ test $3,edi
+ je L5
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L14
+ cmpb bl,cl
+ je L19
+ incl edi
+
+ test $3,edi
+ je L5
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L14
+ cmpb bl,cl
+ je L19
+ incl edi
/* create 4 byte mask which is just the desired byte repeated 4 times */
+L5:
movl ebx,ecx
sall $8,ebx
subl $4,edi
@@ -49,15 +80,14 @@ L10:
testl $-2139062144,edx
jne L9
- movl ebx,eax
- xorl ecx,eax
- leal -16843009(eax),edx
- notl eax
- andl eax,edx
+ xorl ebx,ecx
+ leal -16843009(ecx),edx
+ notl ecx
+ andl ecx,edx
testl $-2139062144,edx
je L10
#endif /* not __OPTIMIZE_SIZE__ */
-
+
/* loop while (*s && *s++ != c) */
L9:
leal -1(edi),eax
@@ -69,7 +99,7 @@ L15:
je L14
cmpb bl,dl
jne L15
-
+
L14:
/* if (*s == c) return address otherwise return NULL */
cmpb bl,(eax)
@@ -83,3 +113,60 @@ L19:
leave
ret
+#ifndef __OPTIMIZE_SIZE__
+/* Special case strchr(p,0). */
+#if 0
+ /* Hideous performance on modern machines. */
+L25:
+ cld
+ movl $-1,ecx
+ xor eax,eax
+ repnz
+ scasb
+ leal -1(edi),eax
+ jmp L19
+#endif
+L25:
+/* Do byte-wise checks until string is aligned. */
+ test $3,edi
+ je L26
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L19
+ incl edi
+
+ test $3,edi
+ je L26
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L19
+ incl edi
+
+ test $3,edi
+ je L26
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L19
+ incl edi
+
+L26:
+ subl $4,edi
+
+/* loop performing 4 byte mask checking for desired 0 byte */
+ .p2align 4,,7
+L27:
+ addl $4,edi
+ movl (edi),ecx
+ leal -16843009(ecx),edx
+ movl ecx,eax
+ notl eax
+ andl eax,edx
+ testl $-2139062144,edx
+ je L27
+
+ jmp L9
+
+#endif /* !__OPTIMIZE_SIZE__ */
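For anyone decoding the patch: the addb into the zeroed bl both loads the
search character and sets ZF, so "je L25" catches strchr(p,0) up front; and
-16843009 is -0x01010101 while -2139062144 is 0x80808080 viewed as a signed
32-bit value, so the new L27 loop is the usual zero-byte-in-word trick. A
small standalone check of that identity (illustration only, not part of the
patch):

  #include <assert.h>
  #include <stdint.h>

  /* Nonzero iff some byte of w is zero: subtracting 1 from each byte
     borrows through a 0x00 byte and sets its high bit, and ~w filters
     out bytes whose high bit was already set.  */
  static int
  haszero (uint32_t w)
  {
    return ((w - 0x01010101u) & ~w & 0x80808080u) != 0;
  }

  int
  main (void)
  {
    assert (haszero (0x41004242u));    /* contains a 0x00 byte */
    assert (!haszero (0x41424344u));   /* "ABCD": no zero byte */
    assert (haszero (0x00000000u));
    assert (!haszero (0x80808080u));   /* high bits alone don't count */
    return 0;
  }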