rawmemchr
Eric Blake
ebb9@byu.net
Thu May 22 00:04:00 GMT 2008
Eric Blake <ebb9@byu.net> writes:
> > Maybe you could show a few timing measurements from realistic testcases at
> > this point in the discussion? I haven't felt comfortable trying to directly
> > infer from instruction counts to real-time clock cycles any time in the past
> > two decades myself; there are so many confounding factors these days that
> > I'm not even confident of being able to validly reason about it any more.
>
> Here's my test program:
I modified my test program to also measure strlen (in mystrchr, replace
the !c branch with "return (char *) str + strlen (str);"; a sketch of the
modified function follows the numbers below). With strlen's current assembly
implementation of 'repnz scasb' as the inner loop, modern x86 machines have
HIDEOUS performance:
$ time ./foo 1000000 1 0 1000 0 1
real 0m2.859s
user 0m2.874s
sys 0m0.015s
$ time ./foo 1000000 1 1 1000 0 1
real 0m2.875s
user 0m2.874s
sys 0m0.030s
That is 3x worse than the current strchr on unaligned data, but at least
alignment no longer mattered.
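For reference, here is a minimal sketch of the modified mystrchr; the real
test harness was posted earlier in the thread, so everything around the
strlen call here is a hypothetical stand-in:

  #include <string.h>
  #include <stddef.h>

  /* Sketch: the !c branch now defers to strlen, so the timing loop
     exercises strlen's inner loop instead of a byte-wise NUL scan.  */
  char *
  mystrchr (const char *str, int c)
  {
    if (!c)
      return (char *) str + strlen (str);
    while (*str && *str != (char) c)
      str++;
    return *str == (char) c ? (char *) str : NULL;
  }

  int
  main (void)
  {
    char buf[] = "hello";
    return mystrchr (buf, 'l') == buf + 2 ? 0 : 1;
  }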
Finally, I implemented my assembly changes (renamed as strchr1 in my testing,
so I could still compare to the original strchr), and got much more promising
results:
$ time ./foo 1000000 1 0 1000 2 2
real 0m0.578s
user 0m0.593s
sys 0m0.015s
$ time ./foo 1000000 1 1 1000 2 2
real 0m0.562s
user 0m0.577s
sys 0m0.015s
All right - aligned searches are not noticeably worse than in the original
implementation, and unaligned searches are now comparable in time to aligned
searches, rather than 40% slower.
$ time ./foo 1000000 1 0 1000 0 0
real 0m0.329s
user 0m0.327s
sys 0m0.015s
$ time ./foo 1000000 1 1 1000 0 0
real 0m0.328s
user 0m0.343s
sys 0m0.015s
And the special case of searching for NUL is 40% faster!
Code size difference:
original strchr: 111 bytes
my strchr: 250 bytes
So, OK to apply this patch?
2008-05-21 Eric Blake <ebb9@byu.net>
Optimize strchr for x86.
* libc/machine/i386/strchr.S (strchr): Pre-align so unaligned
searches aren't penalized. Special-case searching for 0.
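In C terms, those two changes amount to roughly the following sketch (the
masks are the 32-bit constants from the assembly; names like strchr_sketch
are mine, and the patch below remains the authoritative version):

  #include <assert.h>
  #include <stddef.h>
  #include <stdint.h>

  char *
  strchr_sketch (const char *s, int c_in)
  {
    unsigned char c = (unsigned char) c_in;
    uint32_t ones = 0x01010101u;    /* 16843009 in the assembly */
    uint32_t highs = 0x80808080u;   /* -2139062144 as a signed value */
    uint32_t mask = c * ones;       /* desired byte repeated 4 times */

    /* Pre-align: byte-wise checks until s is 4-byte aligned, so
       unaligned searches are not penalized.  */
    while ((uintptr_t) s & 3)
      {
        if (*s == (char) c)
          return (char *) s;
        if (*s == 0)
          return NULL;
        s++;
      }

    /* Word-wise loop: stop once the word holds a NUL byte or c.  The
       aligned over-read past the terminator mirrors what the assembly
       does.  (When c == 0 the two tests collapse into one, which is
       the separate special-cased loop in the assembly.)  */
    for (;; s += 4)
      {
        uint32_t w = *(const uint32_t *) s;
        if ((w - ones) & ~w & highs)
          break;                    /* some byte is 0 */
        if (((w ^ mask) - ones) & ~(w ^ mask) & highs)
          break;                    /* some byte is c */
      }

    /* Finish byte-wise over the final word, as the L9/L15 loop does.  */
    while (*s && *s != (char) c)
      s++;
    return *s == (char) c ? (char *) s : NULL;
  }

  int
  main (void)
  {
    char buf[] = "xhello";
    assert (strchr_sketch (buf + 1, 'l') == buf + 3);  /* unaligned start */
    assert (strchr_sketch (buf + 1, 0) == buf + 6);    /* NUL special case */
    assert (strchr_sketch (buf + 1, 'z') == NULL);
    return 0;
  }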
Index: libc/machine/i386/strchr.S
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/i386/strchr.S,v
retrieving revision 1.3
diff -u -p -r1.3 strchr.S
--- libc/machine/i386/strchr.S 20 Dec 2002 21:31:19 -0000 1.3
+++ libc/machine/i386/strchr.S 21 May 2008 17:46:22 -0000
@@ -1,6 +1,6 @@
/*
* ====================================================
- * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
+ * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
*
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
@@ -9,7 +9,7 @@
*/
#include "i386mach.h"
-
+
.global SYM (strchr)
SOTYPE_FUNCTION(strchr)
@@ -21,14 +21,45 @@ SYM (strchr):
pushl ebx
xorl ebx,ebx
movl 8(ebp),edi
- movb 12(ebp),bl
+ addb 12(ebp),bl
+
+#ifndef __OPTIMIZE_SIZE__
+/* Special case strchr(p,0). */
+ je L25
-#ifndef __OPTIMIZE_SIZE__
-/* check if string is aligned, if not do check one byte at a time */
+/* Do byte-wise checks until string is aligned. */
test $3,edi
- jne L9
+ je L5
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L14
+ cmpb bl,cl
+ je L19
+ incl edi
+
+ test $3,edi
+ je L5
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L14
+ cmpb bl,cl
+ je L19
+ incl edi
+
+ test $3,edi
+ je L5
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L14
+ cmpb bl,cl
+ je L19
+ incl edi
/* create 4 byte mask which is just the desired byte repeated 4 times */
+L5:
movl ebx,ecx
sall $8,ebx
subl $4,edi
@@ -49,15 +80,14 @@ L10:
testl $-2139062144,edx
jne L9
- movl ebx,eax
- xorl ecx,eax
- leal -16843009(eax),edx
- notl eax
- andl eax,edx
+ xorl ebx,ecx
+ leal -16843009(ecx),edx
+ notl ecx
+ andl ecx,edx
testl $-2139062144,edx
je L10
#endif /* not __OPTIMIZE_SIZE__ */
-
+
/* loop while (*s && *s++ != c) */
L9:
leal -1(edi),eax
@@ -69,7 +99,7 @@ L15:
je L14
cmpb bl,dl
jne L15
-
+
L14:
/* if (*s == c) return address otherwise return NULL */
cmpb bl,(eax)
@@ -83,3 +113,60 @@ L19:
leave
ret
+#ifndef __OPTIMIZE_SIZE__
+/* Special case strchr(p,0). */
+#if 0
+ /* Hideous performance on modern machines. */
+L25:
+ cld
+ movl $-1,ecx
+ xor eax,eax
+ repnz
+ scasb
+ leal -1(edi),eax
+ jmp L19
+#endif
+L25:
+/* Do byte-wise checks until string is aligned. */
+ test $3,edi
+ je L26
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L19
+ incl edi
+
+ test $3,edi
+ je L26
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L19
+ incl edi
+
+ test $3,edi
+ je L26
+ movl edi,eax
+ movb (eax),cl
+ testb cl,cl
+ je L19
+ incl edi
+
+L26:
+ subl $4,edi
+
+/* loop performing 4 byte mask checking for desired 0 byte */
+ .p2align 4,,7
+L27:
+ addl $4,edi
+ movl (edi),ecx
+ leal -16843009(ecx),edx
+ movl ecx,eax
+ notl eax
+ andl eax,edx
+ testl $-2139062144,edx
+ je L27
+
+ jmp L9
+
+#endif /* !__OPTIMIZE_SIZE__ */
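For anyone decoding the patch: the addb into the zeroed bl both loads the
search character and sets ZF, so "je L25" catches strchr(p,0) up front; and
-16843009 is -0x01010101 while -2139062144 is 0x80808080 viewed as a signed
32-bit value, so the new L27 loop is the usual zero-byte-in-word trick. A
small standalone check of that identity (illustration only, not part of the
patch):

  #include <assert.h>
  #include <stdint.h>

  /* Nonzero iff some byte of w is zero: subtracting 1 from each byte
     borrows through a 0x00 byte and sets its high bit, and ~w filters
     out bytes whose high bit was already set.  */
  static int
  haszero (uint32_t w)
  {
    return ((w - 0x01010101u) & ~w & 0x80808080u) != 0;
  }

  int
  main (void)
  {
    assert (haszero (0x41004242u));    /* contains a 0x00 byte */
    assert (!haszero (0x41424344u));   /* "ABCD": no zero byte */
    assert (haszero (0x00000000u));
    assert (!haszero (0x80808080u));   /* high bits alone don't count */
    return 0;
  }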