rawmemchr

Jeff Johnston jjohnstn@redhat.com
Thu May 22 01:07:00 GMT 2008


Eric Blake wrote:
> Eric Blake <ebb9 <at> byu.net> writes:
>
>   
>>>   Maybe you could show a few timing measurements from realistic testcases at
>>> this point in the discussion?  I haven't felt comfortable trying to directly
>>> infer from instruction counts to real-time clock cycles any time in the past
>>> two decades myself; there are so many confounding factors these days that
>>> I'm not even confident of being able to reason validly about it any more.
>>>       
>> Here's my test program:
>>     
>
> I modified my test program to also check strlen numbers (in mystrchr, replace 
> the !c branch with "return (char *) str + strlen (str);").  With strlen's 
> current assembly implementation of 'repnz scasb' as the inner loop, modern x86 
> machines have HIDEOUS performance:
>
> $ time ./foo 1000000 1 0 1000 0 1
>
> real	0m2.859s
> user	0m2.874s
> sys	0m0.015s
>
> $ time ./foo 1000000 1 1 1000 0 1
>
> real	0m2.875s
> user	0m2.874s
> sys	0m0.030s
>
> 3x worse than current strchr on unaligned data, but at least alignment didn't 
> matter.
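
For reference, the strlen-based variant Eric describes above amounts to roughly
the following.  This is a sketch only: mystrchr and the surrounding harness are
from the test program posted earlier in the thread and are not reproduced here.

    #include <string.h>

    char *
    mystrchr (const char *str, int c)
    {
      /* The "!c branch": strchr (s, 0) must return a pointer to the
         terminating NUL, i.e. s + strlen (s).  The timings above exercise
         this path with newlib's current 'repnz scasb' strlen.  */
      if (!c)
        return (char *) str + strlen (str);

      /* Plain byte-wise search otherwise.  */
      while (*str && *str != (char) c)
        str++;
      return *str == (char) c ? (char *) str : NULL;
    }
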
>
> Finally, I implemented my assembly changes (renamed as strchr1 in my testing, 
> so I could still compare to the original strchr), and got much more promising 
> results:
>
>
> $ time ./foo 1000000 1 0 1000 2 2
>
> real	0m0.578s
> user	0m0.593s
> sys	0m0.015s
>
> $ time ./foo 1000000 1 1 1000 2 2
>
> real	0m0.562s
> user	0m0.577s
> sys	0m0.015s
>
> All right - aligned searches are not noticeably worse than the original 
> implementation, and unaligned searches are now comparable in time to aligned 
> searches, rather than 40% slower.
>
> $ time ./foo 1000000 1 0 1000 0 0
>
> real	0m0.329s
> user	0m0.327s
> sys	0m0.015s
>
> $ time ./foo 1000000 1 1 1000 0 0
>
> real	0m0.328s
> user	0m0.343s
> sys	0m0.015s
>
> And the special case of searching for NUL is 40% faster!
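
The NUL special case is faster because each 32-bit word needs only the classic
zero-byte test, with no xor against a replicated-character mask.  Roughly, in C
(0x01010101 is the 16843009 and 0x80808080 the -2139062144 that appear as
immediates in the assembly below):

    #include <stdint.h>

    /* A 32-bit word w contains a zero byte exactly when
       ((w - 0x01010101) & ~w & 0x80808080) is nonzero.  */
    static int
    has_zero_byte (uint32_t w)
    {
      return ((w - 0x01010101U) & ~w & 0x80808080U) != 0;
    }

When searching for a nonzero character, the word loop has to apply this test
twice per word: once to the word itself (for the terminating NUL) and once to
the word xor'ed with the character repeated in every byte, so it does roughly
twice the arithmetic per word.
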
>
> Code size difference:
>
> original strchr: 111 bytes
> my strchr: 250 bytes
>
> So, OK to apply this patch?
>
>   
Yes, go ahead.  I assume you have verified that the function still works 
correctly.

-- Jeff J.
> 2008-05-21  Eric Blake  <ebb9@byu.net>
>
> 	Optimize strchr for x86.
> 	* libc/machine/i386/strchr.S (strchr): Pre-align so unaligned
> 	searches aren't penalized.  Special-case searching for 0.
>
>
> Index: libc/machine/i386/strchr.S
> ===================================================================
> RCS file: /cvs/src/src/newlib/libc/machine/i386/strchr.S,v
> retrieving revision 1.3
> diff -u -p -r1.3 strchr.S
> --- libc/machine/i386/strchr.S	20 Dec 2002 21:31:19 -0000	1.3
> +++ libc/machine/i386/strchr.S	21 May 2008 17:46:22 -0000
> @@ -1,6 +1,6 @@
>  /*
>   * ====================================================
> - * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
> + * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
>   *
>   * Permission to use, copy, modify, and distribute this
>   * software is freely granted, provided that this notice
> @@ -9,7 +9,7 @@
>   */
>  
>  	#include "i386mach.h"
> -	
> +
>  	.global SYM (strchr)
>         SOTYPE_FUNCTION(strchr)
>  
> @@ -21,14 +21,45 @@ SYM (strchr):
>  	pushl ebx
>  	xorl ebx,ebx
>  	movl 8(ebp),edi
> -	movb 12(ebp),bl
> +	addb 12(ebp),bl
> +
> +#ifndef __OPTIMIZE_SIZE__
> +/* Special case strchr(p,0).  */
> +	je L25
>  
> -#ifndef __OPTIMIZE_SIZE__	
> -/* check if string is aligned, if not do check one byte at a time */
> +/* Do byte-wise checks until string is aligned.  */
>  	test $3,edi
> -	jne L9
> +	je L5
> +	movl edi,eax
> +	movb (eax),cl
> +	testb cl,cl
> +	je L14
> +	cmpb bl,cl
> +	je L19
> +	incl edi
> +
> +	test $3,edi
> +	je L5
> +	movl edi,eax
> +	movb (eax),cl
> +	testb cl,cl
> +	je L14
> +	cmpb bl,cl
> +	je L19
> +	incl edi
> +
> +	test $3,edi
> +	je L5
> +	movl edi,eax
> +	movb (eax),cl
> +	testb cl,cl
> +	je L14
> +	cmpb bl,cl
> +	je L19
> +	incl edi
>  
>  /* create 4 byte mask which is just the desired byte repeated 4 times */
> +L5:
>  	movl ebx,ecx
>  	sall $8,ebx
>  	subl $4,edi
> @@ -49,15 +80,14 @@ L10:
>  	testl $-2139062144,edx
>  	jne L9
>  
> -	movl ebx,eax
> -	xorl ecx,eax
> -	leal -16843009(eax),edx
> -	notl eax
> -	andl eax,edx
> +	xorl ebx,ecx
> +	leal -16843009(ecx),edx
> +	notl ecx
> +	andl ecx,edx
>  	testl $-2139062144,edx
>  	je L10
>  #endif /* not __OPTIMIZE_SIZE__ */
> -		
> +
>  /* loop while (*s && *s++ != c) */
>  L9:
>  	leal -1(edi),eax
> @@ -69,7 +99,7 @@ L15:
>  	je L14
>  	cmpb bl,dl
>  	jne L15
> -	
> +
>  L14:
>  /*  if (*s == c) return address otherwise return NULL */
>  	cmpb bl,(eax)
> @@ -83,3 +113,60 @@ L19:
>  	leave
>  	ret
>  
> +#ifndef __OPTIMIZE_SIZE__
> +/* Special case strchr(p,0).  */
> +#if 0
> +	/* Hideous performance on modern machines.  */
> +L25:
> +	cld
> +	movl $-1,ecx
> +	xor eax,eax
> +	repnz
> +	scasb
> +	leal -1(edi),eax
> +	jmp L19
> +#endif
> +L25:
> +/* Do byte-wise checks until string is aligned.  */
> +	test $3,edi
> +	je L26
> +	movl edi,eax
> +	movb (eax),cl
> +	testb cl,cl
> +	je L19
> +	incl edi
> +
> +	test $3,edi
> +	je L26
> +	movl edi,eax
> +	movb (eax),cl
> +	testb cl,cl
> +	je L19
> +	incl edi
> +
> +	test $3,edi
> +	je L26
> +	movl edi,eax
> +	movb (eax),cl
> +	testb cl,cl
> +	je L19
> +	incl edi
> +
> +L26:
> +	subl $4,edi
> +
> +/* loop performing 4 byte mask checking for desired 0 byte */
> +	.p2align 4,,7
> +L27:
> +	addl $4,edi
> +	movl (edi),ecx
> +	leal -16843009(ecx),edx
> +	movl ecx,eax
> +	notl eax
> +	andl eax,edx
> +	testl $-2139062144,edx
> +	je L27
> +
> +	jmp L9
> +
> +#endif /* !__OPTIMIZE_SIZE__ */
>
>
>   
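
For readers following the assembly, the overall shape of the patched routine is
roughly the C below.  This is an illustrative sketch only: the names are
invented here, the real code unrolls the alignment prologue, and it keeps a
separate, leaner word loop for the c == 0 case that performs just the NUL test.

    #include <stddef.h>
    #include <stdint.h>

    char *
    strchr_sketch (const char *s, int c)
    {
      unsigned char ch = (unsigned char) c;
      uint32_t mask = ch * 0x01010101U;    /* desired byte repeated 4 times */

      /* Byte-wise checks until the pointer is 4-byte aligned, so unaligned
         inputs are no longer penalized.  */
      while ((uintptr_t) s & 3)
        {
          if (*s == (char) ch)
            return (char *) s;
          if (*s == '\0')
            return NULL;
          s++;
        }

      /* Word-at-a-time scan: stop as soon as a word might contain either a
         NUL byte or the target byte, and let the byte-wise loop decide.  */
      for (;;)
        {
          uint32_t w = *(const uint32_t *) s;  /* aligned by the loop above */
          uint32_t x = w ^ mask;               /* target byte becomes 0 here */
          if (((w - 0x01010101U) & ~w & 0x80808080U)
              || ((x - 0x01010101U) & ~x & 0x80808080U))
            break;
          s += 4;
        }

      /* Byte-wise finish, as in the original code.  */
      while (*s && *s != (char) ch)
        s++;
      return *s == (char) ch ? (char *) s : NULL;
    }

The growth from 111 to 250 bytes quoted above comes largely from the unrolled
alignment prologue and the duplicated word loop for the NUL case.
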


