rawmemchr
Jeff Johnston
jjohnstn@redhat.com
Thu May 22 01:07:00 GMT 2008
Eric Blake wrote:
> Eric Blake <ebb9@byu.net> writes:
>
>
>>> Maybe you could show a few timing measurements from realistic test cases at
>>> this point in the discussion? I haven't felt comfortable trying to infer
>>> real-time clock cycles directly from instruction counts at any point in the
>>> past two decades myself; there are so many confounding factors these days
>>> that I'm not even confident of being able to reason validly about it any more.
>>>
>> Here's my test program:
>>
>
> I modified my test program to also check strlen numbers (in mystrchr, replace
> the !c branch with "return (char *) str + strlen (str);"). With strlen's
> current assembly implementation of 'repnz scasb' as the inner loop, modern x86
> machines have HIDEOUS performance:
>
> $ time ./foo 1000000 1 0 1000 0 1
>
> real 0m2.859s
> user 0m2.874s
> sys 0m0.015s
>
> $ time ./foo 1000000 1 1 1000 0 1
>
> real 0m2.875s
> user 0m2.874s
> sys 0m0.030s
>
> 3x worse than current strchr on unaligned data, but at least alignment didn't
> matter.
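>
> For reference, the strlen-based variant boils down to something like the
> following hedged C sketch (the full test program was snipped above;
> "mystrchr" stands in for the harness's local helper):
>
>     #include <string.h>
>     #include <stddef.h>
>
>     /* Hypothetical rendering of mystrchr after the change: the !c
>        branch defers to strlen, exercising its repnz scasb loop.  */
>     char *
>     mystrchr (const char *str, int c)
>     {
>       if (!c)
>         return (char *) str + strlen (str);
>       while (*str && *str != (char) c)
>         str++;
>       return *str == (char) c ? (char *) str : NULL;
>     }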
>
> Finally, I implemented my assembly changes (renamed as strchr1 in my testing,
> so I could still compare to the original strchr), and got much more promising
> results:
>
>
> $ time ./foo 1000000 1 0 1000 2 2
>
> real 0m0.578s
> user 0m0.593s
> sys 0m0.015s
>
> $ time ./foo 1000000 1 1 1000 2 2
>
> real 0m0.562s
> user 0m0.577s
> sys 0m0.015s
>
> All right - aligned searches are not noticeably worse than the original
> implementation, and unaligned searches are now comparable in time to aligned
> searches, rather than 40% slower.
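>
> The 4-byte inner loop relies on the usual zero-in-word bit trick; in C it
> is roughly the following (a sketch only, with constants matching the
> 16843009 == 0x01010101 and -2139062144 == 0x80808080 literals in the
> assembly):
>
>     #include <stdint.h>
>
>     /* Nonzero iff some byte of w is 0x00; no false positives thanks
>        to the ~w term.  */
>     static int
>     has_zero_byte (uint32_t w)
>     {
>       return ((w - 0x01010101U) & ~w & 0x80808080U) != 0;
>     }
>
>     /* Nonzero iff some byte of w equals the byte replicated in mask:
>        XOR turns a matching byte into 0x00, reducing to the test above.  */
>     static int
>     has_byte (uint32_t w, uint32_t mask)
>     {
>       return has_zero_byte (w ^ mask);
>     }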
>
> $ time ./foo 1000000 1 0 1000 0 0
>
> real 0m0.329s
> user 0m0.327s
> sys 0m0.015s
>
> $ time ./foo 1000000 1 1 1000 0 0
>
> real 0m0.328s
> user 0m0.343s
> sys 0m0.015s
>
> And the special case of searching for NUL is 40% faster!
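>
> In C terms, the new NUL fast path looks roughly like this (a sketch;
> "find_nul" is a made-up name, and the real assembly below falls back to
> the byte loop at label L9 to pin down the final address):
>
>     #include <stdint.h>
>
>     static char *
>     find_nul (const char *s)
>     {
>       /* Byte-wise until 4-byte aligned; the assembly unrolls this
>          at-most-3-iteration loop.  */
>       while ((uintptr_t) s & 3)
>         {
>           if (*s == '\0')
>             return (char *) s;
>           s++;
>         }
>       /* Word-at-a-time scan until some word contains a zero byte.  */
>       for (;;)
>         {
>           uint32_t w = *(const uint32_t *) s;
>           if ((w - 0x01010101U) & ~w & 0x80808080U)
>             break;
>           s += 4;
>         }
>       while (*s)   /* locate the exact terminating byte */
>         s++;
>       return (char *) s;
>     }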
>
> Code size difference:
>
> original strchr: 111 bytes
> my strchr: 250 bytes
>
> So, OK to apply this patch?
>
>
Yes, go ahead. I assume you have verified that the function still works
correctly.
-- Jeff J.
> 2008-05-21 Eric Blake <ebb9@byu.net>
>
> Optimize strchr for x86.
> * libc/machine/i386/strchr.S (strchr): Pre-align so unaligned
> searches aren't penalized. Special-case searching for 0.
>
>
> Index: libc/machine/i386/strchr.S
> ===================================================================
> RCS file: /cvs/src/src/newlib/libc/machine/i386/strchr.S,v
> retrieving revision 1.3
> diff -u -p -r1.3 strchr.S
> --- libc/machine/i386/strchr.S 20 Dec 2002 21:31:19 -0000 1.3
> +++ libc/machine/i386/strchr.S 21 May 2008 17:46:22 -0000
> @@ -1,6 +1,6 @@
> /*
> * ====================================================
> - * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
> + * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
> *
> * Permission to use, copy, modify, and distribute this
> * software is freely granted, provided that this notice
> @@ -9,7 +9,7 @@
> */
>
> #include "i386mach.h"
> -
> +
> .global SYM (strchr)
> SOTYPE_FUNCTION(strchr)
>
> @@ -21,14 +21,45 @@ SYM (strchr):
> pushl ebx
> xorl ebx,ebx
> movl 8(ebp),edi
> - movb 12(ebp),bl
> + addb 12(ebp),bl
> +
> +#ifndef __OPTIMIZE_SIZE__
> +/* Special case strchr(p,0). */
> + je L25
>
> -#ifndef __OPTIMIZE_SIZE__
> -/* check if string is aligned, if not do check one byte at a time */
> +/* Do byte-wise checks until string is aligned. */
> test $3,edi
> - jne L9
> + je L5
> + movl edi,eax
> + movb (eax),cl
> + testb cl,cl
> + je L14
> + cmpb bl,cl
> + je L19
> + incl edi
> +
> + test $3,edi
> + je L5
> + movl edi,eax
> + movb (eax),cl
> + testb cl,cl
> + je L14
> + cmpb bl,cl
> + je L19
> + incl edi
> +
> + test $3,edi
> + je L5
> + movl edi,eax
> + movb (eax),cl
> + testb cl,cl
> + je L14
> + cmpb bl,cl
> + je L19
> + incl edi
>
> /* create 4 byte mask which is just the desired byte repeated 4 times */
> +L5:
> movl ebx,ecx
> sall $8,ebx
> subl $4,edi
> @@ -49,15 +80,14 @@ L10:
> testl $-2139062144,edx
> jne L9
>
> - movl ebx,eax
> - xorl ecx,eax
> - leal -16843009(eax),edx
> - notl eax
> - andl eax,edx
> + xorl ebx,ecx
> + leal -16843009(ecx),edx
> + notl ecx
> + andl ecx,edx
> testl $-2139062144,edx
> je L10
> #endif /* not __OPTIMIZE_SIZE__ */
> -
> +
> /* loop while (*s && *s++ != c) */
> L9:
> leal -1(edi),eax
> @@ -69,7 +99,7 @@ L15:
> je L14
> cmpb bl,dl
> jne L15
> -
> +
> L14:
> /* if (*s == c) return address otherwise return NULL */
> cmpb bl,(eax)
> @@ -83,3 +113,60 @@ L19:
> leave
> ret
>
> +#ifndef __OPTIMIZE_SIZE__
> +/* Special case strchr(p,0). */
> +#if 0
> + /* Hideous performance on modern machines. */
> +L25:
> + cld
> + movl $-1,ecx
> + xor eax,eax
> + repnz
> + scasb
> + leal -1(edi),eax
> + jmp L19
> +#endif
> +L25:
> +/* Do byte-wise checks until string is aligned. */
> + test $3,edi
> + je L26
> + movl edi,eax
> + movb (eax),cl
> + testb cl,cl
> + je L19
> + incl edi
> +
> + test $3,edi
> + je L26
> + movl edi,eax
> + movb (eax),cl
> + testb cl,cl
> + je L19
> + incl edi
> +
> + test $3,edi
> + je L26
> + movl edi,eax
> + movb (eax),cl
> + testb cl,cl
> + je L19
> + incl edi
> +
> +L26:
> + subl $4,edi
> +
> +/* loop performing 4 byte mask checking for desired 0 byte */
> + .p2align 4,,7
> +L27:
> + addl $4,edi
> + movl (edi),ecx
> + leal -16843009(ecx),edx
> + movl ecx,eax
> + notl eax
> + andl eax,edx
> + testl $-2139062144,edx
> + je L27
> +
> + jmp L9
> +
> +#endif /* !__OPTIMIZE_SIZE__ */
>
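>
> Putting the pieces together: the addb into the zeroed ebx sets the flags
> as a free test for c == 0, so the je L25 dispatches straight to the NUL
> loop. The overall control flow is roughly this hedged C model (made-up
> name, not the committed code):
>
>     #include <stdint.h>
>     #include <stddef.h>
>
>     /* Hypothetical C model of the patched strchr's control flow.  */
>     char *
>     strchr_model (const char *s, int c)
>     {
>       unsigned char ch = (unsigned char) c;
>
>       /* Byte-wise until 4-byte aligned, so the word loads below are
>          aligned; the assembly unrolls this loop three times.  */
>       while ((uintptr_t) s & 3)
>         {
>           if (*s == (char) ch || *s == '\0')
>             goto tail;
>           s++;
>         }
>
>       {
>         uint32_t mask = ch * 0x01010101U;  /* target byte repeated 4x */
>         for (;;)
>           {
>             uint32_t w = *(const uint32_t *) s;
>             uint32_t x = w ^ mask;
>             if ((w - 0x01010101U) & ~w & 0x80808080U)
>               break;                       /* word contains a NUL */
>             if ((x - 0x01010101U) & ~x & 0x80808080U)
>               break;                       /* word contains the target */
>             s += 4;
>           }
>       }
>
>       /* Finish byte-wise, as the original loop at label L9 does.  */
>       while (*s && *s != (char) ch)
>         s++;
>     tail:
>       return *s == (char) ch ? (char *) s : NULL;
>     }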
>
>