faster memchr
Jeff Johnston
jjohnstn@redhat.com
Tue May 27 19:07:00 GMT 2008
Patch checked in. Thanks again.
-- Jeff J.
Eric Blake wrote:
> Yet another speedup.
>
> 2008-05-22 Eric Blake <ebb9@byu.net>
>
> Optimize the generic and x86 memchr.
> * libc/string/memchr.c (memchr) [!__OPTIMIZE_SIZE__]: Pre-align
> pointer so unaligned searches aren't penalized.
> * libc/machine/i386/memchr.S (memchr) [!__OPTIMIZE_SIZE__]: Word
> operations are faster than repnz byte searches.
>
>
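> For anyone who hasn't run into it, here is the zero-byte trick the word-wise
> search relies on, as a stand-alone sketch (my illustration, not part of the
> patch; it assumes a 32-bit long):
>
> #include <assert.h>
>
> /* Nonzero iff the 32-bit word X contains a zero byte: subtracting
>    0x01010101 sets the high bit of any byte that was zero, and "& ~X"
>    discards bytes whose high bit was already set, so any surviving
>    high bit means some byte of X was zero.  DETECTCHAR reduces "does
>    X contain the byte filling MASK" to the same test, because the XOR
>    zeroes exactly the matching bytes.  */
> #define DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080)
> #define DETECTCHAR(X,MASK) (DETECTNULL((X) ^ (MASK)))
>
> int main (void)
> {
>   unsigned long mask = 0x2a2a2a2a;          /* '*' replicated into every byte */
>   assert ( DETECTCHAR (0x11122a44, mask));  /* one byte is '*' */
>   assert (!DETECTCHAR (0x11122944, mask));  /* no byte is '*' */
>   return 0;
> }
>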
> And the test program:
>
> #include <string.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <assert.h>
>
> #define UNALIGNED(X) ((long)X & (sizeof (long) - 1))
> #define LBLOCKSIZE (sizeof (long))
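> /* Note: this DETECTNULL assumes a 32-bit long; newlib's memchr.c also
>    provides a 64-bit variant.  */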
> #define DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080)
> #define DETECTCHAR(X,MASK) (DETECTNULL(X ^ MASK))
> #define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
> #define _CONST const
>
> #undef memchr
>
> void *
> memchr1 (const void *src_void, int c, size_t length)
> {
>   _CONST unsigned char *src = (_CONST unsigned char *) src_void;
>   unsigned char d = c;
>
> #if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
>   unsigned long *asrc;
>   unsigned long mask;
>   int i;
>
>   while (UNALIGNED (src))
>     {
>       if (!length--)
>         return NULL;
>       if (*src == d)
>         return (void *) src;
>       src++;
>     }
>
>   if (!TOO_SMALL (length))
>     {
>       /* If we get this far, we know that length is large and src is
>          word-aligned. */
>       /* The fast code reads the source one word at a time and only
>          performs the bytewise search on word-sized segments if they
>          contain the search character, which is detected by XORing
>          the word-sized segment with a word-sized block of the search
>          character and then detecting for the presence of NUL in the
>          result. */
>       asrc = (unsigned long *) src;
>       mask = d << 8 | d;
>       mask = mask << 16 | mask;
>       for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
>         mask = (mask << i) | mask;
>
>       while (length >= LBLOCKSIZE)
>         {
>           if (DETECTCHAR (*asrc, mask))
>             break;
>           length -= LBLOCKSIZE;
>           asrc++;
>         }
>
>       /* If there are fewer than LBLOCKSIZE characters left,
>          then we resort to the bytewise loop. */
>
>       src = (unsigned char *) asrc;
>     }
>
> #endif /* not PREFER_SIZE_OVER_SPEED */
>
>   while (length--)
>     {
>       if (*src == d)
>         return (void *) src;
>       src++;
>     }
>
>   return NULL;
> }
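>
> To trace the mask construction in the fast path above with, say, d == 0x2a
> (my worked example, not part of the patch):
>
>   mask = d << 8 | d;           /* 0x2a2a               */
>   mask = mask << 16 | mask;    /* 0x2a2a2a2a           */
>   /* and, only when long is 8 bytes, one pass of the loop with i == 32: */
>   mask = (mask << 32) | mask;  /* 0x2a2a2a2a2a2a2a2a   */
>
> so every byte of the long holds the search character, and a single XOR per
> word reduces the scan to DETECTNULL.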
>
> void *memchr0 (const void *m, int c, size_t n);
>
> int main (int argc, char **argv)
> {
>   if (argc < 5)
>     {
>       printf ("usage: %s size repeat goal offset [func]\n", argv[0]);
>       return 1;
>     }
>   int size = strtol (argv[1], NULL, 0);
>   int repeat = strtol (argv[2], NULL, 0);
>   int goal = strtol (argv[3], NULL, 0);
>   int offset = strtol (argv[4], NULL, 0);
>   void *(*func)(const void *, int, size_t);
>   func = argc > 5 ? (atoi (argv[5]) ? memchr1 : memchr0) : memchr;
>
>   unsigned char *buf = malloc (size);
>   if (size)
>     {
>       __builtin_memset (buf, 1, size - 1);
>       buf[size - 1] = 2;
>     }
>
>   char *expected = (goal == 1 && offset < size - 1 ? buf + offset
>                     : goal == 2 && offset < size ? buf + size - 1 : NULL);
>   buf += offset;
>   size -= offset;
>   while (repeat--)
>     assert (func (buf, goal, size) == expected);
>   buf -= offset;
>   free (buf);
>   return 0;
> }
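>
> (The buffer is filled with 1 and its very last byte set to 2, so a goal of 0
> is never found, a goal of 1 matches the first byte searched when any 1s
> remain, and a goal of 2 matches only the final byte.)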
>
> And more interesting numbers:
>
> for i in `seq 0 25` ; do
>   echo $i
>   for j in `seq 0 $i` ; do
>     for k in 0 1 2 ; do
>       ./foo $i 1 $k $j 0
>     done
>   done
> done
>
> Proof that my patched assembly can handle any starting alignment, and that it
> works whether the byte is found at the beginning, at any alignment at the end,
> or not at all.
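>
> (To read the timings: with only four arguments the harness times the installed
> memchr, i.e. the current 'repnz scasb' assembly; a fifth argument of 1 selects
> memchr1, the patched C above, and 0 selects memchr0, the patched assembly.)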
>
> $ time ./foo 1000000 1000 0 0
>
> real 0m2.859s
> user 0m2.890s
> sys 0m0.000s
>
> $ time ./foo 1000000 1000 0 1
>
> real 0m2.859s
> user 0m2.858s
> sys 0m0.015s
>
> # Unpatched assembly uses 'repnz scasb' - dirt slow, but no alignment issues
>
> $ time ./foo 1000000 1000 0 0 1
>
> real 0m0.453s
> user 0m0.484s
> sys 0m0.000s
>
> $ time ./foo 1000000 1000 0 1 1
>
> real 0m0.453s
> user 0m0.468s
> sys 0m0.015s
>
> # Wow. Patched C is 6x faster, with no alignment problems.
>
> $ time ./foo 1000000 1000 0 0 0
>
> real 0m0.391s
> user 0m0.405s
> sys 0m0.015s
>
> $ time ./foo 1000000 1000 0 1 0
>
> real 0m0.390s
> user 0m0.405s
> sys 0m0.015s
>
> # and patched assembly is 15% faster than patched C, on any alignment
>
> Without further ado, here's my 7x speedup for x86 memchr:
>
>
> Index: libc/string/memchr.c
> ===================================================================
> RCS file: /cvs/src/src/newlib/libc/string/memchr.c,v
> retrieving revision 1.2
> diff -u -p -r1.2 memchr.c
> --- libc/string/memchr.c 28 Oct 2005 21:21:07 -0000 1.2
> +++ libc/string/memchr.c 22 May 2008 22:42:32 -0000
> @@ -20,7 +20,7 @@ DESCRIPTION
> This function searches memory starting at <<*<[src]>>> for the
> character <[c]>. The search only ends with the first
> occurrence of <[c]>, or after <[length]> characters; in
> - particular, <<NULL>> does not terminate the search.
> + particular, <<NUL>> does not terminate the search.
>
> RETURNS
> If the character <[c]> is found within <[length]> characters
> @@ -64,6 +64,9 @@ QUICKREF
> #error long int is not a 32bit or 64bit byte
> #endif
>
> +/* DETECTCHAR returns nonzero if (long)X contains the byte used
> + to fill (long)MASK. */
> +#define DETECTCHAR(X,MASK) (DETECTNULL(X ^ MASK))
>
> _PTR
> _DEFUN (memchr, (src_void, c, length),
> @@ -71,73 +74,61 @@ _DEFUN (memchr, (src_void, c, length),
> int c _AND
> size_t length)
> {
> -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
> _CONST unsigned char *src = (_CONST unsigned char *) src_void;
> + unsigned char d = c;
>
> - c &= 0xff;
> -
> - while (length--)
> +#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
> + unsigned long *asrc;
> + unsigned long mask;
> + int i;
> +
> + while (UNALIGNED (src))
> {
> - if (*src == c)
> - return (char *) src;
> + if (!length--)
> + return NULL;
> + if (*src == d)
> + return (void *) src;
> src++;
> }
> - return NULL;
> -#else
> - _CONST unsigned char *src = (_CONST unsigned char *) src_void;
> - unsigned long *asrc;
> - unsigned long buffer;
> - unsigned long mask;
> - int i, j;
>
> - c &= 0xff;
> -
> - /* If the size is small, or src is unaligned, then
> - use the bytewise loop. We can hope this is rare. */
> - if (!TOO_SMALL (length) && !UNALIGNED (src))
> + if (!TOO_SMALL (length))
> {
> - /* The fast code reads the ASCII one word at a time and only
> + /* If we get this far, we know that length is large and src is
> + word-aligned. */
> + /* The fast code reads the source one word at a time and only
> performs the bytewise search on word-sized segments if they
> - contain the search character, which is detected by XORing
> + contain the search character, which is detected by XORing
> the word-sized segment with a word-sized block of the search
> - character and then detecting for the presence of NULL in the
> + character and then detecting for the presence of NUL in the
> result. */
> - asrc = (unsigned long*) src;
> - mask = 0;
> - for (i = 0; i < LBLOCKSIZE; i++)
> - mask = (mask << 8) + c;
> + asrc = (unsigned long *) src;
> + mask = d << 8 | d;
> + mask = mask << 16 | mask;
> + for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
> + mask = (mask << i) | mask;
>
> while (length >= LBLOCKSIZE)
> {
> - buffer = *asrc;
> - buffer ^= mask;
> - if (DETECTNULL (buffer))
> - {
> - src = (unsigned char*) asrc;
> - for ( j = 0; j < LBLOCKSIZE; j++ )
> - {
> - if (*src == c)
> - return (char*) src;
> - src++;
> - }
> - }
> + if (DETECTCHAR (*asrc, mask))
> + break;
> length -= LBLOCKSIZE;
> asrc++;
> }
> -
> +
> /* If there are fewer than LBLOCKSIZE characters left,
> then we resort to the bytewise loop. */
>
> - src = (unsigned char*) asrc;
> + src = (unsigned char *) asrc;
> }
>
> +#endif /* not PREFER_SIZE_OVER_SPEED */
> +
> while (length--)
> - {
> - if (*src == c)
> - return (char*) src;
> + {
> + if (*src == d)
> + return (void *) src;
> src++;
> - }
> + }
>
> return NULL;
> -#endif /* not PREFER_SIZE_OVER_SPEED */
> }
> Index: libc/machine/i386/memchr.S
> ===================================================================
> RCS file: /cvs/src/src/newlib/libc/machine/i386/memchr.S,v
> retrieving revision 1.3
> diff -u -p -r1.3 memchr.S
> --- libc/machine/i386/memchr.S 20 Dec 2002 21:31:19 -0000 1.3
> +++ libc/machine/i386/memchr.S 22 May 2008 22:42:32 -0000
> @@ -1,6 +1,6 @@
> /*
> * ====================================================
> - * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
> + * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
> *
> * Permission to use, copy, modify, and distribute this
> * software is freely granted, provided that this notice
> @@ -9,21 +9,23 @@
> */
>
> #include "i386mach.h"
> -
> +
> .global SYM (memchr)
> SOTYPE_FUNCTION(memchr)
>
> SYM (memchr):
> pushl ebp
> movl esp,ebp
> - pushl edi
> - movl 12(ebp),eax
> - movl 16(ebp),ecx
> - movl 8(ebp),edi
> + pushl edi
> + movzbl 12(ebp),eax
> + movl 16(ebp),ecx
> + movl 8(ebp),edi
>
> xorl edx,edx
> testl ecx,ecx
> - jz L1
> + jz L20
> +
> +#ifdef __OPTIMIZE_SIZE__
>
> cld
> repnz
> @@ -31,9 +33,79 @@ SYM (memchr):
>
> setnz dl
> decl edi
> +
> +#else /* !__OPTIMIZE_SIZE__ */
> +/* Do byte-wise checks until string is aligned. */
> + testl $3,edi
> + je L5
> + cmpb (edi),al
> + je L15
> + incl edi
> + decl ecx
> + je L20
> +
> + testl $3,edi
> + je L5
> + cmpb (edi),al
> + je L15
> + incl edi
> + decl ecx
> + je L20
> +
> + testl $3,edi
> + je L5
> + cmpb (edi),al
> + je L15
> + incl edi
> + decl ecx
> + je L20
> +
> +/* Create a mask, then check a word at a time. */
> +L5:
> + movb al,ah
> + movl eax,edx
> + sall $16,edx
> + orl edx,eax
> + pushl ebx
> +
> + .p2align 4,,7
> +L8:
> + subl $4,ecx
> + jc L9
> + movl (edi),edx
> + addl $4,edi
> + xorl eax,edx
> + leal -16843009(edx),ebx
> + notl edx
> + andl edx,ebx
> + testl $-2139062144,ebx
> + je L8
> +
> + subl $4,edi
> +
> +L9:
> + popl ebx
> + xorl edx,edx
> + addl $4,ecx
> + je L20
> +
> +/* Final byte-wise checks. */
> + .p2align 4,,7
> +L10:
> + cmpb (edi),al
> + je L15
> + incl edi
> + decl ecx
> + jne L10
> +
> + xorl edi,edi
> +
> +#endif /* !__OPTIMIZE_SIZE__ */
> +
> +L15:
> decl edx
> andl edi,edx
> -L1:
> +L20:
> movl edx,eax
>
> leal -4(ebp),esp
>
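> A note on the two magic constants in the word loop of the new assembly (my
> decoding, not stated in the patch): -16843009 is 0xfefefeff, so
> "leal -16843009(edx),ebx" performs DETECTNULL's subtraction of 0x01010101,
> and -2139062144 is 0x80808080, the high-bit mask.  A quick sanity check,
> assuming 32-bit int as on x86:
>
>   #include <assert.h>
>   int main (void)
>   {
>     assert ((unsigned) -16843009 == 0xfefefeffu);     /* -0x01010101 */
>     assert ((unsigned) -2139062144 == 0x80808080u);   /* high-bit mask */
>     return 0;
>   }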
>
>