faster memchr

Jeff Johnston jjohnstn@redhat.com
Tue May 27 19:07:00 GMT 2008


Patch checked in.  Thanks again.

-- Jeff J.

Eric Blake wrote:
> Yet another speedup.
>
> 2008-05-22 Eric Blake <ebb9@byu.net>
>
> 	Optimize the generic and x86 memchr.
> 	* libc/string/memchr.c (memchr) [!__OPTIMIZE_SIZE__]: Pre-align
> 	pointer so unaligned searches aren't penalized.
> 	* libc/machine/i386/memchr.S (memchr) [!__OPTIMIZE_SIZE__]: Word
> 	operations are faster than repnz byte searches.
>
>
> And the test program:
>
> #include <string.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <assert.h>
>
> #define UNALIGNED(X) ((long)X & (sizeof (long) - 1))
> #define LBLOCKSIZE (sizeof (long))
> #define DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080)
> #define DETECTCHAR(X,MASK) (DETECTNULL(X ^ MASK))
> #define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
> #define _CONST const
>
> #undef memchr
>
> void *
> memchr1 (const void *src_void, int c, size_t length)
> {
>   _CONST unsigned char *src = (_CONST unsigned char *) src_void;
>   unsigned char d = c;
>
> #if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
>   unsigned long *asrc;
>   unsigned long  mask;
>   int i;
>
>   while (UNALIGNED (src))
>     {
>       if (!length--)
>         return NULL;
>       if (*src == d)
>         return (void *) src;
>       src++;
>     }
>
>   if (!TOO_SMALL (length))
>     {
>       /* If we get this far, we know that length is large and src is
>          word-aligned. */
>       /* The fast code reads the source one word at a time and only
>          performs the bytewise search on word-sized segments if they
>          contain the search character, which is detected by XORing
>          the word-sized segment with a word-sized block of the search
>          character and then detecting for the presence of NUL in the
>          result.  */
>       asrc = (unsigned long *) src;
>       mask = d << 8 | d;
>       mask = mask << 16 | mask;
>       for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
>         mask = (mask << i) | mask;
>
>       while (length >= LBLOCKSIZE)
>         {
>           if (DETECTCHAR (*asrc, mask))
>             break;
>           length -= LBLOCKSIZE;
>           asrc++;
>         }
>
>       /* If there are fewer than LBLOCKSIZE characters left,
>          then we resort to the bytewise loop.  */
>
>       src = (unsigned char *) asrc;
>     }
>
> #endif /* not PREFER_SIZE_OVER_SPEED */
>
>   while (length--)
>     {
>       if (*src == d)
>         return (void *) src;
>       src++;
>     }
>
>   return NULL;
> }
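
The bit trick behind DETECTNULL/DETECTCHAR is easy to check by hand.  The
following is a minimal, standalone illustration (an editor's sketch, not part
of Eric's posted program) of the mask replication and the zero-byte detection
on concrete 32-bit words, matching the 0x01010101/0x80808080 constants used
above:

#include <stdio.h>

#define DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080)
#define DETECTCHAR(X,MASK) (DETECTNULL(X ^ MASK))

int main (void)
{
  unsigned int d = 0x41;              /* search byte ('A') */
  unsigned int mask = d << 8 | d;     /* 0x00004141 */
  mask = mask << 16 | mask;           /* 0x41414141 */

  /* XOR zeroes exactly those bytes equal to d; DETECTNULL then sets
     the top bit of every byte that became zero.  */
  unsigned int hit  = DETECTCHAR (0x11412233u, mask); /* 0x41 present */
  unsigned int miss = DETECTCHAR (0x11223344u, mask); /* 0x41 absent  */

  printf ("hit  = 0x%08x\n", hit);    /* prints 0x00800000 */
  printf ("miss = 0x%08x\n", miss);   /* prints 0x00000000 */
  return 0;
}
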
>
> void *memchr0 (const void *m, int c, size_t n);
>
> int main (int argc, char **argv)
> {
>   if (argc < 5)
>     {
>       printf ("usage: %s size repeat goal offset [func]\n", argv[0]);
>       return 1;
>     }
>   int size = strtol (argv[1], NULL, 0);
>   int repeat = strtol (argv[2], NULL, 0);
>   int goal = strtol (argv[3], NULL, 0);
>   int offset = strtol (argv[4], NULL, 0);
>   void *(*func)(const void *, int, size_t);
>   func = argc > 5 ? (atoi (argv[5]) ? memchr1 : memchr0) : memchr;
>
>   unsigned char *buf = malloc (size);
>   if (size)
>     {
>       __builtin_memset (buf, 1, size - 1);
>       buf[size - 1] = 2;
>     }
>
>   unsigned char *expected = (goal == 1 && offset < size - 1 ? buf + offset
>                     : goal == 2 && offset < size ? buf + size - 1 : NULL);
>   buf += offset;
>   size -= offset;
>   while (repeat--)
>     assert (func (buf, goal, size) == expected);
>   buf -= offset;
>   free (buf);
>   return 0;
> }
>
> And more interesting numbers:
>
> for i in `seq 0 25` ; do
>  echo $i
>  for j in `seq 0 $i` ; do
>   for k in 0 1 2 ; do
>    ./foo $i 1 $k $j 0
>   done
>  done
> done
>
> Proof that my patched assembly can handle any starting alignment, and
> handles whether the byte is found at the beginning, at any alignment at
> the end, or not at all.
>
> $ time ./foo 1000000 1000 0 0
>
> real	0m2.859s
> user	0m2.890s
> sys	0m0.000s
>
> $ time ./foo 1000000 1000 0 1
>
> real	0m2.859s
> user	0m2.858s
> sys	0m0.015s
>
> # Unpatched assembly uses 'repnz scasb' - dirt slow, but no alignment issues
>
> $ time ./foo 1000000 1000 0 0 1
>
> real	0m0.453s
> user	0m0.484s
> sys	0m0.000s
>
> $ time ./foo 1000000 1000 0 1 1
>
> real	0m0.453s
> user	0m0.468s
> sys	0m0.015s
>
> # Wow.  Patched C is 6x faster, with no alignment problems.
>
> $ time ./foo 1000000 1000 0 0 0
>
> real	0m0.391s
> user	0m0.405s
> sys	0m0.015s
>
> $ time ./foo 1000000 1000 0 1 0
>
> real	0m0.390s
> user	0m0.405s
> sys	0m0.015s
>
> # and patched assembly is 15% faster than patched C, on any alignment
>
> Without further ado, here's my 7x speedup for x86 memchr:
>
>
> Index: libc/string/memchr.c
> ===================================================================
> RCS file: /cvs/src/src/newlib/libc/string/memchr.c,v
> retrieving revision 1.2
> diff -u -p -r1.2 memchr.c
> --- libc/string/memchr.c	28 Oct 2005 21:21:07 -0000	1.2
> +++ libc/string/memchr.c	22 May 2008 22:42:32 -0000
> @@ -20,7 +20,7 @@ DESCRIPTION
>  	This function searches memory starting at <<*<[src]>>> for the
>  	character <[c]>.  The search only ends with the first
>  	occurrence of <[c]>, or after <[length]> characters; in
> -	particular, <<NULL>> does not terminate the search.
> +	particular, <<NUL>> does not terminate the search.
>  
>  RETURNS
>  	If the character <[c]> is found within <[length]> characters
> @@ -64,6 +64,9 @@ QUICKREF
>  #error long int is not a 32bit or 64bit byte
>  #endif
>  
> +/* DETECTCHAR returns nonzero if (long)X contains the byte used
> +   to fill (long)MASK. */
> +#define DETECTCHAR(X,MASK) (DETECTNULL(X ^ MASK))
>  
>  _PTR
>  _DEFUN (memchr, (src_void, c, length),
> @@ -71,73 +74,61 @@ _DEFUN (memchr, (src_void, c, length),
>  	int c _AND
>  	size_t length)
>  {
> -#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
>    _CONST unsigned char *src = (_CONST unsigned char *) src_void;
> +  unsigned char d = c;
>  
> -  c &= 0xff;
> -  
> -  while (length--)
> +#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
> +  unsigned long *asrc;
> +  unsigned long  mask;
> +  int i;
> +
> +  while (UNALIGNED (src))
>      {
> -      if (*src == c)
> -	return (char *) src;
> +      if (!length--)
> +        return NULL;
> +      if (*src == d)
> +        return (void *) src;
>        src++;
>      }
> -  return NULL;
> -#else
> -  _CONST unsigned char *src = (_CONST unsigned char *) src_void;
> -  unsigned long *asrc;
> -  unsigned long  buffer;
> -  unsigned long  mask;
> -  int i, j;
>  
> -  c &= 0xff;
> -  
> -  /* If the size is small, or src is unaligned, then 
> -     use the bytewise loop.  We can hope this is rare.  */
> -  if (!TOO_SMALL (length) && !UNALIGNED (src)) 
> +  if (!TOO_SMALL (length))
>      {
> -      /* The fast code reads the ASCII one word at a time and only 
> +      /* If we get this far, we know that length is large and src is
> +         word-aligned. */
> +      /* The fast code reads the source one word at a time and only
>           performs the bytewise search on word-sized segments if they
> -         contain the search character, which is detected by XORing 
> +         contain the search character, which is detected by XORing
>           the word-sized segment with a word-sized block of the search
> -         character and then detecting for the presence of NULL in the
> +         character and then detecting for the presence of NUL in the
>           result.  */
> -      asrc = (unsigned long*) src;
> -      mask = 0;
> -      for (i = 0; i < LBLOCKSIZE; i++)
> -        mask = (mask << 8) + c;
> +      asrc = (unsigned long *) src;
> +      mask = d << 8 | d;
> +      mask = mask << 16 | mask;
> +      for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
> +        mask = (mask << i) | mask;
>  
>        while (length >= LBLOCKSIZE)
>          {
> -          buffer = *asrc;
> -          buffer ^=  mask;
> -          if (DETECTNULL (buffer))
> -	    {
> -              src = (unsigned char*) asrc;
> -  	      for ( j = 0; j < LBLOCKSIZE; j++ )
> -	        {
> -                  if (*src == c)
> -                    return (char*) src;
> -                  src++;
> -	        }
> -   	    }
> +          if (DETECTCHAR (*asrc, mask))
> +            break;
>            length -= LBLOCKSIZE;
>            asrc++;
>          }
> -  
> +
>        /* If there are fewer than LBLOCKSIZE characters left,
>           then we resort to the bytewise loop.  */
>  
> -      src = (unsigned char*) asrc;
> +      src = (unsigned char *) asrc;
>      }
>  
> +#endif /* not PREFER_SIZE_OVER_SPEED */
> +
>    while (length--)
> -    { 
> -      if (*src == c)
> -        return (char*) src;
> +    {
> +      if (*src == d)
> +        return (void *) src;
>        src++;
> -    } 
> +    }
>  
>    return NULL;
> -#endif /* not PREFER_SIZE_OVER_SPEED */
>  }
> Index: libc/machine/i386/memchr.S
> ===================================================================
> RCS file: /cvs/src/src/newlib/libc/machine/i386/memchr.S,v
> retrieving revision 1.3
> diff -u -p -r1.3 memchr.S
> --- libc/machine/i386/memchr.S	20 Dec 2002 21:31:19 -0000	1.3
> +++ libc/machine/i386/memchr.S	22 May 2008 22:42:32 -0000
> @@ -1,6 +1,6 @@
>  /*
>   * ====================================================
> - * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
> + * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
>   *
>   * Permission to use, copy, modify, and distribute this
>   * software is freely granted, provided that this notice
> @@ -9,21 +9,23 @@
>   */
>  
>  	#include "i386mach.h"
> -	
> +
>  	.global	SYM (memchr)
>         SOTYPE_FUNCTION(memchr)
>  
>  SYM (memchr):
>  	pushl	ebp
>  	movl	esp,ebp
> -	pushl 	edi
> -	movl 	12(ebp),eax
> -	movl 	16(ebp),ecx
> -	movl 	8(ebp),edi
> +	pushl	edi
> +	movzbl	12(ebp),eax
> +	movl	16(ebp),ecx
> +	movl	8(ebp),edi
>  
>  	xorl	edx,edx
>  	testl	ecx,ecx
> -	jz	L1
> +	jz	L20
> +
> +#ifdef __OPTIMIZE_SIZE__
>  
>  	cld
>  	repnz
> @@ -31,9 +33,79 @@ SYM (memchr):
>  
>  	setnz	dl
>  	decl	edi
> +
> +#else /* !__OPTIMIZE_SIZE__ */
> +/* Do byte-wise checks until string is aligned.  */
> +	testl	$3,edi
> +	je	L5
> +	cmpb	(edi),al
> +	je	L15
> +	incl	edi
> +	decl	ecx
> +	je	L20
> +
> +	testl	$3,edi
> +	je	L5
> +	cmpb	(edi),al
> +	je	L15
> +	incl	edi
> +	decl	ecx
> +	je	L20
> +
> +	testl	$3,edi
> +	je	L5
> +	cmpb	(edi),al
> +	je	L15
> +	incl	edi
> +	decl	ecx
> +	je	L20
> +
> +/* Create a mask, then check a word at a time.  */
> +L5:
> +	movb	al,ah
> +	movl	eax,edx
> +	sall	$16,edx
> +	orl	edx,eax
> +	pushl	ebx
> +
> +	.p2align 4,,7
> +L8:
> +	subl	$4,ecx
> +	jc	L9
> +	movl	(edi),edx
> +	addl	$4,edi
> +	xorl	eax,edx
> +	leal	-16843009(edx),ebx
> +	notl	edx
> +	andl	edx,ebx
> +	testl	$-2139062144,ebx
> +	je	L8
> +
> +	subl	$4,edi
> +
> +L9:
> +	popl	ebx
> +	xorl	edx,edx
> +	addl	$4,ecx
> +	je	L20
> +
> +/* Final byte-wise checks.  */
> +	.p2align 4,,7
> +L10:
> +	cmpb	(edi),al
> +	je	L15
> +	incl	edi
> +	decl	ecx
> +	jne	L10
> +
> +	xorl	edi,edi
> +
> +#endif /* !__OPTIMIZE_SIZE__ */
> +
> +L15:
>  	decl	edx
>  	andl	edi,edx
> -L1:
> +L20:
>  	movl	edx,eax
>  
>  	leal	-4(ebp),esp
>
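
As an editorial footnote to the assembly (not from the original mail): the
decimal immediates in the L8 loop are the DETECTNULL constants in disguise,
-16843009 being -0x01010101 and -2139062144 being 0x80808080 read as a signed
32-bit immediate, so the leal/notl/andl/testl sequence computes the same word
test as DETECTCHAR in the C code.  A rough C rendering of that test:

/* Editorial sketch of the word test performed by the L8 loop.  `eax'
   holds the search byte replicated into all four byte lanes (built at
   L5); the result is nonzero when any byte of `word' equals the search
   byte, exactly as DETECTCHAR above.  */
int
word_has_byte (unsigned int word, unsigned int eax)
{
  unsigned int edx = word ^ eax;          /* xorl  eax,edx            */
  unsigned int ebx = edx - 0x01010101u;   /* leal  -16843009(edx),ebx */
  ebx &= ~edx;                            /* notl  edx; andl edx,ebx  */
  return (ebx & 0x80808080u) != 0;        /* testl $-2139062144,ebx   */
}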