This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.



Re: faster memchr


Patch checked in. Thanks again.

-- Jeff J.

Eric Blake wrote:
Yet another speedup.

2008-05-22 Eric Blake <ebb9@byu.net>

	Optimize the generic and x86 memchr.
	* libc/string/memchr.c (memchr) [!__OPTIMIZE_SIZE__]: Pre-align
	pointer so unaligned searches aren't penalized.
	* libc/machine/i386/memchr.S (memchr) [!__OPTIMIZE_SIZE__]: Word
	operations are faster than repnz byte searches.


And the test program:


#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#define UNALIGNED(X) ((long)X & (sizeof (long) - 1))
#define LBLOCKSIZE (sizeof (long))
#define DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080)
#define DETECTCHAR(X,MASK) (DETECTNULL(X ^ MASK))
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
#define _CONST const
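/* Worked example of the zero-byte trick above, assuming a 32-bit long:
   DETECTNULL (0x41004141) = (0x3FFF4040 & 0xBEFFBEBE) & 0x80808080
                           = 0x00800000, nonzero because one byte is zero,
   while DETECTNULL (0x41414141) = 0.  DETECTCHAR XORs first, so a byte equal
   to the search character becomes the zero byte that DETECTNULL spots.  */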

#undef memchr

void *
memchr1 (const void *src_void, int c, size_t length)
{
  _CONST unsigned char *src = (_CONST unsigned char *) src_void;
  unsigned char d = c;

#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
  unsigned long *asrc;
  unsigned long  mask;
  int i;

  while (UNALIGNED (src))
    {
      if (!length--)
        return NULL;
      if (*src == d)
        return (void *) src;
      src++;
    }

  if (!TOO_SMALL (length))
    {
      /* If we get this far, we know that length is large and src is
         word-aligned. */
      /* The fast code reads the source one word at a time and only
         performs the bytewise search on word-sized segments if they
         contain the search character, which is detected by XORing
         the word-sized segment with a word-sized block of the search
         character and then detecting for the presence of NUL in the
         result.  */
      asrc = (unsigned long *) src;
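      /* Replicate the search byte into every byte of the mask; for
         d == 0x61 this gives 0x61616161, and the loop below widens the
         mask when long is 64 bits.  */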
      mask = d << 8 | d;
      mask = mask << 16 | mask;
      for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
        mask = (mask << i) | mask;

      while (length >= LBLOCKSIZE)
        {
          if (DETECTCHAR (*asrc, mask))
            break;
          length -= LBLOCKSIZE;
          asrc++;
        }

      /* If there are fewer than LBLOCKSIZE characters left,
         then we resort to the bytewise loop.  */

      src = (unsigned char *) asrc;
    }

#endif /* not PREFER_SIZE_OVER_SPEED */

  while (length--)
    {
      if (*src == d)
        return (void *) src;
      src++;
    }

  return NULL;
}

void *memchr0 (const void *m, int c, size_t n);

int main (int argc, char **argv)
{
  if (argc < 5)
    {
      printf ("usage: %s size repeat goal offset [func]\n", argv[0]);
      return 1;
    }
  int size = strtol (argv[1], NULL, 0);
  int repeat = strtol (argv[2], NULL, 0);
  int goal = strtol (argv[3], NULL, 0);
  int offset = strtol (argv[4], NULL, 0);
  void *(*func)(const void *, int, size_t);
  func = argc > 5 ? (atoi (argv[5]) ? memchr1 : memchr0) : memchr;

  unsigned char *buf = malloc (size);
  if (size)
    {
      __builtin_memset (buf, 1, size - 1);
      buf[size - 1] = 2;
    }

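  /* Buffer layout: bytes [0, size-2] hold 1 and the final byte holds 2,
     so goal 0 is never present, goal 1 occurs everywhere except the last
     byte, and goal 2 occurs only there.  */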
  unsigned char *expected = (goal == 1 && offset < size - 1 ? buf + offset
                             : goal == 2 && offset < size ? buf + size - 1 : NULL);
  buf += offset;
  size -= offset;
  while (repeat--)
    assert (func (buf, goal, size) == expected);
  buf -= offset;
  free (buf);
  return 0;
}

And more interesting numbers:

for i in `seq 0 25` ; do
 echo $i
 for j in `seq 0 $i` ; do
  for k in 0 1 2 ; do
   ./foo $i 1 $k $j 0
  done
 done
done

Proof that my patched assembly can handle any starting alignment, and that it works whether the byte is found at the beginning, at any alignment at the end, or not at all.

$ time ./foo 1000000 1000 0 0

real	0m2.859s
user	0m2.890s
sys	0m0.000s

$ time ./foo 1000000 1000 0 1

real	0m2.859s
user	0m2.858s
sys	0m0.015s

# Unpatched assembly uses 'repnz scasb' - dirt slow, but no alignment issues

$ time ./foo 1000000 1000 0 0 1

real	0m0.453s
user	0m0.484s
sys	0m0.000s

$ time ./foo 1000000 1000 0 1 1

real	0m0.453s
user	0m0.468s
sys	0m0.015s

# Wow. Patched C is 6x faster, with no alignment problems.

$ time ./foo 1000000 1000 0 0 0

real	0m0.391s
user	0m0.405s
sys	0m0.015s

$ time ./foo 1000000 1000 0 1 0

real	0m0.390s
user	0m0.405s
sys	0m0.015s

# and patched assembly is 15% faster than patched C, on any alignment

Without further ado, here's my 7x speedup for x86 memchr:


Index: libc/string/memchr.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/string/memchr.c,v
retrieving revision 1.2
diff -u -p -r1.2 memchr.c
--- libc/string/memchr.c 28 Oct 2005 21:21:07 -0000 1.2
+++ libc/string/memchr.c 22 May 2008 22:42:32 -0000
@@ -20,7 +20,7 @@ DESCRIPTION
This function searches memory starting at <<*<[src]>>> for the
character <[c]>. The search only ends with the first
occurrence of <[c]>, or after <[length]> characters; in
- particular, <<NULL>> does not terminate the search.
+ particular, <<NUL>> does not terminate the search.
RETURNS
If the character <[c]> is found within <[length]> characters
@@ -64,6 +64,9 @@ QUICKREF
#error long int is not a 32bit or 64bit byte
#endif
+/* DETECTCHAR returns nonzero if (long)X contains the byte used
+ to fill (long)MASK. */
+#define DETECTCHAR(X,MASK) (DETECTNULL(X ^ MASK))
_PTR
_DEFUN (memchr, (src_void, c, length),
@@ -71,73 +74,61 @@ _DEFUN (memchr, (src_void, c, length),
int c _AND
size_t length)
{
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
_CONST unsigned char *src = (_CONST unsigned char *) src_void;
+ unsigned char d = c;
- c &= 0xff;
-
- while (length--)
+#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
+ unsigned long *asrc;
+ unsigned long mask;
+ int i;
+
+ while (UNALIGNED (src))
{
- if (*src == c)
- return (char *) src;
+ if (!length--)
+ return NULL;
+ if (*src == d)
+ return (void *) src;
src++;
}
- return NULL;
-#else
- _CONST unsigned char *src = (_CONST unsigned char *) src_void;
- unsigned long *asrc;
- unsigned long buffer;
- unsigned long mask;
- int i, j;
- c &= 0xff;
-
- /* If the size is small, or src is unaligned, then
- use the bytewise loop.  We can hope this is rare.  */
- if (!TOO_SMALL (length) && !UNALIGNED (src))
+ if (!TOO_SMALL (length))
{
- /* The fast code reads the ASCII one word at a time and only
+ /* If we get this far, we know that length is large and src is
+ word-aligned. */
+ /* The fast code reads the source one word at a time and only
performs the bytewise search on word-sized segments if they
- contain the search character, which is detected by XORing
+ contain the search character, which is detected by XORing
the word-sized segment with a word-sized block of the search
- character and then detecting for the presence of NULL in the
+ character and then detecting for the presence of NUL in the
result. */
- asrc = (unsigned long*) src;
- mask = 0;
- for (i = 0; i < LBLOCKSIZE; i++)
- mask = (mask << 8) + c;
+ asrc = (unsigned long *) src;
+ mask = d << 8 | d;
+ mask = mask << 16 | mask;
+ for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
+ mask = (mask << i) | mask;
while (length >= LBLOCKSIZE)
{
- buffer = *asrc;
- buffer ^= mask;
- if (DETECTNULL (buffer))
- {
- src = (unsigned char*) asrc;
- for ( j = 0; j < LBLOCKSIZE; j++ )
- {
- if (*src == c)
- return (char*) src;
- src++;
- }
- }
+ if (DETECTCHAR (*asrc, mask))
+ break;
length -= LBLOCKSIZE;
asrc++;
}
-
+
/* If there are fewer than LBLOCKSIZE characters left,
then we resort to the bytewise loop. */
- src = (unsigned char*) asrc;
+ src = (unsigned char *) asrc;
}
+#endif /* not PREFER_SIZE_OVER_SPEED */
+
while (length--)
- {
- if (*src == c)
- return (char*) src;
+ {
+ if (*src == d)
+ return (void *) src;
src++;
- }
+ }
return NULL;
-#endif /* not PREFER_SIZE_OVER_SPEED */
}
Index: libc/machine/i386/memchr.S
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/i386/memchr.S,v
retrieving revision 1.3
diff -u -p -r1.3 memchr.S
--- libc/machine/i386/memchr.S 20 Dec 2002 21:31:19 -0000 1.3
+++ libc/machine/i386/memchr.S 22 May 2008 22:42:32 -0000
@@ -1,6 +1,6 @@
/*
* ====================================================
- * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
+ * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
*
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
@@ -9,21 +9,23 @@
*/
#include "i386mach.h"
-
+
.global SYM (memchr)
SOTYPE_FUNCTION(memchr)
SYM (memchr):
pushl ebp
movl esp,ebp
- pushl edi
- movl 12(ebp),eax
- movl 16(ebp),ecx
- movl 8(ebp),edi
+ pushl edi
+ movzbl 12(ebp),eax
+ movl 16(ebp),ecx
+ movl 8(ebp),edi
xorl edx,edx
testl ecx,ecx
- jz L1
+ jz L20
+
+#ifdef __OPTIMIZE_SIZE__
cld
repnz
@@ -31,9 +33,79 @@ SYM (memchr):
setnz dl
decl edi
+
+#else /* !__OPTIMIZE_SIZE__ */
+/* Do byte-wise checks until string is aligned. */
+ testl $3,edi
+ je L5
+ cmpb (edi),al
+ je L15
+ incl edi
+ decl ecx
+ je L20
+
+ testl $3,edi
+ je L5
+ cmpb (edi),al
+ je L15
+ incl edi
+ decl ecx
+ je L20
+
+ testl $3,edi
+ je L5
+ cmpb (edi),al
+ je L15
+ incl edi
+ decl ecx
+ je L20
+
+/* Create a mask, then check a word at a time. */
+L5:
+ movb al,ah
+ movl eax,edx
+ sall $16,edx
+ orl edx,eax
+ pushl ebx
+
+ .p2align 4,,7
+L8:
+ subl $4,ecx
+ jc L9
+ movl (edi),edx
+ addl $4,edi
+ xorl eax,edx
+ leal -16843009(edx),ebx
+ notl edx
+ andl edx,ebx
+ testl $-2139062144,ebx
+ je L8
+
+ subl $4,edi
+
+L9:
+ popl ebx
+ xorl edx,edx
+ addl $4,ecx
+ je L20
+
+/* Final byte-wise checks. */
+ .p2align 4,,7
+L10:
+ cmpb (edi),al
+ je L15
+ incl edi
+ decl ecx
+ jne L10
+
+ xorl edi,edi
+
+#endif /* !__OPTIMIZE_SIZE__ */
+
+L15:
decl edx
andl edi,edx
-L1:
+L20:
movl edx,eax
leal -4(ebp),esp




