faster memchr
Eric Blake
ebb9@byu.net
Mon May 26 22:57:00 GMT 2008
Yet another speedup.
2008-05-22 Eric Blake <ebb9@byu.net>
Optimize the generic and x86 memchr.
* libc/string/memchr.c (memchr) [!__OPTIMIZE_SIZE__]: Pre-align
pointer so unaligned searches aren't penalized.
* libc/machine/i386/memchr.S (memchr) [!__OPTIMIZE_SIZE__]: Word
operations are faster than repnz byte searches.
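The trick both versions share: for a 32-bit word X, (X - 0x01010101) & ~X & 0x80808080 is nonzero exactly when some byte of X is zero, so XORing each word with the search character replicated into every byte turns "does this word contain c?" into "does this word contain NUL?", testing four (or eight) bytes per iteration. A stand-alone sanity check of the idea, separate from the patch (32-bit words assumed):

#include <assert.h>
#include <stdint.h>

/* Nonzero iff some byte of X is zero; this form has no false
   positives.  */
#define HAS_ZERO(X) (((X) - 0x01010101) & ~(X) & 0x80808080)

int
main (void)
{
  uint32_t mask = 0x41414141;            /* search char 'A' in every byte */
  assert (HAS_ZERO (0x42410042));        /* a zero byte is present */
  assert (!HAS_ZERO (0x42414142));       /* no zero byte */
  assert (HAS_ZERO (0x42414142 ^ mask)); /* word contains an 'A' */
  assert (!HAS_ZERO (0x42424242 ^ mask)); /* word does not */
  return 0;
}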
And the test program:
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
/* Nonzero if X is not aligned on a "long" boundary.  */
#define UNALIGNED(X) ((long)(X) & (sizeof (long) - 1))
#define LBLOCKSIZE (sizeof (long))
/* Nonzero if the word X contains a zero byte.  32-bit long is
   assumed here, which is fine for i386; newlib's real source also
   carries the 64-bit constants.  */
#define DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080)
/* DETECTCHAR returns nonzero if (long)X contains the byte used
   to fill (long)MASK.  */
#define DETECTCHAR(X,MASK) (DETECTNULL(X ^ MASK))
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
#define _CONST const
#undef memchr
void *
memchr1 (const void *src_void, int c, size_t length)
{
_CONST unsigned char *src = (_CONST unsigned char *) src_void;
unsigned char d = c;
#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
unsigned long *asrc;
unsigned long mask;
int i;
while (UNALIGNED (src))
{
if (!length--)
return NULL;
if (*src == d)
return (void *) src;
src++;
}
if (!TOO_SMALL (length))
{
/* If we get this far, we know that length is large and src is
word-aligned. */
/* The fast code reads the source one word at a time and only
performs the bytewise search on word-sized segments if they
contain the search character, which is detected by XORing
the word-sized segment with a word-sized block of the search
character and then detecting for the presence of NUL in the
result. */
asrc = (unsigned long *) src;
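      /* Replicate the search byte into every byte of mask:
	 0x41 -> 0x4141 -> 0x41414141, plus one more doubling
	 when long is 64 bits.  */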
mask = d << 8 | d;
mask = mask << 16 | mask;
for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
mask = (mask << i) | mask;
while (length >= LBLOCKSIZE)
{
if (DETECTCHAR (*asrc, mask))
break;
length -= LBLOCKSIZE;
asrc++;
}
/* If there are fewer than LBLOCKSIZE characters left,
then we resort to the bytewise loop. */
src = (unsigned char *) asrc;
}
#endif /* not PREFER_SIZE_OVER_SPEED */
while (length--)
{
if (*src == d)
return (void *) src;
src++;
}
return NULL;
}
void *memchr0 (const void *m, int c, size_t n);
int main (int argc, char **argv)
{
if (argc < 5)
{
printf ("usage: %s size repeat goal offset [func]\n", argv[0]);
return 1;
}
int size = strtol (argv[1], NULL, 0);
int repeat = strtol (argv[2], NULL, 0);
int goal = strtol (argv[3], NULL, 0);
int offset = strtol (argv[4], NULL, 0);
void *(*func)(const void *, int, size_t);
func = argc > 5 ? (atoi (argv[5]) ? memchr1 : memchr0) : memchr;
unsigned char *buf = malloc (size);
if (size)
{
__builtin_memset (buf, 1, size - 1);
buf[size - 1] = 2;
}
  unsigned char *expected = (goal == 1 && offset < size - 1 ? buf + offset
			     : goal == 2 && offset < size ? buf + size - 1 : NULL);
buf += offset;
size -= offset;
while (repeat--)
assert (func (buf, goal, size) == expected);
buf -= offset;
free (buf);
return 0;
}
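For reference, memchr1 above is the patched C; memchr0 has to be linked in separately from whichever memchr.S variant is being timed, with its entry point renamed so it doesn't collide with the library's memchr. File names here are hypothetical, but the build is along these lines:

$ gcc -O2 -c foo.c
$ gcc -O2 -c memchr0.S   # the .S under test, entry point renamed to memchr0
$ gcc -o foo foo.o memchr0.o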
And more interesting numbers:
for i in `seq 0 25` ; do
echo $i
for j in `seq 0 $i` ; do
for k in 0 1 2 ; do
./foo $i 1 $k $j 0
done
done
done
Proof that my patched assembly can handle any starting alignment, and
handles whether the byte is found at the beginning (goal 1), at any
alignment at the end (goal 2), or not found at all (goal 0, which never
occurs in the buffer).
$ time ./foo 1000000 1000 0 0
real 0m2.859s
user 0m2.890s
sys 0m0.000s
$ time ./foo 1000000 1000 0 1
real 0m2.859s
user 0m2.858s
sys 0m0.015s
# Unpatched assembly uses 'repnz scasb' - dirt slow, but no alignment issues
$ time ./foo 1000000 1000 0 0 1
real 0m0.453s
user 0m0.484s
sys 0m0.000s
$ time ./foo 1000000 1000 0 1 1
real 0m0.453s
user 0m0.468s
sys 0m0.015s
# Wow. Patched C is 6x faster, with no alignment problems.
$ time ./foo 1000000 1000 0 0 0
real 0m0.391s
user 0m0.405s
sys 0m0.015s
$ time ./foo 1000000 1000 0 1 0
real 0m0.390s
user 0m0.405s
sys 0m0.015s
# and patched assembly is 15% faster than patched C, on any alignment
Without further ado, here's my 7x speedup for x86 memchr:
Index: libc/string/memchr.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/string/memchr.c,v
retrieving revision 1.2
diff -u -p -r1.2 memchr.c
--- libc/string/memchr.c 28 Oct 2005 21:21:07 -0000 1.2
+++ libc/string/memchr.c 22 May 2008 22:42:32 -0000
@@ -20,7 +20,7 @@ DESCRIPTION
This function searches memory starting at <<*<[src]>>> for the
character <[c]>. The search only ends with the first
occurrence of <[c]>, or after <[length]> characters; in
- particular, <<NULL>> does not terminate the search.
+ particular, <<NUL>> does not terminate the search.
RETURNS
If the character <[c]> is found within <[length]> characters
@@ -64,6 +64,9 @@ QUICKREF
#error long int is not a 32bit or 64bit byte
#endif
+/* DETECTCHAR returns nonzero if (long)X contains the byte used
+ to fill (long)MASK. */
+#define DETECTCHAR(X,MASK) (DETECTNULL(X ^ MASK))
_PTR
_DEFUN (memchr, (src_void, c, length),
@@ -71,73 +74,61 @@ _DEFUN (memchr, (src_void, c, length),
int c _AND
size_t length)
{
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
_CONST unsigned char *src = (_CONST unsigned char *) src_void;
+ unsigned char d = c;
- c &= 0xff;
-
- while (length--)
+#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
+ unsigned long *asrc;
+ unsigned long mask;
+ int i;
+
+ while (UNALIGNED (src))
{
- if (*src == c)
- return (char *) src;
+ if (!length--)
+ return NULL;
+ if (*src == d)
+ return (void *) src;
src++;
}
- return NULL;
-#else
- _CONST unsigned char *src = (_CONST unsigned char *) src_void;
- unsigned long *asrc;
- unsigned long buffer;
- unsigned long mask;
- int i, j;
- c &= 0xff;
-
- /* If the size is small, or src is unaligned, then
- use the bytewise loop. We can hope this is rare. */
- if (!TOO_SMALL (length) && !UNALIGNED (src))
+ if (!TOO_SMALL (length))
{
- /* The fast code reads the ASCII one word at a time and only
+ /* If we get this far, we know that length is large and src is
+ word-aligned. */
+ /* The fast code reads the source one word at a time and only
performs the bytewise search on word-sized segments if they
- contain the search character, which is detected by XORing
+ contain the search character, which is detected by XORing
the word-sized segment with a word-sized block of the search
- character and then detecting for the presence of NULL in the
+ character and then detecting for the presence of NUL in the
result. */
- asrc = (unsigned long*) src;
- mask = 0;
- for (i = 0; i < LBLOCKSIZE; i++)
- mask = (mask << 8) + c;
+ asrc = (unsigned long *) src;
+ mask = d << 8 | d;
+ mask = mask << 16 | mask;
+ for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
+ mask = (mask << i) | mask;
while (length >= LBLOCKSIZE)
{
- buffer = *asrc;
- buffer ^= mask;
- if (DETECTNULL (buffer))
- {
- src = (unsigned char*) asrc;
- for ( j = 0; j < LBLOCKSIZE; j++ )
- {
- if (*src == c)
- return (char*) src;
- src++;
- }
- }
+ if (DETECTCHAR (*asrc, mask))
+ break;
length -= LBLOCKSIZE;
asrc++;
}
-
+
/* If there are fewer than LBLOCKSIZE characters left,
then we resort to the bytewise loop. */
- src = (unsigned char*) asrc;
+ src = (unsigned char *) asrc;
}
+#endif /* not PREFER_SIZE_OVER_SPEED */
+
while (length--)
- {
- if (*src == c)
- return (char*) src;
+ {
+ if (*src == d)
+ return (void *) src;
src++;
- }
+ }
return NULL;
-#endif /* not PREFER_SIZE_OVER_SPEED */
}
Index: libc/machine/i386/memchr.S
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/i386/memchr.S,v
retrieving revision 1.3
diff -u -p -r1.3 memchr.S
--- libc/machine/i386/memchr.S 20 Dec 2002 21:31:19 -0000 1.3
+++ libc/machine/i386/memchr.S 22 May 2008 22:42:32 -0000
@@ -1,6 +1,6 @@
/*
* ====================================================
- * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
+ * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
*
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
@@ -9,21 +9,23 @@
*/
#include "i386mach.h"
-
+
.global SYM (memchr)
SOTYPE_FUNCTION(memchr)
SYM (memchr):
pushl ebp
movl esp,ebp
- pushl edi
- movl 12(ebp),eax
- movl 16(ebp),ecx
- movl 8(ebp),edi
+ pushl edi
+ movzbl 12(ebp),eax
+ movl 16(ebp),ecx
+ movl 8(ebp),edi
xorl edx,edx
testl ecx,ecx
- jz L1
+ jz L20
+
+#ifdef __OPTIMIZE_SIZE__
cld
repnz
@@ -31,9 +33,79 @@ SYM (memchr):
setnz dl
decl edi
+
+#else /* !__OPTIMIZE_SIZE__ */
+/* Do byte-wise checks until string is aligned. */
+ testl $3,edi
+ je L5
+ cmpb (edi),al
+ je L15
+ incl edi
+ decl ecx
+ je L20
+
+ testl $3,edi
+ je L5
+ cmpb (edi),al
+ je L15
+ incl edi
+ decl ecx
+ je L20
+
+ testl $3,edi
+ je L5
+ cmpb (edi),al
+ je L15
+ incl edi
+ decl ecx
+ je L20
+
+/* Create a mask, then check a word at a time. */
+L5:
+ movb al,ah
+ movl eax,edx
+ sall $16,edx
+ orl edx,eax
+ pushl ebx
+
+ .p2align 4,,7
+L8:
+ subl $4,ecx
+ jc L9
+ movl (edi),edx
+ addl $4,edi
+ xorl eax,edx
+ leal -16843009(edx),ebx
+ notl edx
+ andl edx,ebx
+ testl $-2139062144,ebx
+ je L8
+
+ subl $4,edi
+
+L9:
+ popl ebx
+ xorl edx,edx
+ addl $4,ecx
+ je L20
+
+/* Final byte-wise checks. */
+ .p2align 4,,7
+L10:
+ cmpb (edi),al
+ je L15
+ incl edi
+ decl ecx
+ jne L10
+
+ xorl edi,edi
+
+#endif /* !__OPTIMIZE_SIZE__ */
+
+L15:
decl edx
andl edi,edx
-L1:
+L20:
movl edx,eax
leal -4(ebp),esp
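For anyone mapping the assembly back to the C: the magic numbers in the
L8 loop are just the DETECTNULL constants written as signed decimal, so
the leal/notl/andl/testl sequence computes
((word ^ mask) - 0x01010101) & ~(word ^ mask) & 0x80808080 — exactly
DETECTCHAR from the C version, four bytes per iteration. A quick,
stand-alone check of that correspondence:

#include <assert.h>
int
main (void)
{
  /* leal -16843009(edx),ebx subtracts 0x01010101.  */
  assert (16843009 == 0x01010101);
  /* testl $-2139062144,ebx masks with 0x80808080: same bit pattern,
     viewed as a 32-bit two's-complement immediate.  */
  assert ((unsigned) -2139062144 == 0x80808080u);
  return 0;
}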