This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

faster strlen


-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

In the same vein as strchr. OK to apply?

2008-05-22 Eric Blake <ebb9@byu.net>

	Optimize the generic and x86 strlen.
	* libc/string/strlen.c (strlen) [!__OPTIMIZE_SIZE__]: Pre-align
	data so unaligned searches aren't penalized.
	* libc/machine/i386/strlen.S (strlen) [!__OPTIMIZE_SIZE__]: Word
	operations are faster than repnz byte searches.



I'm attaching my test program as well.  I compiled my changes to strlen.S
as the function strlen1, so I could compare to the pre-patch strlen on
cygwin.  When run as:

$ for i in `seq 0 16` ; do
| for j in `seq 0 $i` ; do
| ./foo $i 1 $j 0
| done; done

it proves that strlen handles any alignment for both the start of the
string as well as any alignment of the NUL at the end of the string (ie.
100% coverage of the code).

When run with longer strings and multiple iterations, I got the following
timings:

$ ./foo
usage: ./foo size repeat offset [func]
$ time ./foo 1000000 1000 0

real	0m2.071s
user	0m1.999s
sys	0m0.046s

$ time ./foo 1000000 1000 1

real	0m2.065s
user	0m1.921s
sys	0m0.031s

# Pre-patch assembly, uses 'repnz scasb', which is dirt slow
# at least alignment was not an issue with byte-wise algorithm

$ time ./foo 1000000 1000 0 0

real	0m0.707s
user	0m0.624s
sys	0m0.046s

$ time ./foo 1000000 1000 1 0

real	0m0.704s
user	0m0.655s
sys	0m0.046s

# patched assembly, 3x faster, alignment still doesn't matter

$ time ./foo 1000000 1000 0 1

real	0m0.702s
user	0m0.655s
sys	0m0.030s

$ time ./foo 1000000 1000 1 1

real	0m0.724s
user	0m0.686s
sys	0m0.031s

# patched C code, marginally slower than patched assembly

- --
Don't work too hard, make some time for fun as well!

Eric Blake             ebb9@byu.net
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.9 (Cygwin)
Comment: Public key at home.comcast.net/~ericblake/eblake.gpg
Comment: Using GnuPG with Mozilla - http://enigmail.mozdev.org

iEYEARECAAYFAkg1a/UACgkQ84KuGfSFAYD/WwCfXFRROzpPnnCThvNeuv/AzGJb
nGIAn06OYjM6DvOeQooj658yBF18bKt3
=Zkco
-----END PGP SIGNATURE-----
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#define UNALIGNED(X) ((long)X & (sizeof (long) - 1))
#define LBLOCKSIZE (sizeof (long))
#define DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080)
#define DETECTCHAR(X,MASK) (DETECTNULL(X ^ MASK))

#undef strlen

size_t
strlen2 (const char *str)
{
  const char *start = str;

#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
  unsigned long *aligned_addr;

  /* Align the pointer, so we can search a word at a time.  */
  while (UNALIGNED (str))
    {
      if (!*str)
	return str - start;
      str++;
    }

  /* If the string is word-aligned, we can check for the presence of
     a null in each word-sized block.  */
  aligned_addr = (unsigned long *)str;
  while (!DETECTNULL (*aligned_addr))
    aligned_addr++;

  /* Once a null is detected, we check each byte in that block for a
     precise position of the null.  */
  str = (char *) aligned_addr;

#endif /* not PREFER_SIZE_OVER_SPEED */

  while (*str)
    str++;
  return str - start;
}

size_t strlen1 (const char *);

int main (int argc, char **argv)
{
  if (argc < 4)
    {
      printf ("usage: %s size repeat offset [func]\n", argv[0]);
      return 1;
    }
  int size = strtol (argv[1], NULL, 0);
  int repeat = strtol (argv[2], NULL, 0);
  int offset = strtol (argv[3], NULL, 0);
  char *buf = malloc (size + 1);
  size_t (*func)(const char *);
  func = argc > 4 ? (atoi (argv[4]) ? strlen2 : strlen1) : strlen;

  memset (buf, 1, size);
  buf[size] = 0;

  size_t expected = size - offset;
  buf += offset;
  while (repeat--)
    assert (func (buf) == expected);
  buf -= offset;
  free (buf);
  return 0;
}
Index: libc/machine/i386/strlen.S
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/i386/strlen.S,v
retrieving revision 1.3
diff -u -p -r1.3 strlen.S
--- libc/machine/i386/strlen.S	20 Dec 2002 21:31:19 -0000	1.3
+++ libc/machine/i386/strlen.S	22 May 2008 12:47:22 -0000
@@ -1,6 +1,6 @@
 /*
  * ====================================================
- * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
+ * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
  *
  * Permission to use, copy, modify, and distribute this
  * software is freely granted, provided that this notice
@@ -20,12 +20,75 @@ SYM (strlen):
 	pushl edi
 	movl 8(ebp),edx
 
+#ifdef __OPTIMIZE_SIZE__
 	cld
 	movl edx,edi
 	movl $4294967295,ecx
 	xor eax,eax
 	repnz
 	scasb
+#else
+/* Modern x86 hardware is much faster at double-word
+   manipulation than with bytewise repnz scasb.  */
+
+/* Do byte-wise checks until string is aligned.  */
+	movl edx,edi
+	test $3,edi
+	je L5
+	movb (edi),cl
+	incl edi
+	testb cl,cl
+	je L15
+
+	test $3,edi
+	je L5
+	movb (edi),cl
+	incl edi
+	testb cl,cl
+	je L15
+
+	test $3,edi
+	je L5
+	movb (edi),cl
+	incl edi
+	testb cl,cl
+	je L15
+
+L5:
+	subl $4,edi
+
+/* loop performing 4 byte mask checking for desired 0 byte */
+	.p2align 4,,7
+L10:
+	addl $4,edi
+	movl (edi),ecx
+	leal -16843009(ecx),eax
+	notl ecx
+	andl ecx,eax
+	testl $-2139062144,eax
+	je L10
+
+/* Find which of four bytes is 0.  */
+	notl ecx
+	incl edi
+
+	testb cl,cl
+	je L15
+	incl edi
+	shrl $8,ecx
+
+	testb cl,cl
+	je L15
+	incl edi
+	shrl $8,ecx
+
+	testb cl,cl
+	je L15
+	incl edi
+
+#endif
+
+L15:
 	subl edx,edi
 	leal -1(edi),eax
 
Index: libc/string/strlen.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/string/strlen.c,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 strlen.c
--- libc/string/strlen.c	17 Feb 2000 19:39:48 -0000	1.1.1.1
+++ libc/string/strlen.c	22 May 2008 12:47:22 -0000
@@ -1,7 +1,7 @@
-/* 
+/*
 FUNCTION
 	<<strlen>>---character string length
-	
+
 INDEX
 	strlen
 
@@ -57,32 +57,32 @@ size_t
 _DEFUN (strlen, (str),
 	_CONST char *str)
 {
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
   _CONST char *start = str;
 
-  while (*str)
-    str++;
-
-  return str - start;
-#else
-  _CONST char *start = str;
+#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
   unsigned long *aligned_addr;
 
-  if (!UNALIGNED (str))
+  /* Align the pointer, so we can search a word at a time.  */
+  while (UNALIGNED (str))
     {
-      /* If the string is word-aligned, we can check for the presence of 
-         a null in each word-sized block.  */
-      aligned_addr = (unsigned long*)str;
-      while (!DETECTNULL (*aligned_addr))
-        aligned_addr++;
-
-      /* Once a null is detected, we check each byte in that block for a
-         precise position of the null.  */
-      str = (char*)aligned_addr;
+      if (!*str)
+	return str - start;
+      str++;
     }
- 
+
+  /* If the string is word-aligned, we can check for the presence of
+     a null in each word-sized block.  */
+  aligned_addr = (unsigned long *)str;
+  while (!DETECTNULL (*aligned_addr))
+    aligned_addr++;
+
+  /* Once a null is detected, we check each byte in that block for a
+     precise position of the null.  */
+  str = (char *) aligned_addr;
+
+#endif /* not PREFER_SIZE_OVER_SPEED */
+
   while (*str)
     str++;
   return str - start;
-#endif /* not PREFER_SIZE_OVER_SPEED */
 }

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]