This is the mail archive of the
newlib@sourceware.org
mailing list for the newlib project.
faster memset
- From: Eric Blake <ebb9 at byu dot net>
- To: newlib at sources dot redhat dot com
- Date: Thu, 22 May 2008 16:56:54 +0000 (UTC)
- Subject: faster memset
Next in the list of unoptimized string functions.
2008-05-22 Eric Blake <ebb9@byu.net>
Optimize the generic and x86 memset.
* libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]: Pre-align
pointer so unaligned stores aren't penalized.
* libc/machine/i386/memset.S (memset): [!__OPTIMIZE_SIZE__]:
Pre-align pointer so unaligned stores aren't penalized. Prefer
8-byte over 4-byte alignment. Reduce register pressure.
Here's my test program, with memset0 as my patched assembly, and memset as the
unpatched version, compiled at -O2 on cygwin:
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#define LBLOCKSIZE (sizeof (long))
#define UNALIGNED(X) ((long)X & (sizeof (long) - 1))
#define DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080)
#define DETECTCHAR(X,MASK) (DETECTNULL(X ^ MASK))
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
#undef memset
/* Reference C memset: byte-fill up to a word boundary, then store
   whole words (4 at a time while possible), then finish the tail
   bytewise.  Returns M, like memset.  */
void *
memset1 (void *m, int c, size_t n)
{
  char *dst = (char *) m;
#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
  unsigned long word;
  unsigned long *wp;
  int shift;
  unsigned int byte = c & 0xff;	/* Strip any sign extension of C.  */

  /* Byte-fill until DST sits on a word boundary; if N runs out
     first, we are already done.  */
  while (UNALIGNED (dst))
    {
      if (n == 0)
	return m;
      *dst++ = (char) c;
      n--;
    }

  if (!TOO_SMALL (n))
    {
      /* DST is word-aligned here and at least one full word remains.  */
      wp = (unsigned long *) dst;

      /* Replicate BYTE into every byte of WORD by repeated doubling;
	 the loop only runs when long is wider than 32 bits.  */
      word = (byte << 8) | byte;
      word |= word << 16;
      for (shift = 32; shift < LBLOCKSIZE * 8; shift <<= 1)
	word |= word << shift;

      /* Unrolled: four word stores per iteration.  */
      for (; n >= 4 * LBLOCKSIZE; n -= 4 * LBLOCKSIZE)
	{
	  wp[0] = word;
	  wp[1] = word;
	  wp[2] = word;
	  wp[3] = word;
	  wp += 4;
	}

      /* Remaining whole words, one at a time.  */
      for (; n >= LBLOCKSIZE; n -= LBLOCKSIZE)
	*wp++ = word;

      dst = (char *) wp;
    }
#endif /* not PREFER_SIZE_OVER_SPEED */

  /* Trailing bytes (or the whole job when optimizing for size).  */
  while (n--)
    *dst++ = (char) c;
  return m;
}
void *memset0 (void *m, int c, size_t n);

/* Benchmark/validation driver for memset implementations.
   usage: prog size repeat validate offset [func]
     size     total buffer length in bytes
     repeat   number of memset calls to time
     validate nonzero => verify every byte after each call
     offset   bytes to skip at the start (tests misalignment)
     func     absent => library memset; 0 => memset0 (asm);
              nonzero => memset1 (C reference)  */
int main (int argc, char **argv)
{
  if (argc < 5)
    {
      printf ("usage: %s size repeat validate offset [func]\n", argv[0]);
      return 1;
    }
  int size = strtol (argv[1], NULL, 0);
  int repeat = strtol (argv[2], NULL, 0);
  int validate = strtol (argv[3], NULL, 0);
  int offset = strtol (argv[4], NULL, 0);
  /* A negative size, or offset > size, would make the adjusted size
     negative; converted to size_t in the call below that becomes a
     huge length and an out-of-bounds write.  Reject it up front.  */
  if (size < 0 || offset < 0 || offset > size)
    {
      fprintf (stderr, "%s: need 0 <= offset <= size\n", argv[0]);
      return 1;
    }
  void *(*func)(void *, int, size_t);
  func = argc > 5 ? (atoi (argv[5]) ? memset1 : memset0) : memset;
  unsigned char i = 0;
  int j;
  unsigned char *buf = malloc (size + 1);	/* +1 for the canary byte */
  if (buf == NULL)
    {
      fprintf (stderr, "%s: out of memory\n", argv[0]);
      return 1;
    }
  buf += offset;
  size -= offset;
  while (repeat--)
    {
      /* Canary one byte past the region: it holds the previous fill
	 value and must be untouched by the call.  */
      buf[size] = i;
      assert (func (buf, ++i, size) == buf);
      if (validate)
	{
	  for (j = 0; j < size; j++)
	    assert (buf[j] == i);
	  assert (buf[j] == ((i - 1) & 0xff));
	}
    }
  buf -= offset;
  free (buf);
  return 0;
}
$ ./foo
usage: ./foo size repeat validate offset [func]
$ for i in `seq 0 25` ; do
echo $i
for j in `seq 0 $i` ; do
./foo $i 1 1 $j 0
done
done
proves that memset works regardless of starting alignment or length, with a
limit large enough to trip the >=16 check in assembly and the unrolled loop in
software.
For some timing comparison:
$ time ./foo 1000000 10000 0 0
real 0m0.906s
user 0m0.921s
sys 0m0.015s
$ time ./foo 1000000 10000 0 1
real 0m5.078s
user 0m5.093s
sys 0m0.015s
# OUCH! unaligned memory falls back to bytewise writes, which is 5x slower!
$ time ./foo 1000000 10000 0 4
real 0m1.500s
user 0m1.515s
sys 0m0.015s
$ time ./foo 1000000 10000 0 8
real 0m0.922s
user 0m0.936s
sys 0m0.015s
# Weird! Even though the instruction stream is nominally looping in 4-byte
# steps, a 'rep stosl' loop with edi%8==4 is 50% slower than with edi%8==0!
# Chalk one up to modern x86 hardware pipelining under the hood.
$ time ./foo 1000000 10000 0 0 1
real 0m1.515s
user 0m1.530s
sys 0m0.015s
$ time ./foo 1000000 10000 0 1 1
real 0m1.516s
user 0m1.530s
sys 0m0.015s
$ time ./foo 1000000 10000 0 4 1
real 0m1.516s
user 0m1.530s
sys 0m0.015s
# My fixed C loop is the same speed as unpatched assembly on 4-byte alignment,
# wins hands-down on unaligned data, but loses on 8-byte alignment.
$ time ./foo 1000000 10000 0 0 0
real 0m0.921s
user 0m0.936s
sys 0m0.015s
$ time ./foo 1000000 10000 0 1 0
real 0m0.906s
user 0m0.921s
sys 0m0.015s
$ time ./foo 1000000 10000 0 4 0
real 0m0.906s
user 0m0.921s
sys 0m0.015s
# My patched assembly is no longer sensitive to alignment, and always gets
# the speed of 8-byte alignment.
# This clinches it - for memset, x86 assembly is noticeably faster than C.
Index: libc/string/memset.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/string/memset.c,v
retrieving revision 1.6
diff -u -p -r1.6 memset.c
--- libc/string/memset.c 27 Nov 2002 18:10:16 -0000 1.6
+++ libc/string/memset.c 22 May 2008 16:52:41 -0000
@@ -22,7 +22,7 @@ DESCRIPTION
pointed to by <[dst]> to the value.
RETURNS
- <<memset>> returns the value of <[m]>.
+ <<memset>> returns the value of <[dst]>.
PORTABILITY
<<memset>> is ANSI C.
@@ -39,48 +39,42 @@ QUICKREF
#define UNALIGNED(X) ((long)X & (LBLOCKSIZE - 1))
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
-_PTR
+_PTR
_DEFUN (memset, (m, c, n),
_PTR m _AND
int c _AND
size_t n)
{
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
char *s = (char *) m;
- while (n-- != 0)
- {
- *s++ = (char) c;
- }
-
- return m;
-#else
- char *s = (char *) m;
+#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
int i;
unsigned long buffer;
unsigned long *aligned_addr;
unsigned int d = c & 0xff; /* To avoid sign extension, copy C to an
unsigned variable. */
- if (!TOO_SMALL (n) && !UNALIGNED (m))
+ while (UNALIGNED (s))
{
- /* If we get this far, we know that n is large and m is word-aligned. */
- aligned_addr = (unsigned long*)m;
+ if (n--)
+ *s++ = (char) c;
+ else
+ return m;
+ }
+
+ if (!TOO_SMALL (n))
+ {
+ /* If we get this far, we know that n is large and s is word-aligned. */
+ aligned_addr = (unsigned long *) s;
/* Store D into each char sized location in BUFFER so that
we can set large blocks quickly. */
- if (LBLOCKSIZE == 4)
- {
- buffer = (d << 8) | d;
- buffer |= (buffer << 16);
- }
- else
- {
- buffer = 0;
- for (i = 0; i < LBLOCKSIZE; i++)
- buffer = (buffer << 8) | d;
- }
+ buffer = (d << 8) | d;
+ buffer |= (buffer << 16);
+ for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
+ buffer = (buffer << i) | buffer;
+ /* Unroll the loop. */
while (n >= LBLOCKSIZE*4)
{
*aligned_addr++ = buffer;
@@ -99,11 +93,10 @@ _DEFUN (memset, (m, c, n),
s = (char*)aligned_addr;
}
+#endif /* not PREFER_SIZE_OVER_SPEED */
+
while (n--)
- {
- *s++ = (char)d;
- }
+ *s++ = (char) c;
return m;
-#endif /* not PREFER_SIZE_OVER_SPEED */
}
Index: libc/machine/i386/memset.S
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/i386/memset.S,v
retrieving revision 1.3
diff -u -p -r1.3 memset.S
--- libc/machine/i386/memset.S 20 Dec 2002 21:31:19 -0000 1.3
+++ libc/machine/i386/memset.S 22 May 2008 16:52:41 -0000
@@ -1,6 +1,6 @@
/*
* ====================================================
- * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
+ * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
*
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
@@ -18,43 +18,83 @@ SYM (memset):
pushl ebp
movl esp,ebp
pushl edi
- pushl ebx
movl 8(ebp),edi
movl 12(ebp),eax
movl 16(ebp),ecx
cld
#ifndef __OPTIMIZE_SIZE__
- andl $255,eax
- movl ecx,ebx
- testl $3,edi
- jne .L19
+/* Less than 16 bytes won't benefit from the 'rep stosl' loop. */
cmpl $16,ecx
jbe .L19
-
- movl eax,edx
- sall $8,eax
- orl edx,eax
-
+ cbw
+ testl $7,edi
+ je .L10
+
+/* It turns out that 8-byte aligned 'rep stosl' outperforms
+ 4-byte aligned on some x86 platforms. */
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+
+/* At this point, ecx>8 and edi%8==0. */
+.L10:
+ movb al,ah
movl eax,edx
sall $16,edx
orl edx,eax
+ movl ecx,edx
shrl $2,ecx
- andl $3,ebx
+ andl $3,edx
rep
stosl
- movl ebx,ecx
+ movl edx,ecx
#endif /* not __OPTIMIZE_SIZE__ */
-
+
.L19:
rep
stosb
movl 8(ebp),eax
- leal -8(ebp),esp
- popl ebx
+ leal -4(ebp),esp
popl edi
leave
ret