This is the mail archive of the
newlib@sourceware.org
mailing list for the newlib project.
faster memset
- From: Eric Blake <ebb9 at byu dot net>
- To: newlib at sources dot redhat dot com
- Date: Thu, 22 May 2008 16:56:54 +0000 (UTC)
- Subject: faster memset
Next in the list of unoptimized string functions.
2008-05-22 Eric Blake <ebb9@byu.net>
Optimize the generic and x86 memset.
* libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]: Pre-align
pointer so unaligned stores aren't penalized.
* libc/machine/i386/memset.S (memset): [!__OPTIMIZE_SIZE__]:
Pre-align pointer so unaligned stores aren't penalized. Prefer
8-byte over 4-byte alignment. Reduce register pressure.
Here's my test program, with memset0 as my patched assembly, and memset as the
unpatched version, compiled at -O2 on cygwin:
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#define LBLOCKSIZE (sizeof (long))
#define UNALIGNED(X) ((long)X & (sizeof (long) - 1))
#define DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080)
#define DETECTCHAR(X,MASK) (DETECTNULL(X ^ MASK))
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
#undef memset
/* Reference C memset: byte-fill up to a word boundary, then store
   whole words (4 at a time while possible), then finish the tail
   bytewise.  Returns M, like memset.  */
void *
memset1 (void *m, int c, size_t n)
{
  char *dst = (char *) m;
#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
  unsigned long word;
  unsigned long *wp;
  int shift;
  unsigned int byte = c & 0xff;	/* Strip any sign extension of C.  */

  /* Byte-fill until DST sits on a word boundary; if N runs out
     first, we are already done.  */
  while (UNALIGNED (dst))
    {
      if (n == 0)
	return m;
      *dst++ = (char) c;
      n--;
    }

  if (!TOO_SMALL (n))
    {
      /* DST is word-aligned here and at least one full word remains.  */
      wp = (unsigned long *) dst;

      /* Replicate BYTE into every byte of WORD by repeated doubling;
	 the loop only runs when long is wider than 32 bits.  */
      word = (byte << 8) | byte;
      word |= word << 16;
      for (shift = 32; shift < LBLOCKSIZE * 8; shift <<= 1)
	word |= word << shift;

      /* Unrolled: four word stores per iteration.  */
      for (; n >= 4 * LBLOCKSIZE; n -= 4 * LBLOCKSIZE)
	{
	  wp[0] = word;
	  wp[1] = word;
	  wp[2] = word;
	  wp[3] = word;
	  wp += 4;
	}

      /* Remaining whole words, one at a time.  */
      for (; n >= LBLOCKSIZE; n -= LBLOCKSIZE)
	*wp++ = word;

      dst = (char *) wp;
    }
#endif /* not PREFER_SIZE_OVER_SPEED */

  /* Trailing bytes (or the whole job when optimizing for size).  */
  while (n--)
    *dst++ = (char) c;
  return m;
}
void *memset0 (void *m, int c, size_t n);

/* Benchmark/validation driver for memset implementations.
   usage: prog size repeat validate offset [func]
     size     total buffer length in bytes
     repeat   number of memset calls to time
     validate nonzero => verify every byte after each call
     offset   bytes to skip at the start (tests misalignment)
     func     absent => library memset; 0 => memset0 (asm);
              nonzero => memset1 (C reference)  */
int main (int argc, char **argv)
{
  if (argc < 5)
    {
      printf ("usage: %s size repeat validate offset [func]\n", argv[0]);
      return 1;
    }
  int size = strtol (argv[1], NULL, 0);
  int repeat = strtol (argv[2], NULL, 0);
  int validate = strtol (argv[3], NULL, 0);
  int offset = strtol (argv[4], NULL, 0);
  /* A negative size, or offset > size, would make the adjusted size
     negative; converted to size_t in the call below that becomes a
     huge length and an out-of-bounds write.  Reject it up front.  */
  if (size < 0 || offset < 0 || offset > size)
    {
      fprintf (stderr, "%s: need 0 <= offset <= size\n", argv[0]);
      return 1;
    }
  void *(*func)(void *, int, size_t);
  func = argc > 5 ? (atoi (argv[5]) ? memset1 : memset0) : memset;
  unsigned char i = 0;
  int j;
  unsigned char *buf = malloc (size + 1);	/* +1 for the canary byte */
  if (buf == NULL)
    {
      fprintf (stderr, "%s: out of memory\n", argv[0]);
      return 1;
    }
  buf += offset;
  size -= offset;
  while (repeat--)
    {
      /* Canary one byte past the region: it holds the previous fill
	 value and must be untouched by the call.  */
      buf[size] = i;
      assert (func (buf, ++i, size) == buf);
      if (validate)
	{
	  for (j = 0; j < size; j++)
	    assert (buf[j] == i);
	  assert (buf[j] == ((i - 1) & 0xff));
	}
    }
  buf -= offset;
  free (buf);
  return 0;
}
$ ./foo
usage: ./foo size repeat validate offset [func]
$ for i in `seq 0 25` ; do
echo $i
for j in `seq 0 $i` ; do
./foo $i 1 1 $j 0
done
done
proves that memset works regardless of starting alignment or length, with a
limit large enough to trip the >=16 check in assembly and the unrolled loop in
software.
For some timing comparison:
$ time ./foo 1000000 10000 0 0
real 0m0.906s
user 0m0.921s
sys 0m0.015s
$ time ./foo 1000000 10000 0 1
real 0m5.078s
user 0m5.093s
sys 0m0.015s
# OUCH! unaligned memory falls back to bytewise writes, which is 5x slower!
$ time ./foo 1000000 10000 0 4
real 0m1.500s
user 0m1.515s
sys 0m0.015s
$ time ./foo 1000000 10000 0 8
real 0m0.922s
user 0m0.936s
sys 0m0.015s
# Weird! Even though the instruction stream is nominally looping in 4-byte
# steps, a 'rep stosl' loop with edi%8==4 is 50% slower than with edi%8==0!
# Chalk one up to modern x86 hardware pipelining under the hood.
$ time ./foo 1000000 10000 0 0 1
real 0m1.515s
user 0m1.530s
sys 0m0.015s
$ time ./foo 1000000 10000 0 1 1
real 0m1.516s
user 0m1.530s
sys 0m0.015s
$ time ./foo 1000000 10000 0 4 1
real 0m1.516s
user 0m1.530s
sys 0m0.015s
# My fixed C loop is the same speed as unpatched assembly on 4-byte alignment,
# wins hands-down on unaligned data, but loses on 8-byte alignment.
$ time ./foo 1000000 10000 0 0 0
real 0m0.921s
user 0m0.936s
sys 0m0.015s
$ time ./foo 1000000 10000 0 1 0
real 0m0.906s
user 0m0.921s
sys 0m0.015s
$ time ./foo 1000000 10000 0 4 0
real 0m0.906s
user 0m0.921s
sys 0m0.015s
# My patched assembly is no longer sensitive to alignment, and always gets
# the speed of 8-byte alignment.
# This clinches it - for memset, x86 assembly is noticeably faster than C.
Index: libc/string/memset.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/string/memset.c,v
retrieving revision 1.6
diff -u -p -r1.6 memset.c
--- libc/string/memset.c 27 Nov 2002 18:10:16 -0000 1.6
+++ libc/string/memset.c 22 May 2008 16:52:41 -0000
@@ -22,7 +22,7 @@ DESCRIPTION
pointed to by <[dst]> to the value.
RETURNS
- <<memset>> returns the value of <[m]>.
+ <<memset>> returns the value of <[dst]>.
PORTABILITY
<<memset>> is ANSI C.
@@ -39,48 +39,42 @@ QUICKREF
#define UNALIGNED(X) ((long)X & (LBLOCKSIZE - 1))
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
-_PTR
+_PTR
_DEFUN (memset, (m, c, n),
_PTR m _AND
int c _AND
size_t n)
{
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
char *s = (char *) m;
- while (n-- != 0)
- {
- *s++ = (char) c;
- }
-
- return m;
-#else
- char *s = (char *) m;
+#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
int i;
unsigned long buffer;
unsigned long *aligned_addr;
unsigned int d = c & 0xff; /* To avoid sign extension, copy C to an
unsigned variable. */
- if (!TOO_SMALL (n) && !UNALIGNED (m))
+ while (UNALIGNED (s))
{
- /* If we get this far, we know that n is large and m is word-aligned. */
- aligned_addr = (unsigned long*)m;
+ if (n--)
+ *s++ = (char) c;
+ else
+ return m;
+ }
+
+ if (!TOO_SMALL (n))
+ {
+ /* If we get this far, we know that n is large and s is word-aligned. */
+ aligned_addr = (unsigned long *) s;
/* Store D into each char sized location in BUFFER so that
we can set large blocks quickly. */
- if (LBLOCKSIZE == 4)
- {
- buffer = (d << 8) | d;
- buffer |= (buffer << 16);
- }
- else
- {
- buffer = 0;
- for (i = 0; i < LBLOCKSIZE; i++)
- buffer = (buffer << 8) | d;
- }
+ buffer = (d << 8) | d;
+ buffer |= (buffer << 16);
+ for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
+ buffer = (buffer << i) | buffer;
+ /* Unroll the loop. */
while (n >= LBLOCKSIZE*4)
{
*aligned_addr++ = buffer;
@@ -99,11 +93,10 @@ _DEFUN (memset, (m, c, n),
s = (char*)aligned_addr;
}
+#endif /* not PREFER_SIZE_OVER_SPEED */
+
while (n--)
- {
- *s++ = (char)d;
- }
+ *s++ = (char) c;
return m;
-#endif /* not PREFER_SIZE_OVER_SPEED */
}
Index: libc/machine/i386/memset.S
===================================================================
RCS file: /cvs/src/src/newlib/libc/machine/i386/memset.S,v
retrieving revision 1.3
diff -u -p -r1.3 memset.S
--- libc/machine/i386/memset.S 20 Dec 2002 21:31:19 -0000 1.3
+++ libc/machine/i386/memset.S 22 May 2008 16:52:41 -0000
@@ -1,6 +1,6 @@
/*
* ====================================================
- * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
+ * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
*
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
@@ -18,43 +18,83 @@ SYM (memset):
pushl ebp
movl esp,ebp
pushl edi
- pushl ebx
movl 8(ebp),edi
movl 12(ebp),eax
movl 16(ebp),ecx
cld
#ifndef __OPTIMIZE_SIZE__
- andl $255,eax
- movl ecx,ebx
- testl $3,edi
- jne .L19
+/* Less than 16 bytes won't benefit from the 'rep stosl' loop. */
cmpl $16,ecx
jbe .L19
-
- movl eax,edx
- sall $8,eax
- orl edx,eax
-
+ cbw
+ testl $7,edi
+ je .L10
+
+/* It turns out that 8-byte aligned 'rep stosl' outperforms
+ 4-byte aligned on some x86 platforms. */
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+
+/* At this point, ecx>8 and edi%8==0. */
+.L10:
+ movb al,ah
movl eax,edx
sall $16,edx
orl edx,eax
+ movl ecx,edx
shrl $2,ecx
- andl $3,ebx
+ andl $3,edx
rep
stosl
- movl ebx,ecx
+ movl edx,ecx
#endif /* not __OPTIMIZE_SIZE__ */
-
+
.L19:
rep
stosb
movl 8(ebp),eax
- leal -8(ebp),esp
- popl ebx
+ leal -4(ebp),esp
popl edi
leave
ret