This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
[RFC] improve generic strlen.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Cc: Wilco <wdijkstr at arm dot com>, Richard Earnshaw <rearnsha at arm dot com>
- Date: Tue, 23 Dec 2014 19:57:14 +0100
- Subject: [RFC] improve generic strlen.
Hi
As I promised earlier, here is a working version of the improved strlen.
For long strings on x64 it is twice as fast as the current C version, or
50% slower when there is a 1/128 probability of a byte with value 128.
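The byte 128 matters because the main loop below uses a relaxed zero-byte
check, x ^ (x - ONES), which also fires on bytes with value 0x80 and then
forces the exact recheck.  A minimal standalone illustration of the two
checks, not part of the patch itself:

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint64_t ones = ((uint64_t) ~0) / 255;             /* 0x0101010101010101 */
  uint64_t high = 128 * ones;                         /* 0x8080808080808080 */
  uint64_t has_zero = 0x1122334455660077ULL;          /* contains a 0x00 byte */
  uint64_t has_128 = 0x1122334455668077ULL;           /* contains 0x80, no 0x00 */

  /* Exact check: fires only when a zero byte is present.  */
  printf ("%d %d\n", ((has_zero - ones) & ~has_zero & high) != 0,
          ((has_128 - ones) & ~has_128 & high) != 0);          /* prints 1 0 */
  /* Relaxed check: also fires on the 0x80 byte, hence the recheck.  */
  printf ("%d %d\n", ((has_zero ^ (has_zero - ones)) & high) != 0,
          ((has_128 ^ (has_128 - ones)) & high) != 0);         /* prints 1 1 */
  return 0;
}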
For simplicity it assumes unaligned loads are available.  For aligned-only
targets I would need to add a define to mask the first bytes.
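One possible shape of that masking, as a sketch only (the helper name and
the little-endian assumption are not part of the patch; it reuses the
vec_t/VEC_SIZE/LOAD macros from the code below): do the first load aligned
and force the bytes before the start of the string to 0xff so they can
never look like a terminator.

static inline vec_t
load_first_vec (const char *s)
{
  const char *aligned = (const char *) (((uintptr_t) s) & ~(VEC_SIZE - 1));
  vec_t v = LOAD (aligned);
  unsigned int skip = (uintptr_t) s % VEC_SIZE;   /* bytes before S */
  /* Set the SKIP lowest-addressed bytes to 0xff; 0xff can never be
     mistaken for a terminator and never produces a borrow in the
     exact zero-byte check.  */
  v |= ~(((vec_t) ~0) << (8 * skip));
  return v;
}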
Could you try this on ARM or another arch to see what improvement is
possible?
Also, while the header is the most important part, it needs
platform-specific optimizations, so if this is an improvement the next
step is to take the generated assembly and fix gcc's mistakes.
#include <string.h>
#undef strlen
#include <stdint.h>

typedef uint64_t vec_t;
#define VEC_SIZE sizeof (vec_t)
#define ITER_SIZE (4 * VEC_SIZE)

#define LOAD(x) (*((vec_t *) (x)))

/* 0x0101...01 and 0x8080...80 patterns used by the zero-byte test.  */
#define ONES (((vec_t) ~0) / 255)
#define HIGH_BITS (128 * ONES)

size_t
strlen (const char *s)
{
  vec_t m0, m1, m2, m3;
  vec_t s0, s1, s2, s3;
  vec_t m;
  const char *s_start = s;

  /* If reading ITER_SIZE bytes from S cannot cross a page boundary,
     use unaligned vector loads; otherwise fall back to byte checks.  */
  if (((uintptr_t) s) % 4096 < 4096 - ITER_SIZE)
    {
      s0 = LOAD (s);
      /* Nonzero iff S0 contains a zero byte.  */
      m0 = (s0 - ONES) & (~s0) & HIGH_BITS;
      if (m0)
        /* FFSL locates the first zero byte; assumes little-endian.  */
        return s - s_start - 1 + ffsl (m0) / 8;
      s1 = LOAD (s + VEC_SIZE);
      m1 = (s1 - ONES) & (~s1) & HIGH_BITS;
      if (m1)
        return s - s_start + VEC_SIZE - 1 + ffsl (m1) / 8;
      s2 = LOAD (s + 2 * VEC_SIZE);
      m2 = (s2 - ONES) & (~s2) & HIGH_BITS;
      if (m2)
        return s - s_start + 2 * VEC_SIZE - 1 + ffsl (m2) / 8;
      s3 = LOAD (s + 3 * VEC_SIZE);
      m3 = (s3 - ONES) & (~s3) & HIGH_BITS;
      if (m3)
        return s - s_start + 3 * VEC_SIZE - 1 + ffsl (m3) / 8;
    }
  else
    {
      int i;
      for (i = 0; i < ITER_SIZE; i++)
        if (!s[i])
          return i;
    }

  /* Continue with ITER_SIZE-aligned blocks; rechecking a few already
     checked bytes is harmless.  */
  s = (const char *) (((uintptr_t) s) & (~(ITER_SIZE - 1)));
  while (1)
    {
      s += ITER_SIZE;
      s0 = LOAD (s);
      s1 = LOAD (s + VEC_SIZE);
      s2 = LOAD (s + 2 * VEC_SIZE);
      s3 = LOAD (s + 3 * VEC_SIZE);
      /* Relaxed zero-byte check: may also fire on bytes with value 128;
         those false positives are filtered by the exact check below.  */
      m0 = s0 ^ (s0 - ONES);
      m1 = s1 ^ (s1 - ONES);
      m2 = s2 ^ (s2 - ONES);
      m3 = s3 ^ (s3 - ONES);
      m = (m0 | m1 | m2 | m3) & HIGH_BITS;
      if (m)
        {
          /* Here gcc messes up the loop, as it spills registers to save
             an intermediate value, easily wasting 99 cycles in 99
             iterations just to save a few cycles in the last one.  */
          asm volatile ("" : : : "memory");
          s0 = LOAD (s);
          m0 = (s0 - ONES) & (~s0) & HIGH_BITS;
          if (m0)
            return s - s_start - 1 + ffsl (m0) / 8;
          s1 = LOAD (s + VEC_SIZE);
          m1 = (s1 - ONES) & (~s1) & HIGH_BITS;
          if (m1)
            return s - s_start + VEC_SIZE - 1 + ffsl (m1) / 8;
          s2 = LOAD (s + 2 * VEC_SIZE);
          m2 = (s2 - ONES) & (~s2) & HIGH_BITS;
          if (m2)
            return s - s_start + 2 * VEC_SIZE - 1 + ffsl (m2) / 8;
          s3 = LOAD (s + 3 * VEC_SIZE);
          m3 = (s3 - ONES) & (~s3) & HIGH_BITS;
          if (m3)
            return s - s_start + 3 * VEC_SIZE - 1 + ffsl (m3) / 8;
        }
    }
}
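A rough way to sanity-check the routine, as a sketch only, compiled into
the same file as the code above (build with -fno-builtin so gcc does not
fold the strlen calls): compare against a trivial reference for many
lengths and alignments, including bytes with value 128 to exercise the
relaxed check in the main loop.

#include <assert.h>
#include <stdio.h>

static size_t
strlen_ref (const char *s)
{
  size_t i = 0;
  while (s[i] != 0)
    i++;
  return i;
}

int
main (void)
{
  static char buf[512];
  memset (buf, 'x', sizeof buf);
  for (size_t align = 0; align < 2 * ITER_SIZE; align++)
    for (size_t len = 0; len < 256; len++)
      {
        char *p = buf + align;
        for (size_t i = 0; i < len; i++)
          p[i] = (i % 2) ? 'a' : (char) 128;
        p[len] = 0;
        /* These calls must reach the strlen defined above.  */
        assert (strlen (p) == strlen_ref (p));
      }
  puts ("ok");
  return 0;
}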