This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH 5/*] Generic string function optimization: strcmp and strncmp
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Thu, 28 May 2015 17:23:31 +0200
- Subject: [PATCH 5/*] Generic string function optimization: strcmp and strncmp
- Authentication-results: sourceware.org; auth=none
- References: <20150527060121 dot GA19105 at domone>
This adds a platform-generic optimization of strcmp and strncmp.
I adapted an x86-64 skeleton from strcmp. This time I correctly added the
end checks, which I didn't do before, so I will also send an optimized strncmp.
* string/diff_skeleton.h: New file.
* string/strcmp.c: Use diff skeleton.
* string/strncmp.c: Likewise.
diff --git a/string/diff_skeleton.h b/string/diff_skeleton.h
new file mode 100644
index 0000000..a2cdccc
--- /dev/null
+++ b/string/diff_skeleton.h
@@ -0,0 +1,181 @@
+/* Skeleton of *cmp string functions.
+ Copyright (C) 1991-2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <string.h>
+#include <libc-internal.h>
+#include <stdint.h>
+
+#ifndef BOUND
+# define BOUND(x) 0
+#endif
+
+
+/* Check one LSIZE-byte chunk at S1/S2.  EXPRESSION (supplied by the
+   including file) yields a nonzero mask when the chunks differ or
+   contain a terminating null byte.  On a hit, store the address of the
+   first such byte into *R1/*R2 and return 1; otherwise return 0.
+   NOTE(review): first_nonzero_byte presumably comes from
+   string/common.h — confirm.  */
+static __always_inline
+int
+found_in_long_bytes(char *s1, char *s2, char **r1, char **r2)
+{
+ const unsigned long int *lptr1 = (const unsigned long int *) s1;
+ const unsigned long int *lptr2 = (const unsigned long int *) s2;
+
+ unsigned long int mask = EXPRESSION(*lptr1, *lptr2);
+ if (mask)
+ {
+ size_t found = first_nonzero_byte (mask);
+ *r1 = s1 + found;
+ *r2 = s2 + found;
+ return 1;
+ }
+ else
+ return 0;
+}
+
+/* Core of the *cmp functions: scan S1/S2 (= *P1/*P2) in lockstep until
+   the first differing byte, a null terminator, or (for strncmp) the END
+   bound.  Returns 1 with *P1/*P2 pointing at the byte to compare, or 0
+   when the caller should report equality.  The including file supplies
+   EXPRESSION, byte_loop and optionally BOUND; CROSS_PAGE, LSIZE and
+   PTR_ALIGN_DOWN presumably come from string/common.h — confirm.  */
+static __always_inline
+int
+diff_skeleton (char **p1, char **p2, char *end)
+{
+ unsigned long int mask;
+ const unsigned long int *lptr1, *lptr2;
+ char *s1 = *p1, *s2 = *p2;
+
+
+#if _STRING_ARCH_unaligned == 0
+ /* We don't optimize for architectures without unaligned loads yet.
+    The problem is that the header is hot and needs different tricks
+    with aligned loads.  The loop, however, would be relatively easy,
+    emulating unaligned loads with a mix of byte shifts.  */
+
+ size_t i = byte_loop(s1, s2, SIZE_MAX, end);
+ *p1 = s1 + i;
+ *p2 = s2 + i;
+ return i != SIZE_MAX;
+#endif
+
+ /* We fetch 32 bytes while not crossing a page boundary.
+    Most strings in practice are of that size and we avoid a loop.
+    This performs best in practice; the alternative below uses aligned
+    loads but is slower when a string starts just a few
+    bytes before a 32-byte boundary.  The tradeoff is that we could
+    rarely fetch an extra cache line without needing it, but this
+    optimization does pay for that.  */
+ if (!CROSS_PAGE (s1, 32) && !CROSS_PAGE (s2, 32))
+ {
+ if (found_in_long_bytes (s1 + 0 * LSIZE, s2 + 0 * LSIZE, p1, p2))
+ return 1;
+ if (found_in_long_bytes (s1 + 1 * LSIZE, s2 + 1 * LSIZE, p1, p2))
+ return 1;
+ if (found_in_long_bytes (s1 + 2 * LSIZE, s2 + 2 * LSIZE, p1, p2))
+ return 1;
+ if (found_in_long_bytes (s1 + 3 * LSIZE, s2 + 3 * LSIZE, p1, p2))
+ return 1;
+ /* With 4-byte longs eight loads are needed to cover 32 bytes.  */
+ if (sizeof (unsigned long int) == 4)
+ {
+ if (found_in_long_bytes (s1 + 4 * LSIZE, s2 + 4 * LSIZE, p1, p2))
+ return 1;
+ if (found_in_long_bytes (s1 + 5 * LSIZE, s2 + 5 * LSIZE, p1, p2))
+ return 1;
+ if (found_in_long_bytes (s1 + 6 * LSIZE, s2 + 6 * LSIZE, p1, p2))
+ return 1;
+ if (found_in_long_bytes (s1 + 7 * LSIZE, s2 + 7 * LSIZE, p1, p2))
+ return 1;
+ }
+
+ /* For strcmp the default BOUND is always false and this never
+    triggers; for strncmp it stops when N lies within the header.  */
+ if (BOUND (s1 + 32))
+ return 0;
+ }
+ else
+ {
+ /* Near a page end: fall back to a bounded byte loop for the header.  */
+ size_t i = byte_loop(s1, s2, 32, end);
+
+ if (i==SIZE_MAX)
+ return 0;
+
+ if (i < 32)
+ {
+ *p1 = s1 + i;
+ *p2 = s2 + i;
+ return 1;
+ }
+ }
+ /* Now we have read enough bytes to start an aligned loop; aligning
+    S1 down cannot skip bytes because the header covered s1..s1+31.  */
+
+ char *s1_loop = PTR_ALIGN_DOWN (s1, 4 * LSIZE);
+ char *s2_loop = s2 - (s1 - s1_loop);
+ /* Blocks left before the unaligned S2 reads would cross into the next
+    page.  NOTE(review): assumes a 4096-byte page size — confirm this
+    matches the target (other code uses a GLRO page-size variable).  */
+ int until_cross_page = (4096 - (((uintptr_t) (s2_loop + 4 * LSIZE)) % 4096))\
+ / (4 * LSIZE);
+
+#ifdef COALIGN_HELP
+ /* When S2 is co-aligned no page re-check is ever needed; 1000000
+    effectively disables it.  NOTE(review): the literal 32 here should
+    presumably be 4 * LSIZE, and 1000000 is a magic constant — confirm.  */
+ if (((uintptr_t)s2_loop)%32==0)
+ until_cross_page = 1000000;
+#endif
+ while (!BOUND (s1_loop + 4 * LSIZE))
+ {
+ s1_loop += 4 * LSIZE;
+ s2_loop += 4 * LSIZE;
+
+
+ /* About to cross a page with S2: redo this block with loads aligned
+    down by SHIFT so we never touch the next S2 page prematurely.  */
+ if (until_cross_page == 0)
+ {
+ uintptr_t shift = ((uintptr_t) s2_loop) % (4 * LSIZE);
+ if (found_in_long_bytes (s1_loop - shift + 0 * LSIZE,
+ s2_loop - shift + 0 * LSIZE, p1, p2))
+ return 1;
+ if (found_in_long_bytes (s1_loop - shift + 1 * LSIZE,
+ s2_loop - shift + 1 * LSIZE, p1, p2))
+ return 1;
+ if (found_in_long_bytes (s1_loop - shift + 2 * LSIZE,
+ s2_loop - shift + 2 * LSIZE, p1, p2))
+ return 1;
+ if (found_in_long_bytes (s1_loop - shift + 3 * LSIZE,
+ s2_loop - shift + 3 * LSIZE, p1, p2))
+ return 1;
+
+ until_cross_page = 4096 / (4 * LSIZE);
+ if (BOUND (s1_loop + 4 * LSIZE - shift))
+ return 0;
+ }
+
+ until_cross_page--;
+
+ /* OR the four chunk masks together; locate the exact byte only on a
+    hit, keeping the common path branch-light.  */
+ lptr1 = (const unsigned long int *) (s1_loop + 0 * LSIZE);
+ lptr2 = (const unsigned long int *) (s2_loop + 0 * LSIZE);
+ mask = EXPRESSION (*lptr1, *lptr2);
+ lptr1 = (const unsigned long int *) (s1_loop + 1 * LSIZE);
+ lptr2 = (const unsigned long int *) (s2_loop + 1 * LSIZE);
+ mask |= EXPRESSION (*lptr1, *lptr2);
+ lptr1 = (const unsigned long int *) (s1_loop + 2 * LSIZE);
+ lptr2 = (const unsigned long int *) (s2_loop + 2 * LSIZE);
+ mask |= EXPRESSION (*lptr1, *lptr2);
+ lptr1 = (const unsigned long int *) (s1_loop + 3 * LSIZE);
+ lptr2 = (const unsigned long int *) (s2_loop + 3 * LSIZE);
+ mask |= EXPRESSION (*lptr1, *lptr2);
+
+ if (mask)
+ {
+ if (found_in_long_bytes (s1_loop + 0 * LSIZE, s2_loop + 0 * LSIZE, p1, p2))
+ return 1;
+ if (found_in_long_bytes (s1_loop + 1 * LSIZE, s2_loop + 1 * LSIZE, p1, p2))
+ return 1;
+ if (found_in_long_bytes (s1_loop + 2 * LSIZE, s2_loop + 2 * LSIZE, p1, p2))
+ return 1;
+
+ /* If the first three chunks were clean, the hit must be in the
+    fourth; its return value is deliberately ignored.  */
+ found_in_long_bytes (s1_loop + 3 * LSIZE, s2_loop + 3 * LSIZE, p1, p2);
+ return 1;
+ }
+ }
+ return 0;
+}
diff --git a/string/strcmp.c b/string/strcmp.c
index 4d4c044..c1d4c22 100644
--- a/string/strcmp.c
+++ b/string/strcmp.c
@@ -16,28 +16,37 @@
<http://www.gnu.org/licenses/>. */
#include <string.h>
-
#undef strcmp
/* Compare S1 and S2, returning less than, equal to or
greater than zero if S1 is lexicographically less than,
equal to or greater than S2. */
+
+#include "string/common.h"
+#define EXPRESSION(c1, c2) (c1 ^ c2) | contains_zero (c2)
+
+/* Fallback byte-by-byte scan: return the index of the first differing
+   or null byte, or N if none occurs within N bytes.  END is unused —
+   strcmp has no length bound; the parameter exists only so the shared
+   skeleton can call strcmp's and strncmp's byte_loop uniformly.  */
+static __always_inline
+size_t
+byte_loop(char *x, char *y, size_t n, char *end)
+{
+ size_t i;
+ for (i = 0; i < n; i++)
+ if (x[i] == '\0' || x[i] != y[i])
+ return i;
+
+ return n;
+}
+
+#include "string/diff_skeleton.h"
+
int
-strcmp (const char *p1, const char *p2)
+strcmp (const char *s1_start, const char *s2_start)
{
- const unsigned char *s1 = (const unsigned char *) p1;
- const unsigned char *s2 = (const unsigned char *) p2;
- unsigned char c1, c2;
-
- do
- {
- c1 = (unsigned char) *s1++;
- c2 = (unsigned char) *s2++;
- if (c1 == '\0')
- return c1 - c2;
- }
- while (c1 == c2);
-
- return c1 - c2;
+ unsigned char *p1 = (unsigned char *) s1_start;
+ unsigned char *p2 = (unsigned char *) s2_start;
+
+ /* With end == NULL the default BOUND never stops the skeleton, so it
+    always leaves p1/p2 at the first differing or null byte.  */
+ diff_skeleton ((char **) &p1, (char **) &p2, NULL);
+
+ /* Compare as unsigned char, per ISO C strcmp semantics.  */
+ return *p1 - *p2;
}
libc_hidden_builtin_def (strcmp)
diff --git a/string/strncmp.c b/string/strncmp.c
index 2a1137a..4c4e9f7 100644
--- a/string/strncmp.c
+++ b/string/strncmp.c
@@ -16,59 +16,55 @@
<http://www.gnu.org/licenses/>. */
#include <string.h>
-#include <memcopy.h>
#undef strncmp
-#ifndef STRNCMP
-#define STRNCMP strncmp
-#endif
+/* Compare S1 and S2, returning less than, equal to or
+ greater than zero if S1 is lexicographically less than,
+ equal to or greater than S2. */
-/* Compare no more than N characters of S1 and S2,
- returning less than, equal to or greater than zero
- if S1 is lexicographically less than, equal to or
- greater than S2. */
-int
-STRNCMP (const char *s1, const char *s2, size_t n)
-{
- unsigned char c1 = '\0';
- unsigned char c2 = '\0';
+#include "string/common.h"
+#define EXPRESSION(c1, c2) (c1 ^ c2) | contains_zero (c2)
+#define BOUND(p) ((uintptr_t) p >= (uintptr_t) end)
- if (n >= 4)
+/* Bounded byte-by-byte scan: return SIZE_MAX when END is reached
+   first (N exhausted), otherwise the index of the first differing or
+   null byte, or C if none occurs within C bytes.  The END check
+   precedes the dereference, so no byte past END is read here.  */
+static __always_inline
+size_t
+byte_loop(char *x, char *y, size_t c, char *end)
+{
+ size_t i;
+ for (i = 0; i < c; i++)
 {
- size_t n4 = n >> 2;
- do
- {
- c1 = (unsigned char) *s1++;
- c2 = (unsigned char) *s2++;
- if (c1 == '\0' || c1 != c2)
- return c1 - c2;
- c1 = (unsigned char) *s1++;
- c2 = (unsigned char) *s2++;
- if (c1 == '\0' || c1 != c2)
- return c1 - c2;
- c1 = (unsigned char) *s1++;
- c2 = (unsigned char) *s2++;
- if (c1 == '\0' || c1 != c2)
- return c1 - c2;
- c1 = (unsigned char) *s1++;
- c2 = (unsigned char) *s2++;
- if (c1 == '\0' || c1 != c2)
- return c1 - c2;
- } while (--n4 > 0);
- n &= 3;
+ if (x + i == end)
+ return SIZE_MAX;
+ if (x[i] == '\0' || x[i] != y[i])
+ return i;
 }
- while (n > 0)
+ return c;
+}
+
+#include "string/diff_skeleton.h"
+
+int
+strncmp (const char *s1_start, const char *s2_start, size_t n)
+{
+ unsigned char *p1 = (unsigned char *) s1_start;
+ unsigned char *p2 = (unsigned char *) s2_start;
+ /* END is the exclusive bound s1 + n; if the addition wraps, clamp to
+    UINTPTR_MAX so a huge N behaves as "unbounded" (n == 0 aside).  */
+ char *end = (char *) (((uintptr_t) p1) + n);
+ if ((uintptr_t) end <= (uintptr_t) p1)
 {
- c1 = (unsigned char) *s1++;
- c2 = (unsigned char) *s2++;
- if (c1 == '\0' || c1 != c2)
- return c1 - c2;
- n--;
+ if (n == 0)
+ return 0;
+ end = (char *) UINTPTR_MAX;
 }
- return c1 - c2;
-}
+ if (!diff_skeleton ((char **) &p1, (char **) &p2, end))
+ return 0;
-libc_hidden_builtin_def (STRNCMP)
+ /* The skeleton may report a hit located at or past END (it reads in
+    LSIZE chunks); such a byte is outside the first N and the strings
+    compare equal.  NOTE(review): the BOUND macro does not parenthesize
+    its argument — harmless for these uses, but worth fixing.  */
+ if (BOUND (p1))
+ return 0;
+
+
+ return *p1 - *p2;
+}
+libc_hidden_builtin_def (strncmp)