This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH v3] Improve fnmatch performance.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: Paul Eggert <eggert at cs dot ucla dot edu>
- Cc: libc-alpha at sourceware dot org
- Date: Tue, 9 Jun 2015 16:27:48 +0200
- Subject: [PATCH v3] Improve fnmatch performance.
- Authentication-results: sourceware.org; auth=none
- References: <20150512235339 dot GA27716 at domone> <5553805B dot 3070304 at cs dot ucla dot edu>
On Wed, May 13, 2015 at 09:48:27AM -0700, Paul Eggert wrote:
> Ondřej Bílka wrote:
> >How to synchronize this with gnulib? Only implementation specific detail
> >is utf8 detection.
>
> It could be something like this:
>
> #if _LIBC
> struct __locale_data *current = _NL_CURRENT_LOCALE->__locales[LC_COLLATE];
> uint_fast32_t encoding =
> current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
> bool is_utf8 = encoding == !__cet_other;
> #else
> bool is_utf8 = STRCASEEQ (locale_charset (),
> "UTF-8", 'U','T','F','-','8',0,0,0,0)
> #endif
>
> We should package this sort of thing up and make it easier to use,
> but that could be another day.
Yes, I will use that pattern; it needs a few details changed so that it
also works for single-byte encodings.
I also removed the __builtin_expect on MB_CUR_MAX, as Unicode is widespread.
Also, I now return a match directly when the entire pattern is normal and
neither FNM_PERIOD nor FNM_FILE_NAME is set, which could also help
performance a bit.
Then I could allow non-ASCII characters at the start of the pattern unless
the encoding is UTF-8 and FNM_CASEFOLD is set; would it be better to add two
tables or to add a check that tests for these cases?
* posix/fnmatch.c (fnmatch): Improve performance.
diff --git a/posix/fnmatch.c b/posix/fnmatch.c
index fd85efa..4c32992 100644
--- a/posix/fnmatch.c
+++ b/posix/fnmatch.c
@@ -131,6 +131,13 @@ extern int fnmatch (const char *pattern, const char *string, int flags);
# define ISWCTYPE(WC, WT) iswctype (WC, WT)
# endif
+# ifdef _LIBC
+# define STRCASESTR __strcasestr
+# else
+# define STRCASESTR strcasestr
+# endif
+
+
# if (HAVE_MBSTATE_T && HAVE_MBSRTOWCS) || _LIBC
/* In this case we are implementing the multibyte character handling. */
# define HANDLE_MULTIBYTE 1
@@ -332,8 +339,62 @@ fnmatch (pattern, string, flags)
const char *string;
int flags;
{
+
+ /* ASCII with \+/.*?[{(@! excluded. */
+ static unsigned char normal[256] = {
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
+# if _LIBC
+ struct __locale_data *current = _NL_CURRENT_LOCALE->__locales[LC_COLLATE];
+ uint_fast32_t encoding =
+ current->values[_NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE)].word;
+ bool fast_encoding = (encoding != __cet_other);
+# else
+# if HANDLE_MULTIBYTE
+ bool is_utf8 = STRCASEEQ (locale_charset (),
+ "UTF-8", 'U','T','F','-','8',0,0,0,0);
+ bool fast_encoding = (MB_CUR_MAX == 1) || is_utf;
+# else
+ bool fast_encoding = true;
+# endif
+# endif
+
+ if (fast_encoding)
+ {
+ char start[8];
+ char *string2;
+ size_t i;
+ for (i = 0; i < 7 && normal[(unsigned char) pattern[i]]; i++)
+ start[i] = pattern[i];
+ start[i] = 0;
+ if (flags & FNM_CASEFOLD)
+ string2 = STRCASESTR (string, start);
+ else
+ string2 = strstr (string, start);
+ if (!string2)
+ return FNM_NOMATCH;
+
+ if (pattern[i] == '\0' && (flags & (FNM_FILE_NAME | FNM_PERIOD)) == 0)
+ return 0;
+ }
+
# if HANDLE_MULTIBYTE
- if (__builtin_expect (MB_CUR_MAX, 1) != 1)
+ if (MB_CUR_MAX != 1)
{
mbstate_t ps;
size_t n;