This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] Speed up fnmatch.
- From: OndÅej BÃlka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Fri, 25 Oct 2013 21:20:46 +0200
- Subject: Re: [PATCH] Speed up fnmatch.
- Authentication-results: sourceware.org; auth=none
- References: <20131009174359 dot GA31917 at domone dot podge>
On Wed, Oct 09, 2013 at 07:43:59PM +0200, OndÅej BÃlka wrote:
> Hi, fnmatch on multibyte string is slow for several reasons.
> Main reason is that we spend ages converting strings to wide
> strings.
>
> Most of time this overhead is not needed as user does a simple search
> with no special characters.
>
> For utf8 these could be checked by strstr as there is only one way how
> encode ascii-only sequence. I could generalize this to any encoding for
> which this is true.
>
> A strstr will quickly determine that there is no match because there is
> no match with start.
>
> I excluded all special characters from documentation and it passes test.
>
Which were not complete as I missed FNM_CASEFOLD.
As a benchmark I measured find on my home directory. This optimization
decreased user runtime by 30% and still has bit ineffective checks for
encoding.
$ gcc -O3 -ldl tst.c -fPIC -shared -o tst.so
# find -name "aeou*a" # cache entries
$ LD_PRELOAD=tst.so /usr/bin/time find -name "aeou*a"
Command exited with non-zero status 1
0.68user 1.11system 0:01.83elapsed 98%CPU (0avgtext+0avgdata 37900maxresident)k
0inputs+0outputs (0major+15574minor)pagefaults 0swaps
$ /usr/bin/time find -name "aeou*a"
Command exited with non-zero status 1
1.02user 1.06system 0:02.12elapsed 98%CPU (0avgtext+0avgdata 37868maxresident)k
0inputs+0outputs (0major+15561minor)pagefaults 0swaps
#define _GNU_SOURCE
#include <dlfcn.h>
#include <fnmatch.h>
#include <locale.h>
#include <langinfo.h>
#include <string.h>
#include <stdlib.h>
int
fnmatch (pattern, string, flags)
const char *pattern;
const char *string;
int flags;
{
int multibyte_needed = 0;
if (MB_CUR_MAX != 1)
{
char *encoding = nl_langinfo (CODESET);
if (strcmp (encoding, "UTF-8"))
multibyte_needed = 1;
}
/* ASCII with \+/.*?[{(@! excluded. */
static unsigned char normal[256] = {
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
if (!multibyte_needed && !(flags & FNM_CASEFOLD))
{
char start[16];
size_t i;
for (i = 0; i < 8 && normal[(unsigned char) pattern[i]]; i++)
start[i] = pattern[i];
start[i] = 0;
char *string2 = strstr (string, start);
if (!string2)
return FNM_NOMATCH;
}
int (*fnm) (const char *,const char *,int) = dlsym (RTLD_NEXT, "fnmatch");
return fnm (pattern, string, flags);
}