This is the mail archive of the libc-hacker@sources.redhat.com mailing list for the glibc project.
Note that libc-hacker is a closed list. You may look at the archives of this list, but subscription and posting are not open.
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
Other format: | [Raw text] |
Hi! Two changes: 1) Both b\xc3\xa4r and b\xc3\xa4*r can be optimized quite easily (I knew about the former one, didn't know it didn't need any changes for the latter one as in that case the parse tree has already DUPop's ->left pointing to CONCAT tree of the bytes of the multi-byte character, so if just mb_partial flag is cleared, it ought to work right) 2) small optimization tweak in regexec.c - it is not necessary to compute nrules in the expected case of elem_len <= 1 && char_len <= 1 or for OP_PERIOD Todo: 1) Stepan Kasal suggested to optimize . in optimize_utf8 as well (probably adding new OP_UTF8_PERIOD node could do the job, will try to implement it and benchmark it. Another alternative would be to replace OP_PERIOD with the actual description of UTF-8 character (something like [00-7f]|([c2-df]|e0[a0-bf]|([e1-ef]|f0[90-bf]|([f1-f7]|f8[88-bf]|([f9-fb]|fc[84-bf]|fd[80-bf])[80-bf])[80-bf])[80-bf])[80-bf] where 00-ff are actually chars in hex and () are only used for grouping and not for register matching. Also, 00 would have to be removed if RE_DOT_NOT_NULL and '\n' would have to be removed if !RE_DOT_NEWLINE) and use unmodified regexec.c 2) perhaps we could get better code by caching at least nrules in dfa/re_string_t 3) oprofiling the code and see where are hot spots 2003-11-17 Jakub Jelinek <jakub@redhat.com> * posix/regcomp.c (optimize_utf8): Optimize multi-byte chars as well. * posix/bug-regex20.c (tests): Add new tests. Multi-byte char followed by dup operator is expected to be optimized. * posix/regexec.c (check_node_accept_bytes): Move nrules and j variables to the block where they are only used, initialize nrules only immediately before using it. --- libc/posix/regcomp.c.jj 2003-11-13 22:43:39.000000000 +0100 +++ libc/posix/regcomp.c 2003-11-15 14:07:49.000000000 +0100 @@ -964,17 +964,14 @@ static void optimize_utf8 (dfa) re_dfa_t *dfa; { - int node, i; + int node, i, mb_chars = 0; for (node = 0; node < dfa->nodes_len; ++node) switch (dfa->nodes[node].type) { case CHARACTER: - /* Chars >= 0x80 are optimizable in some cases (e.g. when not - followed by DUP operator, not in bracket etc.). - For now punt on them all. */ if (dfa->nodes[node].opr.c >= 0x80) - return; + mb_chars = 1; break; case ANCHOR: switch (dfa->nodes[node].opr.idx) @@ -1009,6 +1006,12 @@ optimize_utf8 (dfa) return; } + if (mb_chars) + for (node = 0; node < dfa->nodes_len; ++node) + if (dfa->nodes[node].type == CHARACTER + && dfa->nodes[node].opr.c >= 0x80) + dfa->nodes[node].mb_partial = 0; + /* The search can be in single byte locale. */ dfa->mb_cur_max = 1; dfa->is_utf8 = 0; --- libc/posix/bug-regex20.c.jj 2003-11-13 22:43:39.000000000 +0100 +++ libc/posix/bug-regex20.c 2003-11-15 14:33:29.000000000 +0100 @@ -43,15 +43,35 @@ static struct \xe2\x80\x94 EM DASH */ /* Should be optimized. */ {RE_SYNTAX_POSIX_BASIC, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1}, + {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1}, + {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4*z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1}, + {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4*z", "b\xc3\xa4rfoobz", 7, 1}, + {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4\\+z", + "b\xc3\xa4rfoob\xc3\xa4\xc3\xa4z", 7, 1}, + {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4\\?z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1}, + {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4\\{1,2\\}z", + "b\xc3\xa4rfoob\xc3\xa4z", 7, 1}, {RE_SYNTAX_POSIX_BASIC, "^x\\|xy*z$", "\xc3\xb6xyyz", 2, 1}, {RE_SYNTAX_POSIX_BASIC, "^x\\\\y\\{6\\}z\\+", "x\\yyyyyyzz\xc3\xb6", 0, 1}, {RE_SYNTAX_POSIX_BASIC, "^x\\\\y\\{2,36\\}z\\+", "x\\yzz\xc3\xb6", -1, 1}, {RE_SYNTAX_POSIX_BASIC, "^x\\\\y\\{,3\\}z\\+", "x\\yyyzz\xc3\xb6", 0, 1}, + {RE_SYNTAX_POSIX_BASIC, "^x\\|x\xc3\xa4*z$", + "\xc3\xb6x\xc3\xa4\xc3\xa4z", 2, 1}, + {RE_SYNTAX_POSIX_BASIC, "^x\\\\\xc3\x84\\{6\\}z\\+", + "x\\\xc3\x84\xc3\x84\xc3\x84\xc3\x84\xc3\x84\xc3\x84zz\xc3\xb6", 0, 1}, + {RE_SYNTAX_POSIX_BASIC, "^x\\\\\xc3\x84\\{2,36\\}z\\+", + "x\\\xc3\x84zz\xc3\xb6", -1, 1}, + {RE_SYNTAX_POSIX_BASIC, "^x\\\\\xc3\x84\\{,3\\}z\\+", + "x\\\xc3\x84\xc3\x84\xc3\x84zz\xc3\xb6", 0, 1}, {RE_SYNTAX_POSIX_BASIC, "x[C]y", "axCy", 1, 1}, {RE_SYNTAX_POSIX_BASIC, "x[ABC]y", "axCy", 1, 1}, {RE_SYNTAX_POSIX_BASIC, "\\`x\\|z\\'", "x\xe2\x80\x94", 0, 1}, {RE_SYNTAX_POSIX_BASIC, "\\(xy\\)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1}, {RE_SYNTAX_POSIX_BASIC, "xy\\?z", "\xc3\x84xz\xc3\xb6", 2, 1}, + {RE_SYNTAX_POSIX_BASIC, "\\`\xc3\x84\\|z\\'", "\xc3\x84\xe2\x80\x94", 0, 1}, + {RE_SYNTAX_POSIX_BASIC, "\\(x\xc3\x84\\)z\\1\x61\\1", + "\xe2\x80\x94x\xc3\x84zx\xc3\x84\x61x\xc3\x84\xc3\x96", 3, 1}, + {RE_SYNTAX_POSIX_BASIC, "x\xc3\x96\\?z", "\xc3\x84xz\xc3\xb6", 2, 1}, {RE_SYNTAX_POSIX_EXTENDED, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1}, {RE_SYNTAX_POSIX_EXTENDED, "^x|xy*z$", "\xc3\xb6xyyz", 2, 1}, {RE_SYNTAX_POSIX_EXTENDED, "^x\\\\y{6}z+", "x\\yyyyyyzz\xc3\xb6", 0, 1}, @@ -64,7 +84,6 @@ static struct {RE_SYNTAX_POSIX_EXTENDED, "xy?z", "\xc3\x84xz\xc3\xb6", 2, 1}, /* Should not be optimized. */ {RE_SYNTAX_POSIX_BASIC, "x.y", "ax\xe2\x80\x94yz", 1, 0}, - {RE_SYNTAX_POSIX_BASIC, "x\xc3\x96*y", "ax\xc3\x96\xc3\x96yz", 1, 0}, {RE_SYNTAX_POSIX_BASIC, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0}, {RE_SYNTAX_POSIX_BASIC, "x[A-Z,]y", "axCy", 1, 0}, {RE_SYNTAX_POSIX_BASIC, "x[^y]z", "ax\xe2\x80\x94z", 1, 0}, @@ -77,7 +96,6 @@ static struct {RE_SYNTAX_POSIX_BASIC, "a\\wz", "a\xc3\x84z", 0, 0}, {RE_SYNTAX_POSIX_BASIC, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, 0}, {RE_SYNTAX_POSIX_EXTENDED, "x.y", "ax\xe2\x80\x94yz", 1, 0}, - {RE_SYNTAX_POSIX_EXTENDED, "x\xc3\x96*y", "ax\xc3\x96\xc3\x96yz", 1, 0}, {RE_SYNTAX_POSIX_EXTENDED, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0}, {RE_SYNTAX_POSIX_EXTENDED, "x[A-Z,]y", "axCy", 1, 0}, {RE_SYNTAX_POSIX_EXTENDED, "x[^y]z", "ax\xe2\x80\x94z", 1, 0}, --- libc/posix/regexec.c.jj 2003-11-12 21:33:35.000000000 +0100 +++ libc/posix/regexec.c 2003-11-17 13:10:29.000000000 +0100 @@ -3484,10 +3484,6 @@ check_node_accept_bytes (preg, node_idx, int elem_len = re_string_elem_size_at (input, str_idx); int char_len = re_string_char_size_at (input, str_idx); int i; -# ifdef _LIBC - int j; - uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); -# endif /* _LIBC */ if (elem_len <= 1 && char_len <= 1) return 0; if (node->type == OP_PERIOD) @@ -3506,6 +3502,8 @@ check_node_accept_bytes (preg, node_idx, # ifdef _LIBC const unsigned char *pin = ((char *) re_string_get_buffer (input) + str_idx); + int j; + uint32_t nrules; # endif /* _LIBC */ int match_len = 0; wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars) @@ -3530,6 +3528,7 @@ check_node_accept_bytes (preg, node_idx, } # ifdef _LIBC + nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES); if (nrules != 0) { unsigned int in_collseq = 0; Jakub
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |