This is the mail archive of the libc-hacker@sources.redhat.com mailing list for the glibc project.

Note that libc-hacker is a closed list. You may look at the archives of this list, but subscription and posting are not open.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

[PATCH] Optimize also UTF-8 mb chars in regexs, a small optimization for regexec.c

From: Jakub Jelinek <jakub at redhat dot com>
To: Ulrich Drepper <drepper at redhat dot com>, Roland McGrath <roland at redhat dot com>
Cc: Glibc hackers <libc-hacker at sources dot redhat dot com>
Date: Mon, 17 Nov 2003 12:42:28 +0100
Subject: [PATCH] Optimize also UTF-8 mb chars in regexs, a small optimization for regexec.c
Reply-to: Jakub Jelinek <jakub at redhat dot com>

Hi!

Two changes:
1) Both b\xc3\xa4r and b\xc3\xa4*r can be optimized quite easily
   (I knew about the former one, didn't know it didn't need any changes
    for the latter one as in that case the parse tree has already
    DUPop's ->left pointing to CONCAT tree of the bytes of the multi-byte
    character, so if just mb_partial flag is cleared, it ought to work
    right)
2) small optimization tweak in regexec.c - it is not necessary to compute
   nrules in the expected case of elem_len <= 1 && char_len <= 1
   or for OP_PERIOD

Todo:
1) Stepan Kasal suggested to optimize . in optimize_utf8 as well
   (probably adding new OP_UTF8_PERIOD node could do the job, will
   try to implement it and benchmark it.  Another alternative would
   be to replace OP_PERIOD with the actual description of UTF-8
   character (something like
[00-7f]|([c2-df]|e0[a0-bf]|([e1-ef]|f0[90-bf]|([f1-f7]|f8[88-bf]|([f9-fb]|fc[84-bf]|fd[80-bf])[80-bf])[80-bf])[80-bf])[80-bf]
   where 00-ff are actually chars in hex and () are only used for grouping
   and not for register matching.  Also, 00 would have to be removed if
   RE_DOT_NOT_NULL and '\n' would have to be removed if !RE_DOT_NEWLINE)
   and use unmodified regexec.c
2) perhaps we could get better code by caching at least nrules
   in dfa/re_string_t
3) oprofiling the code and see where are hot spots

2003-11-17  Jakub Jelinek  <jakub@redhat.com>

	* posix/regcomp.c (optimize_utf8): Optimize multi-byte chars as
	well.
	* posix/bug-regex20.c (tests): Add new tests.  Multi-byte char
	followed by dup operator is expected to be optimized.

	* posix/regexec.c (check_node_accept_bytes): Move nrules and j
	variables to the block where they are only used, initialize
	nrules only immediately before using it.

--- libc/posix/regcomp.c.jj	2003-11-13 22:43:39.000000000 +0100
+++ libc/posix/regcomp.c	2003-11-15 14:07:49.000000000 +0100
@@ -964,17 +964,14 @@ static void
 optimize_utf8 (dfa)
      re_dfa_t *dfa;
 {
-  int node, i;
+  int node, i, mb_chars = 0;
 
   for (node = 0; node < dfa->nodes_len; ++node)
     switch (dfa->nodes[node].type)
       {
       case CHARACTER:
-        /* Chars >= 0x80 are optimizable in some cases (e.g. when not
-	   followed by DUP operator, not in bracket etc.).
-	   For now punt on them all.  */
 	if (dfa->nodes[node].opr.c >= 0x80)
-	  return;
+	  mb_chars = 1;
 	break;
       case ANCHOR:
 	switch (dfa->nodes[node].opr.idx)
@@ -1009,6 +1006,12 @@ optimize_utf8 (dfa)
 	return;
       }
 
+  if (mb_chars)
+    for (node = 0; node < dfa->nodes_len; ++node)
+      if (dfa->nodes[node].type == CHARACTER
+	  && dfa->nodes[node].opr.c >= 0x80)
+	dfa->nodes[node].mb_partial = 0;
+
   /* The search can be in single byte locale.  */
   dfa->mb_cur_max = 1;
   dfa->is_utf8 = 0;
--- libc/posix/bug-regex20.c.jj	2003-11-13 22:43:39.000000000 +0100
+++ libc/posix/bug-regex20.c	2003-11-15 14:33:29.000000000 +0100
@@ -43,15 +43,35 @@ static struct
      \xe2\x80\x94	EM DASH  */
   /* Should be optimized.  */
   {RE_SYNTAX_POSIX_BASIC, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1},
+  {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
+  {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4*z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
+  {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4*z", "b\xc3\xa4rfoobz", 7, 1},
+  {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4\\+z",
+   "b\xc3\xa4rfoob\xc3\xa4\xc3\xa4z", 7, 1},
+  {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4\\?z", "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
+  {RE_SYNTAX_POSIX_BASIC, "b\xc3\xa4\\{1,2\\}z",
+   "b\xc3\xa4rfoob\xc3\xa4z", 7, 1},
   {RE_SYNTAX_POSIX_BASIC, "^x\\|xy*z$", "\xc3\xb6xyyz", 2, 1},
   {RE_SYNTAX_POSIX_BASIC, "^x\\\\y\\{6\\}z\\+", "x\\yyyyyyzz\xc3\xb6", 0, 1},
   {RE_SYNTAX_POSIX_BASIC, "^x\\\\y\\{2,36\\}z\\+", "x\\yzz\xc3\xb6", -1, 1},
   {RE_SYNTAX_POSIX_BASIC, "^x\\\\y\\{,3\\}z\\+", "x\\yyyzz\xc3\xb6", 0, 1},
+  {RE_SYNTAX_POSIX_BASIC, "^x\\|x\xc3\xa4*z$",
+   "\xc3\xb6x\xc3\xa4\xc3\xa4z", 2, 1},
+  {RE_SYNTAX_POSIX_BASIC, "^x\\\\\xc3\x84\\{6\\}z\\+",
+   "x\\\xc3\x84\xc3\x84\xc3\x84\xc3\x84\xc3\x84\xc3\x84zz\xc3\xb6", 0, 1},
+  {RE_SYNTAX_POSIX_BASIC, "^x\\\\\xc3\x84\\{2,36\\}z\\+",
+   "x\\\xc3\x84zz\xc3\xb6", -1, 1},
+  {RE_SYNTAX_POSIX_BASIC, "^x\\\\\xc3\x84\\{,3\\}z\\+",
+   "x\\\xc3\x84\xc3\x84\xc3\x84zz\xc3\xb6", 0, 1},
   {RE_SYNTAX_POSIX_BASIC, "x[C]y", "axCy", 1, 1},
   {RE_SYNTAX_POSIX_BASIC, "x[ABC]y", "axCy", 1, 1},
   {RE_SYNTAX_POSIX_BASIC, "\\`x\\|z\\'", "x\xe2\x80\x94", 0, 1},
   {RE_SYNTAX_POSIX_BASIC, "\\(xy\\)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1},
   {RE_SYNTAX_POSIX_BASIC, "xy\\?z", "\xc3\x84xz\xc3\xb6", 2, 1},
+  {RE_SYNTAX_POSIX_BASIC, "\\`\xc3\x84\\|z\\'", "\xc3\x84\xe2\x80\x94", 0, 1},
+  {RE_SYNTAX_POSIX_BASIC, "\\(x\xc3\x84\\)z\\1\x61\\1",
+   "\xe2\x80\x94x\xc3\x84zx\xc3\x84\x61x\xc3\x84\xc3\x96", 3, 1},
+  {RE_SYNTAX_POSIX_BASIC, "x\xc3\x96\\?z", "\xc3\x84xz\xc3\xb6", 2, 1},
   {RE_SYNTAX_POSIX_EXTENDED, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1},
   {RE_SYNTAX_POSIX_EXTENDED, "^x|xy*z$", "\xc3\xb6xyyz", 2, 1},
   {RE_SYNTAX_POSIX_EXTENDED, "^x\\\\y{6}z+", "x\\yyyyyyzz\xc3\xb6", 0, 1},
@@ -64,7 +84,6 @@ static struct
   {RE_SYNTAX_POSIX_EXTENDED, "xy?z", "\xc3\x84xz\xc3\xb6", 2, 1},
   /* Should not be optimized.  */
   {RE_SYNTAX_POSIX_BASIC, "x.y", "ax\xe2\x80\x94yz", 1, 0},
-  {RE_SYNTAX_POSIX_BASIC, "x\xc3\x96*y", "ax\xc3\x96\xc3\x96yz", 1, 0},
   {RE_SYNTAX_POSIX_BASIC, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0},
   {RE_SYNTAX_POSIX_BASIC, "x[A-Z,]y", "axCy", 1, 0},
   {RE_SYNTAX_POSIX_BASIC, "x[^y]z", "ax\xe2\x80\x94z", 1, 0},
@@ -77,7 +96,6 @@ static struct
   {RE_SYNTAX_POSIX_BASIC, "a\\wz", "a\xc3\x84z", 0, 0},
   {RE_SYNTAX_POSIX_BASIC, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, 0},
   {RE_SYNTAX_POSIX_EXTENDED, "x.y", "ax\xe2\x80\x94yz", 1, 0},
-  {RE_SYNTAX_POSIX_EXTENDED, "x\xc3\x96*y", "ax\xc3\x96\xc3\x96yz", 1, 0},
   {RE_SYNTAX_POSIX_EXTENDED, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0},
   {RE_SYNTAX_POSIX_EXTENDED, "x[A-Z,]y", "axCy", 1, 0},
   {RE_SYNTAX_POSIX_EXTENDED, "x[^y]z", "ax\xe2\x80\x94z", 1, 0},
--- libc/posix/regexec.c.jj	2003-11-12 21:33:35.000000000 +0100
+++ libc/posix/regexec.c	2003-11-17 13:10:29.000000000 +0100
@@ -3484,10 +3484,6 @@ check_node_accept_bytes (preg, node_idx,
   int elem_len = re_string_elem_size_at (input, str_idx);
   int char_len = re_string_char_size_at (input, str_idx);
   int i;
-# ifdef _LIBC
-  int j;
-  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
-# endif /* _LIBC */
   if (elem_len <= 1 && char_len <= 1)
     return 0;
   if (node->type == OP_PERIOD)
@@ -3506,6 +3502,8 @@ check_node_accept_bytes (preg, node_idx,
 # ifdef _LIBC
       const unsigned char *pin = ((char *) re_string_get_buffer (input)
 				  + str_idx);
+      int j;
+      uint32_t nrules;
 # endif /* _LIBC */
       int match_len = 0;
       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
@@ -3530,6 +3528,7 @@ check_node_accept_bytes (preg, node_idx,
 	}
 
 # ifdef _LIBC
+      nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
       if (nrules != 0)
 	{
 	  unsigned int in_collseq = 0;

	Jakub

Follow-Ups:
- Re: [PATCH] Optimize also UTF-8 mb chars in regexs, a small optimizationfor regexec.c
  - From: Ulrich Drepper

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]