This is the mail archive of the libc-hacker@sources.redhat.com mailing list for the glibc project.
Note that libc-hacker is a closed list. You may look at the archives of this list, but subscription and posting are not open.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
[PATCH] Optimize UTF-8 regex handling on some patterns

From: Jakub Jelinek <jakub at redhat dot com>
To: Ulrich Drepper <drepper at redhat dot com>, Roland McGrath <roland at redhat dot com>
Cc: Glibc hackers <libc-hacker at sources dot redhat dot com>
Date: Wed, 12 Nov 2003 17:37:02 +0100
Subject: [PATCH] Optimize UTF-8 regex handling on some patterns
Reply-to: Jakub Jelinek <jakub at redhat dot com>
Hi!

The following patch should make regexec/re_search* go as fast in UTF-8
locales as in single byte ones for certain regexps.
E.g. when doing
LC_ALL=en_US.UTF-8 sed s/a/b/g /etc/termcap > /dev/null
there is really no need to convert anything from UTF-8 to UCS-4 during
searching.
The above command on my box takes 0m0.960s (user time) with a few
days old glibc, 0m0.720s with current CVS and 0m0.350s with CVS + this
patch (the same time as LC_ALL=C sed s/a/b/g /etc/termcap > /dev/null
takes).
bug-regex20.c looks at regex private data structure (sorry for the
changes in regex_internal.h which were needed to shut warnings up
in that case) and checks whether the optimization has been used
where it was supposed to and has not been used when it cannot be used.
Particularly, e.g. regexp's using . cannot be optimized that way,
since . can match more bytes.
This patch doesn't optimize [ABC] which is optimizable if COMPLEX_BRACKET
is changed into SIMPLE_BRACKET in optimize_utf8 (I've left it for a later
patch), though e.g. [^ABC] cannot be optimized.
Also, it doesn't optimize if there are any >= 0x80 characters, which again
is not necessary unless they are followed by a dup operator (*, ?, +, {}).
E.g. B\xc3\x84r pattern will only match B\xc3\x84r substring and nothing
else.  But it would need to change a little bit the parsed tree.

2003-11-12  Jakub Jelinek  <jakub@redhat.com>

	* posix/regcomp.c (optimize_utf8): New function.
	(re_compile_fastmap_iter): Use dfa->mb_cur_max > 1 instead
	of !icase.
	(re_compile_internal): Call optimize_utf8 if not case insensitive
	and in UTF-8 locale.
	* posix/regex_internal.h: Ifdef out some prototypes if
	RE_NO_INTERNAL_PROTOTYPES is defined to shut up warnings.
	* posix/Makefile (tests): Add bug-regex20.
	(bug-regex20-ENV): Add LOCPATH.
	* posix/bug-regex20.c: New test.

--- libc/posix/regcomp.c.jj	2003-11-12 15:40:27.000000000 +0100
+++ libc/posix/regcomp.c	2003-11-12 19:06:38.000000000 +0100
@@ -30,6 +30,9 @@ static void free_charset (re_charset_t *
 #endif /* RE_ENABLE_I18N */
 static void free_workarea_compile (regex_t *preg);
 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
+#ifdef RE_ENABLE_I18N
+static void optimize_utf8 (re_dfa_t *dfa);
+#endif
 static reg_errcode_t analyze (re_dfa_t *dfa);
 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
@@ -322,7 +325,7 @@ re_compile_fastmap_iter (bufp, init_stat
 	{
 	  re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 #ifdef RE_ENABLE_I18N
-	  if ((bufp->syntax & RE_ICASE) && !icase)
+	  if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 	    {
 	      unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 	      wchar_t wc;
@@ -389,7 +392,7 @@ re_compile_fastmap_iter (bufp, init_stat
 	      memset (&state, '\0', sizeof (state));
 	      __wcrtomb (buf, cset->mbchars[i], &state);
 	      re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
-	      if ((bufp->syntax & RE_ICASE) && !icase)
+	      if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 		{
 		  __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
 		  re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
@@ -760,6 +763,12 @@ re_compile_internal (preg, pattern, leng
   if (BE (dfa->str_tree == NULL, 0))
     goto re_compile_internal_free_return;
 
+#ifdef RE_ENABLE_I18N
+  /* If possible, do searching in single byte encoding to speed things up.  */
+  if (dfa->is_utf8 && !(syntax & RE_ICASE))
+    optimize_utf8 (dfa);
+#endif
+
   /* Analyze the tree and collect information which is necessary to
      create the dfa.  */
   err = analyze (dfa);
@@ -945,6 +954,61 @@ create_initial_state (dfa)
   return REG_NOERROR;
 }
 
+#ifdef RE_ENABLE_I18N
+/* If it is possible to do searching in single byte encoding instead of UTF-8
+   to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
+   DFA nodes where needed.  */
+
+static void
+optimize_utf8 (dfa)
+     re_dfa_t *dfa;
+{
+  int node;
+
+  for (node = 0; node < dfa->nodes_len; ++node)
+    switch (dfa->nodes[node].type)
+      {
+      case CHARACTER:
+        /* Chars >= 0x80 are optimizable in some cases (e.g. when not
+	   followed by DUP operator, not in bracket etc.).
+	   For now punt on them all.  */
+	if (dfa->nodes[node].opr.c >= 0x80)
+	  return;
+	break;
+      case ANCHOR:
+	switch (dfa->nodes[node].opr.idx)
+	  {
+	  case LINE_FIRST:
+	  case LINE_LAST:
+	  case BUF_FIRST:
+	  case BUF_LAST:
+	    break;
+	  default:
+	    /* Word anchors etc. cannot be handled.  */
+	    return;
+	  }
+	break;
+      case OP_BACK_REF:
+      case OP_ALT:
+      case END_OF_RE:
+      case BACK_SLASH:
+      case OP_DUP_ASTERISK:
+      case OP_DUP_QUESTION:
+      case OP_DUP_PLUS:
+      case OP_OPEN_SUBEXP:
+      case OP_CLOSE_SUBEXP:
+	break;
+      default:
+	return;
+      }
+
+  /* The search can be in single byte locale.  */
+  dfa->mb_cur_max = 1;
+  dfa->is_utf8 = 0;
+  dfa->has_mb_node = dfa->nbackref > 0;
+}
+#endif
+
 /* Analyze the structure tree, and calculate "first", "next", "edest",
    "eclosure", and "inveclosure".  */
 
--- libc/posix/Makefile.jj	2003-11-12 13:48:26.000000000 +0100
+++ libc/posix/Makefile	2003-11-12 17:27:26.000000000 +0100
@@ -77,7 +77,7 @@ tests		:= tstgetopt testfnm runtests run
 		   tst-gnuglob tst-regex bug-regex5 bug-regex6 bug-regex7 \
 		   bug-regex8 bug-regex9 bug-regex10 bug-regex11 bug-regex12 \
 		   bug-regex13 bug-regex14 bug-regex15 bug-regex16 \
-		   bug-regex17 bug-regex18 bug-regex19 \
+		   bug-regex17 bug-regex18 bug-regex19 bug-regex20 \
 		   tst-nice tst-nanosleep transbug
 ifeq (yes,$(build-shared))
 test-srcs	:= globtest
@@ -157,6 +157,7 @@ bug-regex6-ENV = LOCPATH=$(common-objpfx
 bug-regex17-ENV = LOCPATH=$(common-objpfx)localedata
 bug-regex18-ENV = LOCPATH=$(common-objpfx)localedata
 bug-regex19-ENV = LOCPATH=$(common-objpfx)localedata
+bug-regex20-ENV = LOCPATH=$(common-objpfx)localedata
 
 testcases.h: TESTS TESTS2C.sed
 	sed -f TESTS2C.sed < $< > $@T
--- libc/posix/regex_internal.h.jj	2003-11-12 09:47:13.000000000 +0100
+++ libc/posix/regex_internal.h	2003-11-12 19:23:01.000000000 +0100
@@ -345,6 +345,7 @@ typedef struct re_string_t re_string_t;
 #define MBS_CASE_ALLOCATED(pstr) (pstr->trans != NULL)
 
 
+#ifndef RE_NO_INTERNAL_PROTOTYPES
 static reg_errcode_t re_string_allocate (re_string_t *pstr, const char *str,
 					 int len, int init_len,
 					 RE_TRANSLATE_TYPE trans, int icase,
@@ -371,6 +372,7 @@ static inline wint_t re_string_wchar_at 
 #endif /* RE_ENABLE_I18N */
 static unsigned int re_string_context_at (const re_string_t *input, int idx,
 					  int eflags, int newline_anchor);
+#endif
 #define re_string_peek_byte(pstr, offset) \
   ((pstr)->mbs[(pstr)->cur_idx + offset])
 #define re_string_peek_byte_case(pstr, offset) \
@@ -612,6 +614,7 @@ struct re_dfa_t
 };
 typedef struct re_dfa_t re_dfa_t;
 
+#ifndef RE_NO_INTERNAL_PROTOTYPES
 static reg_errcode_t re_node_set_alloc (re_node_set *set, int size);
 static reg_errcode_t re_node_set_init_1 (re_node_set *set, int elem);
 static reg_errcode_t re_node_set_init_2 (re_node_set *set, int elem1,
@@ -644,6 +647,7 @@ static re_dfastate_t *re_acquire_state_c
 						const re_node_set *nodes,
 						unsigned int context);
 static void free_state (re_dfastate_t *state);
+#endif
 
 
 typedef enum
@@ -697,7 +701,7 @@ bitset_not_merge (dest, src)
     dest[i] |= ~src[i];
 }
 
-#ifdef RE_ENABLE_I18N
+#if defined RE_ENABLE_I18N && !defined RE_NO_INTERNAL_PROTOTYPES
 /* Inline functions for re_string.  */
 static inline int
 re_string_char_size_at (pstr, idx)
--- libc/posix/bug-regex20.c.jj	2003-11-12 17:26:56.000000000 +0100
+++ libc/posix/bug-regex20.c	2003-11-12 19:23:20.000000000 +0100
@@ -0,0 +1,137 @@
+/* Test for UTF-8 regular expression optimizations.
+   Copyright (C) 2003 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Jakub Jelinek <jakub@redhat.com>, 2003.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sys/types.h>
+#include <mcheck.h>
+#include <regex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+
+#define RE_NO_INTERNAL_PROTOTYPES 1
+#include "regex_internal.h"
+
+static struct
+{
+  int syntax;
+  const char *pattern;
+  const char *string;
+  int res, optimize;
+} tests[] = {
+  /* \xc3\x84		LATIN CAPITAL LETTER A WITH DIAERESIS
+     \xc3\x96		LATIN CAPITAL LETTER O WITH DIAERESIS
+     \xc3\xa4		LATIN SMALL LETTER A WITH DIAERESIS
+     \xc3\xb6		LATIN SMALL LETTER O WITH DIAERESIS
+     \xe2\x80\x94	EM DASH  */
+  /* Should be optimized.  */
+  {RE_SYNTAX_POSIX_BASIC, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1},
+  {RE_SYNTAX_POSIX_BASIC, "^x\\|xy*z$", "\xc3\xb6xyyz", 2, 1},
+  {RE_SYNTAX_POSIX_BASIC, "^x\\\\y\\{6\\}z\\+", "x\\yyyyyyzz\xc3\xb6", 0, 1},
+  {RE_SYNTAX_POSIX_BASIC, "^x\\\\y\\{2,36\\}z\\+", "x\\yzz\xc3\xb6", -1, 1},
+  {RE_SYNTAX_POSIX_BASIC, "^x\\\\y\\{,3\\}z\\+", "x\\yyyzz\xc3\xb6", 0, 1},
+  /* FIXME.  This one should be optimizable, but is not ATM.  */
+  {RE_SYNTAX_POSIX_BASIC, "x[ABC]y", "axCy", 1, 0},
+  {RE_SYNTAX_POSIX_BASIC, "\\`x\\|z\\'", "x\xe2\x80\x94", 0, 1},
+  {RE_SYNTAX_POSIX_BASIC, "\\(xy\\)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1},
+  {RE_SYNTAX_POSIX_BASIC, "xy\\?z", "\xc3\x84xz\xc3\xb6", 2, 1},
+  {RE_SYNTAX_POSIX_EXTENDED, "foo", "b\xc3\xa4rfoob\xc3\xa4z", 4, 1},
+  {RE_SYNTAX_POSIX_EXTENDED, "^x|xy*z$", "\xc3\xb6xyyz", 2, 1},
+  {RE_SYNTAX_POSIX_EXTENDED, "^x\\\\y{6}z+", "x\\yyyyyyzz\xc3\xb6", 0, 1},
+  {RE_SYNTAX_POSIX_EXTENDED, "^x\\\\y{2,36}z+", "x\\yzz\xc3\xb6", -1, 1},
+  {RE_SYNTAX_POSIX_EXTENDED, "^x\\\\y{,3}z+", "x\\yyyzz\xc3\xb6", 0, 1},
+  /* FIXME.  This one should be optimizable, but is not ATM.  */
+  {RE_SYNTAX_POSIX_EXTENDED, "x[ABC]y", "axCy", 1, 0},
+  {RE_SYNTAX_POSIX_EXTENDED, "\\`x|z\\'", "x\xe2\x80\x94", 0, 1},
+  {RE_SYNTAX_POSIX_EXTENDED, "(xy)z\\1a\\1", "\xe2\x80\x94xyzxyaxy\xc3\x84", 3, 1},
+  {RE_SYNTAX_POSIX_EXTENDED, "xy?z", "\xc3\x84xz\xc3\xb6", 2, 1},
+  /* Should not be optimized.  */
+  {RE_SYNTAX_POSIX_BASIC, "x.y", "ax\xe2\x80\x94yz", 1, 0},
+  {RE_SYNTAX_POSIX_BASIC, "x\xc3\x96*y", "ax\xc3\x96\xc3\x96yz", 1, 0},
+  {RE_SYNTAX_POSIX_BASIC, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0},
+  {RE_SYNTAX_POSIX_BASIC, "x[A-Z,]y", "axCy", 1, 0},
+  {RE_SYNTAX_POSIX_BASIC, "x[^y]z", "ax\xe2\x80\x94z", 1, 0},
+  {RE_SYNTAX_POSIX_BASIC, "x[[:alnum:]]z", "ax\xc3\x96z", 1, 0},
+  {RE_SYNTAX_POSIX_BASIC, "\\<g", "\xe2\x80\x94g", 3, 0},
+  {RE_SYNTAX_POSIX_BASIC, "\\bg\\b", "\xe2\x80\x94g", 3, 0},
+  {RE_SYNTAX_POSIX_BASIC, "\\Bg\\B", "\xc3\xa4g\xc3\xa4", 2, 0},
+  {RE_SYNTAX_POSIX_BASIC, "a\\wz", "a\xc3\x84z", 0, 0},
+  {RE_SYNTAX_POSIX_BASIC, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, 0},
+  {RE_SYNTAX_POSIX_EXTENDED, "x.y", "ax\xe2\x80\x94yz", 1, 0},
+  {RE_SYNTAX_POSIX_EXTENDED, "x\xc3\x96*y", "ax\xc3\x96\xc3\x96yz", 1, 0},
+  {RE_SYNTAX_POSIX_EXTENDED, "x[\xc3\x84\xc3\xa4]y", "ax\xc3\xa4y", 1, 0},
+  {RE_SYNTAX_POSIX_EXTENDED, "x[A-Z,]y", "axCy", 1, 0},
+  {RE_SYNTAX_POSIX_EXTENDED, "x[^y]z", "ax\xe2\x80\x94z", 1, 0},
+  {RE_SYNTAX_POSIX_EXTENDED, "x[[:alnum:]]z", "ax\xc3\x96z", 1, 0},
+  {RE_SYNTAX_POSIX_EXTENDED, "\\<g", "\xe2\x80\x94g", 3, 0},
+  {RE_SYNTAX_POSIX_EXTENDED, "\\bg\\b", "\xe2\x80\x94g", 3, 0},
+  {RE_SYNTAX_POSIX_EXTENDED, "\\Bg\\B", "\xc3\xa4g\xc3\xa4", 2, 0},
+  {RE_SYNTAX_POSIX_EXTENDED, "a\\wz", "a\xc3\x84z", 0, 0},
+  {RE_SYNTAX_POSIX_EXTENDED, "x\\Wz", "\xc3\x96x\xe2\x80\x94z", 2, 0},
+};
+
+int
+main (void)
+{
+  struct re_pattern_buffer regbuf;
+  const char *err;
+  size_t i;
+  int ret = 0;
+
+  mtrace ();
+
+  setlocale (LC_ALL, "de_DE.UTF-8");
+  for (i = 0; i < sizeof (tests) / sizeof (tests[0]); ++i)
+    {
+      int res, optimized;
+      re_set_syntax (tests[i].syntax);
+      memset (&regbuf, '\0', sizeof (regbuf));
+      err = re_compile_pattern (tests[i].pattern, strlen (tests[i].pattern),
+                                &regbuf);
+      if (err != NULL)
+	{
+	  printf ("re_compile_pattern failed: %s\n", err);
+	  ret = 1;
+	  continue;
+	}
+
+      /* Check if re_search will be done as multi-byte or single-byte.  */
+      optimized = ((re_dfa_t *) regbuf.buffer)->mb_cur_max == 1;
+      if (optimized != tests[i].optimize)
+        {
+          printf ("pattern %zd %soptimized while it should%s be\n",
+		  i, optimized ? "" : "not ", tests[i].optimize ? "" : " not");
+	  ret = 1;
+        }
+
+      res = re_search (&regbuf, tests[i].string, strlen (tests[i].string), 0,
+		       strlen (tests[i].string), NULL);
+      if (res != tests[i].res)
+	{
+	  printf ("re_search %zd failed: %d\n", i, res);
+	  ret = 1;
+	  regfree (&regbuf);
+	  continue;
+	}
+      regfree (&regbuf);
+    }
+
+  return ret;
+}

	Jakub
Follow-Ups:
- Re: [PATCH] Optimize UTF-8 regex handling on some patterns
  - From: Ulrich Drepper
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]