[PATCH] Three regex speedups, one of which is actually a bugfix

Jakub Jelinek jakub@redhat.com
Fri Jan 2 00:26:00 GMT 2004


On Thu, Jan 01, 2004 at 09:58:22PM +0100, Paolo Bonzini wrote:
> > It is not initialized always, because in the common case there is no
> > \<, \>, \b, \B, \w and \W in regular expression and so differentiating
> > between word and non-word characters is not needed at all.
> 
> But, every match goes down to build_tr_table, which calls IS_WORD_CHAR 256 times and

AFAIK this is up to 256 times, not always 256.  Often just once.

> brings __ctype_b_loc high in the profile.  That's why it's better to always

1) in glibc it doesn't ever call __ctype_b_loc (), but uses __thread access
2) __ctype_b_loc () is __attribute__((const)), so if the compiler doesn't
   for the !_LIBC case move it out of the loop, it should be improved to do
   so

If that

              if (BE (dfa->word_char != NULL, 0)
                  && (dfa->word_char[i] & mask))

in my patch instead of the

	      if (dfa->word_char[i] & mask)

in your patch shows up in profiles (note that dfa->word_char will be usually
NULL), then the loop can be duplicated for if (BE (dfa->word_char != NULL, 0))
(where it checks if (dfa->word_char[i] & mask)) and in the else loop do
trtable[ch] = dest_states[j]; unconditionally.  On IA-32 this took
additional 144 .text bytes but if the loop is executed many times on average
(e.g. with many periods in regex), it would certainly speed up the loop.

> initialize word_char.  If you use a cached bitset instead of calling isalnum, it
> makes no difference if the cached bitset is correct (initialized with isalnum) or
> all-zeros (unless you go down into branch prediction which is overkill, isn't it?).
> Using a flag to avoid iswalnum calls in IS_WIDE_WORD_CHAR is again a complementary
> optimization, which can be done with a separate patch.

But as I said, I've posted yesterday just one variant of the patch and
talked about 2 variants (word_char == NULL when word ops are not seen and
word_char empty bitmap).  Below is attached the second variant:

	Jakub
-------------- next part --------------
2004-01-02  Jakub Jelinek  <jakub@redhat.com>
	    Paolo Bonzini  <bonzini@gnu.org>

	* posix/regex_internal.h (re_const_bitset_ptr_t): New type.
	(re_string_t): Add newline_anchor, word_char and word_ops_used fields.
	(re_dfa_t): Change word_char type to bitset.  Add word_ops_used field.
	(re_string_context_at, re_string_reconstruct): Remove last argument.
	* posix/regex_internal.c (re_string_allocate): Initialize
	pstr->word_char and pstr->word_ops_used.
	(re_string_context_at): Remove newline_anchor argument.
	Use input->newline_anchor instead, swap && conditions.
	Only use IS_WIDE_WORD_CHAR if input->word_ops_used != 0.
	Use input->word_char bitmap instead of IS_WORD_CHAR.
	(re_string_reconstruct): Likewise.
	Adjust re_string_context_at caller.
	* posix/regexec.c (acquire_init_state_context,
	check_halt_state_context, transit_state, transit_state_sb,
	transit_state_mb, transit_state_bkref, check_arrival,
	check_node_accept): Adjust re_string_context_at and
	re_string_reconstruct callers.
	(re_search_internal): Likewise.  Set input.newline_anchor.
	(build_trtable): Use dfa->word_char bitmap instead of IS_WORD_CHAR.
	* posix/regcomp.c (init_word_char): Change return type to void.
	Set dfa->word_ops_used.
	(free_dfa_content): Don't free dfa->word_char.
	(parse_expression): Remove error handling for init_word_char.

--- libc/posix/regexec.c.jj	2004-01-01 15:44:58.000000000 +0100
+++ libc/posix/regexec.c	2004-01-02 01:03:53.000000000 +0100
@@ -1,5 +1,5 @@
 /* Extended regular expression matching and search library.
-   Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+   Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
 
@@ -616,6 +616,7 @@ re_search_internal (preg, string, length
     goto free_return;
   input.stop = stop;
   input.raw_stop = stop;
+  input.newline_anchor = preg->newline_anchor;
 
   err = match_ctx_init (&mctx, eflags, &input, dfa->nbackref * 2);
   if (BE (err != REG_NOERROR, 0))
@@ -708,8 +709,7 @@ re_search_internal (preg, string, length
 		  if (input.raw_mbs_idx + input.valid_raw_len <= match_first
 		      || match_first < input.raw_mbs_idx)
 		    {
-		      err = re_string_reconstruct (&input, match_first, eflags,
-						   preg->newline_anchor);
+		      err = re_string_reconstruct (&input, match_first, eflags);
 		      if (BE (err != REG_NOERROR, 0))
 			goto free_return;
 		    }
@@ -730,8 +730,7 @@ re_search_internal (preg, string, length
 
       /* Reconstruct the buffers so that the matcher can assume that
 	 the matching starts from the beginning of the buffer.  */
-      err = re_string_reconstruct (&input, match_first, eflags,
-				   preg->newline_anchor);
+      err = re_string_reconstruct (&input, match_first, eflags);
       if (BE (err != REG_NOERROR, 0))
 	goto free_return;
 #ifdef RE_ENABLE_I18N
@@ -939,8 +938,7 @@ acquire_init_state_context (err, preg, m
   if (dfa->init_state->has_constraint)
     {
       unsigned int context;
-      context = re_string_context_at (mctx->input, idx - 1, mctx->eflags,
-				      preg->newline_anchor);
+      context = re_string_context_at (mctx->input, idx - 1, mctx->eflags);
       if (IS_WORD_CONTEXT (context))
 	return dfa->init_state_word;
       else if (IS_ORDINARY_CONTEXT (context))
@@ -1103,8 +1101,7 @@ check_halt_state_context (preg, state, m
 #ifdef DEBUG
   assert (state->halt);
 #endif
-  context = re_string_context_at (mctx->input, idx, mctx->eflags,
-				  preg->newline_anchor);
+  context = re_string_context_at (mctx->input, idx, mctx->eflags);
   for (i = 0; i < state->nodes.nelem; ++i)
     if (check_halt_node_context (dfa, state->nodes.elems[i], context))
       return state->nodes.elems[i];
@@ -2177,7 +2174,7 @@ transit_state (err, preg, mctx, state)
 	      context
 		= re_string_context_at (mctx->input,
 					re_string_cur_idx (mctx->input) - 1,
-					mctx->eflags, preg->newline_anchor);
+					mctx->eflags);
 	      if (IS_WORD_CONTEXT (context))
 		next_state = trtable[ch + SBC_MAX];
 	      else
@@ -2236,7 +2233,7 @@ transit_state (err, preg, mctx, state)
 
 	  context = re_string_context_at (mctx->input,
 					  re_string_cur_idx (mctx->input) - 1,
-					  mctx->eflags, preg->newline_anchor);
+					  mctx->eflags);
 	  next_state = mctx->state_log[cur_idx]
 	    = re_acquire_state_context (err, dfa, &next_nodes, context);
 	  /* We don't need to check errors here, since the return value of
@@ -2340,8 +2337,7 @@ transit_state_sb (err, preg, state, mctx
 	    }
 	}
     }
-  context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags,
-				  preg->newline_anchor);
+  context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags);
   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
   /* We don't need to check errors here, since the return value of
      this function is next_state and ERR is already set.  */
@@ -2375,7 +2371,7 @@ transit_state_mb (preg, pstate, mctx)
 	{
 	  context = re_string_context_at (mctx->input,
 					  re_string_cur_idx (mctx->input),
-					  mctx->eflags, preg->newline_anchor);
+					  mctx->eflags);
 	  if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
 					   context))
 	    continue;
@@ -2412,8 +2408,7 @@ transit_state_mb (preg, pstate, mctx)
 	  if (BE (err != REG_NOERROR, 0))
 	    return err;
 	}
-      context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags,
-				      preg->newline_anchor);
+      context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags);
       mctx->state_log[dest_idx]
 	= re_acquire_state_context (&err, dfa, &dest_nodes, context);
       if (dest_state != NULL)
@@ -2451,7 +2446,7 @@ transit_state_bkref (preg, nodes, mctx)
       if (node->constraint)
 	{
 	  context = re_string_context_at (mctx->input, cur_str_idx,
-					  mctx->eflags, preg->newline_anchor);
+					  mctx->eflags);
 	  if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
 	    continue;
 	}
@@ -2483,7 +2478,7 @@ transit_state_bkref (preg, nodes, mctx)
 	  dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
 			  - bkref_ent->subexp_from);
 	  context = re_string_context_at (mctx->input, dest_str_idx - 1,
-					  mctx->eflags, preg->newline_anchor);
+					  mctx->eflags);
 	  dest_state = mctx->state_log[dest_str_idx];
 	  prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
 			: mctx->state_log[cur_str_idx]->nodes.nelem);
@@ -2757,8 +2752,7 @@ check_arrival (preg, mctx, path, top_nod
   mctx->input->cur_idx = str_idx;
 
   /* Setup initial node set.  */
-  context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags,
-				  preg->newline_anchor);
+  context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags);
   if (str_idx == top_str)
     {
       err = re_node_set_init_1 (&next_nodes, top_node);
@@ -2844,8 +2838,7 @@ check_arrival (preg, mctx, path, top_nod
 	      return err;
 	    }
 	}
-      context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags,
-				      preg->newline_anchor);
+      context = re_string_context_at (mctx->input, str_idx - 1, mctx->eflags);
       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
 	{
@@ -3304,7 +3297,7 @@ out_free:
 		;
 
 	      /* j-th destination accepts the word character ch.  */
-	      if (IS_WORD_CHAR (ch))
+	      if (dfa->word_char[i] & mask)
 		trtable[ch] = dest_states_word[j];
 	      else
 		trtable[ch] = dest_states[j];
@@ -3880,8 +3873,7 @@ check_node_accept (preg, node, mctx, idx
       /* The node has constraints.  Check whether the current context
 	 satisfies the constraints.  */
       unsigned int context = re_string_context_at (mctx->input, idx,
-						   mctx->eflags,
-						   preg->newline_anchor);
+						   mctx->eflags);
       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
 	return 0;
     }
--- libc/posix/regex_internal.h.jj	2003-12-28 00:34:45.000000000 +0100
+++ libc/posix/regex_internal.h	2004-01-02 01:13:09.000000000 +0100
@@ -1,5 +1,5 @@
 /* Extended regular expression matching and search library.
-   Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+   Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
 
@@ -121,6 +121,7 @@ extern const size_t __re_error_msgid_idx
 #define BITSET_UINTS ((SBC_MAX + UINT_BITS - 1) / UINT_BITS)
 typedef unsigned int bitset[BITSET_UINTS];
 typedef unsigned int *re_bitset_ptr_t;
+typedef const unsigned int *re_const_bitset_ptr_t;
 
 #define bitset_set(set,i) (set[i / UINT_BITS] |= 1 << i % UINT_BITS)
 #define bitset_clear(set,i) (set[i / UINT_BITS] &= ~(1 << i % UINT_BITS))
@@ -337,12 +338,16 @@ struct re_string_t
   unsigned int tip_context;
   /* The translation passed as a part of an argument of re_compile_pattern.  */
   RE_TRANSLATE_TYPE trans;
+  /* Copy of re_dfa_t's word_char.  */
+  re_const_bitset_ptr_t word_char;
   /* 1 if REG_ICASE.  */
   unsigned char icase;
   unsigned char is_utf8;
   unsigned char map_notascii;
   unsigned char mbs_allocated;
   unsigned char offsets_needed;
+  unsigned char newline_anchor;
+  unsigned char word_ops_used;
   int mb_cur_max;
 };
 typedef struct re_string_t re_string_t;
@@ -368,7 +373,7 @@ static reg_errcode_t re_string_construct
 					  int len, RE_TRANSLATE_TYPE trans,
 					  int icase, const re_dfa_t *dfa) internal_function;
 static reg_errcode_t re_string_reconstruct (re_string_t *pstr, int idx,
-					    int eflags, int newline) internal_function;
+					    int eflags) internal_function;
 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
 						int new_buf_len) internal_function;
 # ifdef RE_ENABLE_I18N
@@ -384,7 +389,7 @@ static inline int re_string_char_size_at
 static inline wint_t re_string_wchar_at (const re_string_t *pstr, int idx) internal_function;
 # endif /* RE_ENABLE_I18N */
 static unsigned int re_string_context_at (const re_string_t *input, int idx,
-					  int eflags, int newline_anchor) internal_function;
+					  int eflags) internal_function;
 static unsigned char re_string_peek_byte_case (const re_string_t *pstr,
 					       int idx) internal_function;
 static unsigned char re_string_fetch_byte_case (re_string_t *pstr) internal_function;
@@ -597,7 +602,6 @@ struct re_fail_stack_t
 
 struct re_dfa_t
 {
-  re_bitset_ptr_t word_char;
   re_subexp_t *subexps;
   re_token_t *nodes;
   int nodes_alloc;
@@ -632,7 +636,9 @@ struct re_dfa_t
   unsigned int has_mb_node : 1;
   unsigned int is_utf8 : 1;
   unsigned int map_notascii : 1;
+  unsigned int word_ops_used : 1;
   int mb_cur_max;
+  bitset word_char;
 #ifdef DEBUG
   char* re_str;
 #endif
--- libc/posix/regex_internal.c.jj	2003-12-28 00:34:31.000000000 +0100
+++ libc/posix/regex_internal.c	2004-01-02 01:03:24.000000000 +0100
@@ -1,5 +1,5 @@
 /* Extended regular expression matching and search library.
-   Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+   Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
 
@@ -67,6 +67,8 @@ re_string_allocate (pstr, str, len, init
   if (BE (ret != REG_NOERROR, 0))
     return ret;
 
+  pstr->word_char = dfa->word_char;
+  pstr->word_ops_used = dfa->word_ops_used;
   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
   pstr->valid_raw_len = pstr->valid_len;
@@ -572,9 +574,9 @@ re_string_translate_buffer (pstr)
    convert to upper case in case of REG_ICASE, apply translation.  */
 
 static reg_errcode_t
-re_string_reconstruct (pstr, idx, eflags, newline)
+re_string_reconstruct (pstr, idx, eflags)
      re_string_t *pstr;
-     int idx, eflags, newline;
+     int idx, eflags;
 {
   int offset = idx - pstr->raw_mbs_idx;
   if (offset < 0)
@@ -609,8 +611,7 @@ re_string_reconstruct (pstr, idx, eflags
 	 )
 	{
 	  /* Yes, move them to the front of the buffer.  */
-	  pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags,
-						    newline);
+	  pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 #ifdef RE_ENABLE_I18N
 	  if (pstr->mb_cur_max > 1)
 	    memmove (pstr->wcs, pstr->wcs + offset,
@@ -695,9 +696,12 @@ re_string_reconstruct (pstr, idx, eflags
 		    memset (pstr->mbs, 255, pstr->valid_len);
 		}
 	      pstr->valid_raw_len = pstr->valid_len;
-	      pstr->tip_context = (IS_WIDE_WORD_CHAR (wc) ? CONTEXT_WORD
-				   : ((newline && IS_WIDE_NEWLINE (wc))
-				      ? CONTEXT_NEWLINE : 0));
+	      pstr->tip_context = (BE (pstr->word_ops_used != 0, 0)
+				   && IS_WIDE_WORD_CHAR (wc))
+				  ? CONTEXT_WORD
+				  : ((IS_WIDE_NEWLINE (wc)
+				      && pstr->newline_anchor)
+				     ? CONTEXT_NEWLINE : 0);
 	    }
 	  else
 #endif /* RE_ENABLE_I18N */
@@ -705,9 +709,10 @@ re_string_reconstruct (pstr, idx, eflags
 	      int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 	      if (pstr->trans)
 		c = pstr->trans[c];
-	      pstr->tip_context = (IS_WORD_CHAR (c) ? CONTEXT_WORD
-				   : ((newline && IS_NEWLINE (c))
-				      ? CONTEXT_NEWLINE : 0));
+	      pstr->tip_context = bitset_contain (pstr->word_char, c)
+				  ? CONTEXT_WORD
+				  : ((IS_NEWLINE (c) && pstr->newline_anchor)
+				     ? CONTEXT_NEWLINE : 0);
 	    }
 	}
       if (!pstr->mbs_allocated)
@@ -843,9 +848,9 @@ re_string_destruct (pstr)
 /* Return the context at IDX in INPUT.  */
 
 static unsigned int
-re_string_context_at (input, idx, eflags, newline_anchor)
+re_string_context_at (input, idx, eflags)
      const re_string_t *input;
-     int idx, eflags, newline_anchor;
+     int idx, eflags;
 {
   int c;
   if (idx < 0 || idx == input->len)
@@ -874,17 +879,18 @@ re_string_context_at (input, idx, eflags
 	    return input->tip_context;
 	}
       wc = input->wcs[wc_idx];
-      if (IS_WIDE_WORD_CHAR (wc))
+      if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 	return CONTEXT_WORD;
-      return (newline_anchor && IS_WIDE_NEWLINE (wc)) ? CONTEXT_NEWLINE : 0;
+      return (IS_WIDE_NEWLINE (wc) && input->newline_anchor)
+	     ? CONTEXT_NEWLINE : 0;
     }
   else
 #endif
     {
       c = re_string_byte_at (input, idx);
-      if (IS_WORD_CHAR (c))
+      if (bitset_contain (input->word_char, c))
 	return CONTEXT_WORD;
-      return (newline_anchor && IS_NEWLINE (c)) ? CONTEXT_NEWLINE : 0;
+      return (IS_NEWLINE (c) && input->newline_anchor) ? CONTEXT_NEWLINE : 0;
     }
 }
 
--- libc/posix/regcomp.c.jj	2003-12-29 01:41:04.000000000 +0100
+++ libc/posix/regcomp.c	2004-01-02 01:03:11.000000000 +0100
@@ -24,7 +24,7 @@ static void re_compile_fastmap_iter (reg
 				     const re_dfastate_t *init_state,
 				     char *fastmap);
 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
-static reg_errcode_t init_word_char (re_dfa_t *dfa);
+static void init_word_char (re_dfa_t *dfa);
 #ifdef RE_ENABLE_I18N
 static void free_charset (re_charset_t *cset);
 #endif /* RE_ENABLE_I18N */
@@ -611,7 +611,6 @@ free_dfa_content (re_dfa_t *dfa)
         re_free (entry->array);
       }
   re_free (dfa->state_table);
-  re_free (dfa->word_char);
 #ifdef RE_ENABLE_I18N
   re_free (dfa->sb_char);
 #endif
@@ -839,7 +838,6 @@ init_dfa (dfa, pat_len)
 
   dfa->subexps_alloc = 1;
   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
-  /* dfa->word_char = NULL; */
 
   dfa->mb_cur_max = MB_CUR_MAX;
 #ifdef _LIBC
@@ -879,19 +877,16 @@ init_dfa (dfa, pat_len)
    "word".  In this case "word" means that it is the word construction
    character used by some operators like "\<", "\>", etc.  */
 
-static reg_errcode_t
+static void
 init_word_char (dfa)
      re_dfa_t *dfa;
 {
   int i, j, ch;
-  dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
-  if (BE (dfa->word_char == NULL, 0))
-    return REG_ESPACE;
+  dfa->word_ops_used = 1;
   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
     for (j = 0; j < UINT_BITS; ++j, ++ch)
       if (isalnum (ch) || ch == '_')
 	dfa->word_char[i] |= 1 << j;
-  return REG_NOERROR;
 }
 
 /* Free the work area which are only used while compiling.  */
@@ -2191,12 +2186,8 @@ parse_expression (regexp, preg, token, s
     case ANCHOR:
       if ((token->opr.ctx_type
 	   & (WORD_DELIM | INSIDE_WORD | WORD_FIRST | WORD_LAST))
-	  && dfa->word_char == NULL)
-	{
-	  *err = init_word_char (dfa);
-	  if (BE (*err != REG_NOERROR, 0))
-	    return NULL;
-	}
+	  && dfa->word_ops_used == 0)
+	init_word_char (dfa);
       if (token->opr.ctx_type == WORD_DELIM)
 	{
 	  bin_tree_t *tree_first, *tree_last;


More information about the Libc-alpha mailing list