This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]

[regex] Small cleanup in subexpression handling

From: Paolo Bonzini <paolo dot bonzini at lu dot unisi dot ch>
To: Glibc Development <libc-alpha at sources dot redhat dot com>
Date: Wed, 01 Dec 2004 12:24:29 +0100
Subject: [regex] Small cleanup in subexpression handling

This cleanup has two parts. The first is the remove re_dfa_t's subexps field, which is only used to test for backreferences used within their definition like in (x\1). It can be replaced by a simple bitset.

The second is to make OP_BACK_REF 0-based (i.e. \1 corresponds to opr.idx==0) like OP_OPEN_SUBEXP and OP_CLOSE_SUBEXP are. This eliminates a good deal of compensation in regexec.c, with no runtime effects but simplifying some expression.

Since I was at it, I removed a duplicate conditional in check_subexp_limits. Outer ifs guarantee that (type == OP_OPEN_SUBEXP || type == OP_CLOSE_SUBEXP) && ent->subexp_to != str_idx.

Tested with valgrind to avoid stupid off-by-one mistakes like the one Jakub found yesterday.

Paolo

2004-12-01  Paolo Bonzini  <bonzini@gnu.org>

	* posix/regcomp.c (free_dfa_content, init_dfa): Remove
	references to re_dfa_t's subexps field.
	(parse_sub_exp, parse_expression): Do not use it.  Use
	completed_bkref_map instead.
	(create_initial_state, peek_token): Store a backreference \N
	with opr.idx = N-1.
	* posix/regexec.c (proceed_next_node, check_dst_limits,
	get_subexp): Likewise.
	(check_subexp_limits): Remove useless condition.
	* posix/regex_internal.h (re_subexp_t): Remove.
	(re_dfa_t): Remove subexps and subexps_alloc field, add
	completed_bkref_map.

--- orig/lib/regcomp.c
+++ mod/lib/regcomp.c
@@ -596,8 +596,6 @@ free_dfa_content (re_dfa_t *dfa)
 {
   int i, j;
 
-  re_free (dfa->subexps);
-
   if (dfa->nodes)
     for (i = 0; i < dfa->nodes_len; ++i)
       {
@@ -884,9 +882,6 @@ init_dfa (dfa, pat_len)
   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
   dfa->state_hash_mask = table_size - 1;
 
-  dfa->subexps_alloc = 1;
-  dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
-
   dfa->mb_cur_max = MB_CUR_MAX;
 #ifdef _LIBC
   if (dfa->mb_cur_max == 6
@@ -950,8 +945,7 @@ init_dfa (dfa, pat_len)
     }
 #endif
 
-  if (BE (dfa->nodes == NULL || dfa->state_table == NULL
-	  || dfa->subexps == NULL, 0))
+  if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
     return REG_ESPACE;
   return REG_NOERROR;
 }
@@ -1028,7 +1022,7 @@ create_initial_state (dfa)
 	    re_token_t *clexp_node;
 	    clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 	    if (clexp_node->type == OP_CLOSE_SUBEXP
-		&& clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
+		&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 	      break;
 	  }
 	if (clexp_idx == init_nodes.nelem)
@@ -1837,7 +1831,7 @@ peek_token (token, input, syntax)
 	  if (!(syntax & RE_NO_BK_REFS))
 	    {
 	      token->type = OP_BACK_REF;
-	      token->opr.idx = c2 - '0';
+	      token->opr.idx = c2 - '1';
 	    }
 	  break;
 	case '<':
@@ -2295,13 +2289,12 @@ parse_expression (regexp, preg, token, s
 	return NULL;
       break;
     case OP_BACK_REF:
-      if (BE (preg->re_nsub < token->opr.idx
-	      || dfa->subexps[token->opr.idx - 1].end == -1, 0))
+      if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
 	{
 	  *err = REG_ESUBREG;
 	  return NULL;
 	}
-      dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
+      dfa->used_bkref_map |= 1 << token->opr.idx;
       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
       if (BE (tree == NULL, 0))
 	{
@@ -2472,21 +2465,6 @@ parse_sub_exp (regexp, preg, token, synt
   bin_tree_t *tree, *left_par, *right_par;
   size_t cur_nsub;
   cur_nsub = preg->re_nsub++;
-  if (BE (dfa->subexps_alloc < preg->re_nsub, 0))
-    {
-      re_subexp_t *new_array;
-      dfa->subexps_alloc *= 2;
-      new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
-      if (BE (new_array == NULL, 0))
-	{
-	  dfa->subexps_alloc /= 2;
-	  *err = REG_ESPACE;
-	  return NULL;
-	}
-      dfa->subexps = new_array;
-    }
-  dfa->subexps[cur_nsub].start = dfa->nodes_len;
-  dfa->subexps[cur_nsub].end = -1;
 
   left_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
   if (BE (left_par == NULL, 0))
@@ -2512,7 +2490,7 @@ parse_sub_exp (regexp, preg, token, synt
       return NULL;
     }
   right_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
-  dfa->subexps[cur_nsub].end = dfa->nodes_len;
+  dfa->completed_bkref_map |= 1 << cur_nsub;
   tree = ((tree == NULL) ? right_par
 	  : create_tree (dfa, tree, right_par, CONCAT, 0));
   tree = create_tree (dfa, left_par, tree, CONCAT, 0);


--- orig/lib/regex_internal.h
+++ mod/lib/regex_internal.h
@@ -495,13 +495,6 @@ struct re_dfastate_t
 };
 typedef struct re_dfastate_t re_dfastate_t;
 
-typedef struct
-{
-  /* start <= node < end  */
-  int start;
-  int end;
-} re_subexp_t;
-
 struct re_state_table_entry
 {
   int num;
@@ -606,7 +599,6 @@ struct re_fail_stack_t
 
 struct re_dfa_t
 {
-  re_subexp_t *subexps;
   re_token_t *nodes;
   int nodes_alloc;
   int nodes_len;
@@ -626,13 +618,15 @@ struct re_dfa_t
   int str_tree_storage_idx;
 
   /* number of subexpressions `re_nsub' is in regex_t.  */
-  int subexps_alloc;
   unsigned int state_hash_mask;
   int states_alloc;
   int init_node;
   int nbackref; /* The number of backreference in this dfa.  */
+
   /* Bitmap expressing which backreference is used.  */
   unsigned int used_bkref_map;
+  unsigned int completed_bkref_map;
+
   unsigned int has_plural_match : 1;
   /* If this dfa has "multibyte node", which is a backreference or
      a node which can accept multibyte character or multi character


--- orig/lib/regexec.c
+++ mod/lib/regexec.c
@@ -1260,7 +1260,7 @@ proceed_next_node (mctx, nregs, regs, pi
 #endif /* RE_ENABLE_I18N */
       if (type == OP_BACK_REF)
 	{
-	  int subexp_idx = dfa->nodes[node].opr.idx;
+	  int subexp_idx = dfa->nodes[node].opr.idx + 1;
 	  naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
 	  if (fs != NULL)
 	    {
@@ -1853,7 +1853,7 @@ check_dst_limits (mctx, limits, dst_node
       int subexp_idx;
       struct re_backref_cache_entry *ent;
       ent = mctx->bkref_ents + limits->elems[lim_idx];
-      subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
+      subexp_idx = dfa->nodes[ent->node].opr.idx;
 
       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
 					   subexp_idx, dst_node, dst_idx,
@@ -1902,8 +1902,7 @@ check_dst_limits_calc_pos_1 (mctx, bound
 		    continue;
 
 		  if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
-		      && (ent->eps_reachable_subexps_map
-			  & (1 << (subexp_idx - 1))) == 0)
+		      && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
 		    continue;
 
 		  /* Recurse trying to reach the OP_OPEN_SUBEXP and
@@ -1929,7 +1928,7 @@ check_dst_limits_calc_pos_1 (mctx, bound
 		  if (cpos == 0 && (boundaries & 2))
 		    return 0;
 
-		  ent->eps_reachable_subexps_map &= ~(1 << (subexp_idx - 1));
+		  ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
 	        }
 	      while (ent++->more);
 	    }
@@ -2003,7 +2002,7 @@ check_subexp_limits (dfa, dest_nodes, ca
       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
 	continue; /* This is unrelated limitation.  */
 
-      subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
+      subexp_idx = dfa->nodes[ent->node].opr.idx;
       if (ent->subexp_to == str_idx)
 	{
 	  int ops_node = -1;
@@ -2060,16 +2059,12 @@ check_subexp_limits (dfa, dest_nodes, ca
 		{
 		  if (subexp_idx != dfa->nodes[node].opr.idx)
 		    continue;
-		  if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
-		      || (type == OP_OPEN_SUBEXP))
-		    {
-		      /* It is against this limitation.
-			 Remove it form the current sifted state.  */
-		      err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
-						   candidates);
-		      if (BE (err != REG_NOERROR, 0))
-			return err;
-		    }
+		  /* It is against this limitation.
+		     Remove it form the current sifted state.  */
+		  err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
+					       candidates);
+		  if (BE (err != REG_NOERROR, 0))
+		    return err;
 		}
 	    }
 	}
@@ -2657,7 +2652,7 @@ get_subexp (mctx, bkref_node, bkref_str_
       while (entry++->more);
     }
 
-  subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
+  subexp_num = dfa->nodes[bkref_node].opr.idx;
 
   /* For each sub expression  */
   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)

Follow-Ups:
- Re: [regex] Small cleanup in subexpression handling
  - From: Ulrich Drepper

Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]