This is the mail archive of the
libc-alpha@sources.redhat.com
mailing list for the glibc project.
[regex] Small cleanup in subexpression handling
- From: Paolo Bonzini <paolo dot bonzini at lu dot unisi dot ch>
- To: Glibc Development <libc-alpha at sources dot redhat dot com>
- Date: Wed, 01 Dec 2004 12:24:29 +0100
- Subject: [regex] Small cleanup in subexpression handling
This cleanup has two parts. The first is the remove re_dfa_t's subexps
field, which is only used to test for backreferences used within their
definition like in (x\1). It can be replaced by a simple bitset.
The second is to make OP_BACK_REF 0-based (i.e. \1 corresponds to
opr.idx==0) like OP_OPEN_SUBEXP and OP_CLOSE_SUBEXP are. This
eliminates a good deal of compensation in regexec.c, with no runtime
effects but simplifying some expression.
Since I was at it, I removed a duplicate conditional in
check_subexp_limits. Outer ifs guarantee that (type == OP_OPEN_SUBEXP
|| type == OP_CLOSE_SUBEXP) && ent->subexp_to != str_idx.
Tested with valgrind to avoid stupid off-by-one mistakes like the one
Jakub found yesterday.
Paolo
2004-12-01 Paolo Bonzini <bonzini@gnu.org>
* posix/regcomp.c (free_dfa_content, init_dfa): Remove
references to re_dfa_t's subexps field.
(parse_sub_exp, parse_expression): Do not use it. Use
completed_bkref_map instead.
(create_initial_state, peek_token): Store a backreference \N
with opr.idx = N-1.
* posix/regexec.c (proceed_next_node, check_dst_limits,
get_subexp): Likewise.
(check_subexp_limits): Remove useless condition.
* posix/regex_internal.h (re_subexp_t): Remove.
(re_dfa_t): Remove subexps and subexps_alloc field, add
completed_bkref_map.
--- orig/lib/regcomp.c
+++ mod/lib/regcomp.c
@@ -596,8 +596,6 @@ free_dfa_content (re_dfa_t *dfa)
{
int i, j;
- re_free (dfa->subexps);
-
if (dfa->nodes)
for (i = 0; i < dfa->nodes_len; ++i)
{
@@ -884,9 +882,6 @@ init_dfa (dfa, pat_len)
dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
dfa->state_hash_mask = table_size - 1;
- dfa->subexps_alloc = 1;
- dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
-
dfa->mb_cur_max = MB_CUR_MAX;
#ifdef _LIBC
if (dfa->mb_cur_max == 6
@@ -950,8 +945,7 @@ init_dfa (dfa, pat_len)
}
#endif
- if (BE (dfa->nodes == NULL || dfa->state_table == NULL
- || dfa->subexps == NULL, 0))
+ if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
return REG_ESPACE;
return REG_NOERROR;
}
@@ -1028,7 +1022,7 @@ create_initial_state (dfa)
re_token_t *clexp_node;
clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
if (clexp_node->type == OP_CLOSE_SUBEXP
- && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
+ && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
break;
}
if (clexp_idx == init_nodes.nelem)
@@ -1837,7 +1831,7 @@ peek_token (token, input, syntax)
if (!(syntax & RE_NO_BK_REFS))
{
token->type = OP_BACK_REF;
- token->opr.idx = c2 - '0';
+ token->opr.idx = c2 - '1';
}
break;
case '<':
@@ -2295,13 +2289,12 @@ parse_expression (regexp, preg, token, s
return NULL;
break;
case OP_BACK_REF:
- if (BE (preg->re_nsub < token->opr.idx
- || dfa->subexps[token->opr.idx - 1].end == -1, 0))
+ if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
{
*err = REG_ESUBREG;
return NULL;
}
- dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
+ dfa->used_bkref_map |= 1 << token->opr.idx;
tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
if (BE (tree == NULL, 0))
{
@@ -2472,21 +2465,6 @@ parse_sub_exp (regexp, preg, token, synt
bin_tree_t *tree, *left_par, *right_par;
size_t cur_nsub;
cur_nsub = preg->re_nsub++;
- if (BE (dfa->subexps_alloc < preg->re_nsub, 0))
- {
- re_subexp_t *new_array;
- dfa->subexps_alloc *= 2;
- new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
- if (BE (new_array == NULL, 0))
- {
- dfa->subexps_alloc /= 2;
- *err = REG_ESPACE;
- return NULL;
- }
- dfa->subexps = new_array;
- }
- dfa->subexps[cur_nsub].start = dfa->nodes_len;
- dfa->subexps[cur_nsub].end = -1;
left_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
if (BE (left_par == NULL, 0))
@@ -2512,7 +2490,7 @@ parse_sub_exp (regexp, preg, token, synt
return NULL;
}
right_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
- dfa->subexps[cur_nsub].end = dfa->nodes_len;
+ dfa->completed_bkref_map |= 1 << cur_nsub;
tree = ((tree == NULL) ? right_par
: create_tree (dfa, tree, right_par, CONCAT, 0));
tree = create_tree (dfa, left_par, tree, CONCAT, 0);
--- orig/lib/regex_internal.h
+++ mod/lib/regex_internal.h
@@ -495,13 +495,6 @@ struct re_dfastate_t
};
typedef struct re_dfastate_t re_dfastate_t;
-typedef struct
-{
- /* start <= node < end */
- int start;
- int end;
-} re_subexp_t;
-
struct re_state_table_entry
{
int num;
@@ -606,7 +599,6 @@ struct re_fail_stack_t
struct re_dfa_t
{
- re_subexp_t *subexps;
re_token_t *nodes;
int nodes_alloc;
int nodes_len;
@@ -626,13 +618,15 @@ struct re_dfa_t
int str_tree_storage_idx;
/* number of subexpressions `re_nsub' is in regex_t. */
- int subexps_alloc;
unsigned int state_hash_mask;
int states_alloc;
int init_node;
int nbackref; /* The number of backreference in this dfa. */
+
/* Bitmap expressing which backreference is used. */
unsigned int used_bkref_map;
+ unsigned int completed_bkref_map;
+
unsigned int has_plural_match : 1;
/* If this dfa has "multibyte node", which is a backreference or
a node which can accept multibyte character or multi character
--- orig/lib/regexec.c
+++ mod/lib/regexec.c
@@ -1260,7 +1260,7 @@ proceed_next_node (mctx, nregs, regs, pi
#endif /* RE_ENABLE_I18N */
if (type == OP_BACK_REF)
{
- int subexp_idx = dfa->nodes[node].opr.idx;
+ int subexp_idx = dfa->nodes[node].opr.idx + 1;
naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
if (fs != NULL)
{
@@ -1853,7 +1853,7 @@ check_dst_limits (mctx, limits, dst_node
int subexp_idx;
struct re_backref_cache_entry *ent;
ent = mctx->bkref_ents + limits->elems[lim_idx];
- subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
+ subexp_idx = dfa->nodes[ent->node].opr.idx;
dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
subexp_idx, dst_node, dst_idx,
@@ -1902,8 +1902,7 @@ check_dst_limits_calc_pos_1 (mctx, bound
continue;
if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
- && (ent->eps_reachable_subexps_map
- & (1 << (subexp_idx - 1))) == 0)
+ && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
continue;
/* Recurse trying to reach the OP_OPEN_SUBEXP and
@@ -1929,7 +1928,7 @@ check_dst_limits_calc_pos_1 (mctx, bound
if (cpos == 0 && (boundaries & 2))
return 0;
- ent->eps_reachable_subexps_map &= ~(1 << (subexp_idx - 1));
+ ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
}
while (ent++->more);
}
@@ -2003,7 +2002,7 @@ check_subexp_limits (dfa, dest_nodes, ca
if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
continue; /* This is unrelated limitation. */
- subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
+ subexp_idx = dfa->nodes[ent->node].opr.idx;
if (ent->subexp_to == str_idx)
{
int ops_node = -1;
@@ -2060,16 +2059,12 @@ check_subexp_limits (dfa, dest_nodes, ca
{
if (subexp_idx != dfa->nodes[node].opr.idx)
continue;
- if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
- || (type == OP_OPEN_SUBEXP))
- {
- /* It is against this limitation.
- Remove it form the current sifted state. */
- err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
- candidates);
- if (BE (err != REG_NOERROR, 0))
- return err;
- }
+ /* It is against this limitation.
+ Remove it form the current sifted state. */
+ err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
+ candidates);
+ if (BE (err != REG_NOERROR, 0))
+ return err;
}
}
}
@@ -2657,7 +2652,7 @@ get_subexp (mctx, bkref_node, bkref_str_
while (entry++->more);
}
- subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
+ subexp_num = dfa->nodes[bkref_node].opr.idx;
/* For each sub expression */
for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)