This is the mail archive of the
libc-alpha@sources.redhat.com
mailing list for the glibc project.
Re: [PATCH] More regex fixes and testcases
- From: Paolo Bonzini <paolo dot bonzini at polimi dot it>
- To: libc-alpha at sources dot redhat dot com
- Date: Mon, 06 Oct 2003 19:22:26 +0200
- Subject: Re: [PATCH] More regex fixes and testcases
- References: <3F815C2C.9080006@polimi.it>
- Reply-to: bonzini at gnu dot org
This fixes the issues that Jakub found in his review.
Paolo
2003-10-05 Paolo Bonzini <bonzini@gnu.org>
* posix/bug-regex11.c: Add three failing testcases.
--- libc/posix/bug-regex11.c 2003-10-06 11:52:23.000000000 +0200
+++ bug-regex11.c 2003-10-06 12:06:23.000000000 +0200
@@ -57,8 +57,11 @@
{ "()\\1*\\1*", "", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } },
{ "([0-9]).*\\1(a*)", "7;7a6", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } },
{ "([0-9]).*\\1(a*)", "7;7a", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } },
+ { "()(b)\\1c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 1, 2 } } },
+ { "(b())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 2 }, { 0, 1 }, { 1, 1 } } },
+ { "(bb())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 4 }, { 0, 2 }, { 2, 2 } } },
};
int
main (void)
{
2003-10-05 Paolo Bonzini <bonzini@gnu.org>
* posix/bug-regex11.c: Add more backreference-related
test cases.
(main): Show the failing regex in the error messages.
* posix/regexec.c (check_dst_limits_calc_pos):
Simplify some nested conditionals. Replace if's with
a switch statement.
(check_dst_limits_calc_pos <TYPE_BKREF>): Rename
parameter NODE to FROM_NODE, it shadows a local
variable; don't recurse if FROM_NODE does not change
in the recursive invocation, fixing an infinite loop
in the ()\1*\1* regex.
(sift_states_backward): Fix function comment.
* posix/regcomp.c (calc_epsdest): Add an assertion.
diff -u -p -b -r1.6 bug-regex11.c
--- libc/posix/bug-regex11.c 2 Oct 2003 22:40:16 -0000 1.6
+++ libc/posix/bug-regex11.c 6 Oct 2003 09:53:38 -0000
@@ -35,6 +35,7 @@ struct
/* Test for newline handling in regex. */
{ "[^~]*~", "\nx~y", 0, 2, { { 0, 3 }, { -1, -1 } } },
/* Other tests. */
+ { "a(.*)b", "a b", REG_EXTENDED, 2, { { 0, 3 }, { 1, 2 } } },
{ ".*|\\([KIO]\\)\\([^|]*\\).*|?[KIO]", "10~.~|P|K0|I10|O16|?KSb", 0, 3,
{ { 0, 21 }, { 15, 16 }, { 16, 18 } } },
{ ".*|\\([KIO]\\)\\([^|]*\\).*|?\\1", "10~.~|P|K0|I10|O16|?KSb", 0, 3,
@@ -52,6 +53,10 @@ struct
/* Here ^ cannot be treated as an anchor according to POSIX. */
{ "(^|foo)bar", "(^|foo)bar", 0, 2, { { 0, 10 }, { -1, -1 } } },
{ "(foo|^)bar", "(foo|^)bar", 0, 2, { { 0, 10 }, { -1, -1 } } },
+ /* More tests on backreferences. */
+ { "()\\1*\\1*", "", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } },
+ { "([0-9]).*\\1(a*)", "7;7a6", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } },
+ { "([0-9]).*\\1(a*)", "7;7a", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } },
};
int
@@ -71,14 +76,14 @@ main (void)
{
char buf[500];
regerror (n, &re, buf, sizeof (buf));
- printf ("regcomp %zd failed: %s\n", i, buf);
+ printf ("%s: regcomp %zd failed: %s\n", tests[i].pattern, i, buf);
ret = 1;
continue;
}
if (regexec (&re, tests[i].string, tests[i].nmatch, rm, 0))
{
- printf ("regexec %zd failed\n", i);
+ printf ("%s: regexec %zd failed\n", tests[i].pattern, i);
ret = 1;
regfree (&re);
continue;
@@ -90,8 +95,8 @@ main (void)
{
if (tests[i].rm[n].rm_so == -1 && tests[i].rm[n].rm_eo == -1)
break;
- printf ("regexec match failure rm[%d] %d..%d\n",
- n, rm[n].rm_so, rm[n].rm_eo);
+ printf ("%s: regexec %zd match failure rm[%d] %d..%d\n",
+ tests[i].pattern, i, n, rm[n].rm_so, rm[n].rm_eo);
ret = 1;
break;
}
diff -u -p -b -r1.42 regcomp.c
--- libc/posix/regcomp.c 2 Oct 2003 22:39:46 -0000 1.42
+++ libc/posix/regcomp.c 6 Oct 2003 09:53:39 -0000
@@ -1149,6 +1149,8 @@ calc_epsdest (dfa, node)
|| dfa->nodes[idx].type == OP_CLOSE_SUBEXP
|| dfa->nodes[idx].type == OP_BACK_REF)
re_node_set_init_1 (dfa->edests + idx, node->next);
+ else
+ assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
}
}
diff -u -p -b -r1.27 regexec.c
--- libc/posix/regexec.c 23 Sep 2003 05:30:23 -0000 1.27
+++ libc/posix/regexec.c 6 Oct 2003 09:53:39 -0000
@@ -1387,7 +1387,7 @@ update_regs (dfa, pmatch, cur_node, cur_
away the node `a'.
ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
throwed away, we throw away the node `a'.
- 3. When 0 <= STR_IDX < n and 'a' epsilon transit to 'b':
+ 3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
node `a'.
ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away,
@@ -1724,60 +1724,98 @@ check_dst_limits (dfa, limits, mctx, dst
}
static int
-check_dst_limits_calc_pos (dfa, mctx, limit, eclosures, subexp_idx, node,
+check_dst_limits_calc_pos (dfa, mctx, limit, eclosures, subexp_idx, from_node,
str_idx)
re_dfa_t *dfa;
re_match_context_t *mctx;
re_node_set *eclosures;
- int limit, subexp_idx, node, str_idx;
+ int limit, subexp_idx, from_node, str_idx;
{
struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
- int pos = (str_idx < lim->subexp_from ? -1
- : (lim->subexp_to < str_idx ? 1 : 0));
- if (pos == 0
- && (str_idx == lim->subexp_from || str_idx == lim->subexp_to))
- {
int node_idx;
+
+ /* If we are outside the range of the subexpression, return -1 or 1. */
+ if (str_idx < lim->subexp_from)
+ return -1;
+
+ if (lim->subexp_to < str_idx)
+ return 1;
+
+ /* If we are within the subexpression, return 0. */
+ if (str_idx != lim->subexp_from && str_idx != lim->subexp_to)
+ return 0;
+
+ /* Else, we are on the boundary: examine the nodes on the epsilon
+ closure. */
for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
{
int node = eclosures->elems[node_idx];
- re_token_type_t type= dfa->nodes[node].type;
- if (type == OP_BACK_REF)
+ switch (dfa->nodes[node].type)
+ {
+ case OP_BACK_REF:
{
int bi = search_cur_bkref_entry (mctx, str_idx);
for (; bi < mctx->nbkref_ents; ++bi)
{
struct re_backref_cache_entry *ent = mctx->bkref_ents + bi;
+ int dst, cpos;
+
+ /* If this backreference goes beyond the point we're
+ examining, don't go any further. */
if (ent->str_idx > str_idx)
break;
- if (ent->node == node && ent->subexp_from == ent->subexp_to)
- {
- int cpos, dst;
+
+ if (ent->node != node || ent->subexp_from != ent->subexp_to)
+ continue;
+
+ /* Recurse trying to reach the OP_OPEN_SUBEXP and
+ OP_CLOSE_SUBEXP cases below. But, if the
+ destination node is the same node as the source
+ node, don't recurse because it would cause an
+ infinite loop: a regex that exhibits this behavior
+ is ()\1*\1* */
dst = dfa->edests[node].elems[0];
+ if (dst == from_node)
+ {
+ if (str_idx == lim->subexp_from)
+ return -1;
+ else /* if (str_idx == lim->subexp_to) */
+ return 0;
+ }
+
cpos = check_dst_limits_calc_pos (dfa, mctx, limit,
dfa->eclosures + dst,
subexp_idx, dst,
str_idx);
- if ((str_idx == lim->subexp_from && cpos == -1)
- || (str_idx == lim->subexp_to && cpos == 0))
- return cpos;
- }
- }
+
+ if (cpos == -1 && str_idx == lim->subexp_from)
+ return -1;
+
+ if (cpos == 0 /* && str_idx == lim->lim->subexp_to */)
+ return 0;
}
- if (type == OP_OPEN_SUBEXP && subexp_idx == dfa->nodes[node].opr.idx
- && str_idx == lim->subexp_from)
- {
- pos = -1;
break;
}
- if (type == OP_CLOSE_SUBEXP && subexp_idx == dfa->nodes[node].opr.idx
- && str_idx == lim->subexp_to)
+
+ case OP_OPEN_SUBEXP:
+ if (str_idx == lim->subexp_from && subexp_idx == dfa->nodes[node].opr.idx)
+ return -1;
+ break;
+
+ case OP_CLOSE_SUBEXP:
+ if (str_idx == lim->subexp_to && subexp_idx == dfa->nodes[node].opr.idx)
+ return 0;
+ break;
+
+ default:
break;
}
- if (node_idx == eclosures->nelem && str_idx == lim->subexp_to)
- pos = 1;
}
- return pos;
+
+ if (str_idx == lim->subexp_to)
+ return 1;
+ else
+ return 0;
}
/* Check the limitations of sub expressions LIMITS, and remove the nodes