From fae72712d33b8760aa826a40f00319a4b845fd4c Mon Sep 17 00:00:00 2001 From: Serguei Makarov Date: Mon, 21 Jan 2013 14:15:00 -0500 Subject: [PATCH] PR11334 partial support: ^ anchoring operator $ is not yet completely implemented --- re2c-migrate/re2c-dfa.cxx | 20 ++++++++++++++----- re2c-migrate/re2c-regex.cxx | 27 ++++++++++++++++++++++++++ re2c-migrate/re2c-regex.h | 25 ++++++++++++++++++++++++ re2c-migrate/regcomp.base/regtest.in.0 | 10 ++++++++++ re2c-migrate/stapregex.cxx | 14 ++++++++++--- 5 files changed, 88 insertions(+), 8 deletions(-) diff --git a/re2c-migrate/re2c-dfa.cxx b/re2c-migrate/re2c-dfa.cxx index 2c1e02fd0..092c7c4f2 100644 --- a/re2c-migrate/re2c-dfa.cxx +++ b/re2c-migrate/re2c-dfa.cxx @@ -234,7 +234,7 @@ State::~State() /* Mark all Ins reachable from a given point without slurping a character. The list of Ins locations gets written out to a given work array starting at cP. */ -static Ins **closure(Ins **cP, Ins *i) +static Ins **closure(Ins **cP, Ins *i, bool isInitial) { while (!isMarked(i)) { @@ -243,13 +243,22 @@ static Ins **closure(Ins **cP, Ins *i) if (i->i.tag == FORK) { - cP = closure(cP, i + 1); + cP = closure(cP, i + 1, isInitial); i = (Ins*) i->i.link; } else if (i->i.tag == GOTO || i->i.tag == CTXT) { i = (Ins*) i->i.link; } + else if (i->i.tag == INIT && isInitial) + { + /* TODOXXX The kernel of an initial vs. a + non-initial state is distinguished not by + the presence of the INIT itself, but by the + presence of additional kernel Ins on the + other side of the INIT. */ + i = (Ins*) i->i.link; + } else break; } @@ -280,8 +289,9 @@ DFA::DFA(Ins *ins, unsigned ni, unsigned lb, unsigned ub, const Char *rep) memset((char*) goTo, 0, nc*sizeof(GoTo)); /* Build the initial state, based on the set of Ins - that can be reached from &ins[0]. */ - findState(work, closure(work, &ins[0]) - work); + that can be reached from &ins[0], including Ins + reachable only through ^/INIT operations. */ + findState(work, closure(work, &ins[0], true) - work); while (toDo) { @@ -323,7 +333,7 @@ DFA::DFA(Ins *ins, unsigned ni, unsigned lb, unsigned ub, const Char *rep) i = (Ins*) go->to; for (cP = work; i; i = (Ins*) i->c.link) - cP = closure(cP, i + i->c.bump); + cP = closure(cP, i + i->c.bump, false); go->to = findState(work, cP - work); } diff --git a/re2c-migrate/re2c-regex.cxx b/re2c-migrate/re2c-regex.cxx index 1de8e430f..51947762b 100644 --- a/re2c-migrate/re2c-regex.cxx +++ b/re2c-migrate/re2c-regex.cxx @@ -71,6 +71,10 @@ const Ins* showIns(std::ostream &o, const Ins &i, const Ins &base) o << "fork " << ((Ins*) i.i.link - &base); break; + case INIT: + o << "init " << ((Ins*) i.i.link - &base); + break; + case CTXT: o << "ctxt"; break; @@ -331,6 +335,29 @@ void MatchOp::split(CharSet &s) s.fix->nxt = NULL; } +const char *AnchorOp::type = "AnchorOp"; + +void AnchorOp::calcSize(Char *rep) +{ + size = 1; +} + +unsigned AnchorOp::fixedLength() +{ + return 0; +} + +void AnchorOp::compile(Char *rep, Ins *i) +{ + i->i.tag = INIT; + i->i.link = &i[1]; +} + +void AnchorOp::split(CharSet &s) +{ + ; +} + RegExp * mkDiff(RegExp *e1, RegExp *e2) { MatchOp *m1, *m2; diff --git a/re2c-migrate/re2c-regex.h b/re2c-migrate/re2c-regex.h index dcba49d18..b55637590 100644 --- a/re2c-migrate/re2c-regex.h +++ b/re2c-migrate/re2c-regex.h @@ -23,6 +23,8 @@ const unsigned GOTO = 1; const unsigned FORK = 2; const unsigned TERM = 3; const unsigned CTXT = 4; +const unsigned INIT = 5; // for ^ operator +// TODOXXX const unsigned ENDI = 6; // for $ operator?? union Ins { @@ -331,6 +333,29 @@ private: #endif }; +// TODOXXX for now, only supports left-anchoring (^) +class AnchorOp: public RegExp +{ + +public: + static const char *type; + +public: + const char *typeOf() + { + return type; + } + + void split(CharSet&); + void calcSize(Char*); + unsigned fixedLength(); + void compile(Char*, Ins*); + void display(std::ostream &o) const + { + o << "^"; + } +}; + class RuleOp: public RegExp { public: diff --git a/re2c-migrate/regcomp.base/regtest.in.0 b/re2c-migrate/regcomp.base/regtest.in.0 index eb7e15e21..1d9f1f08d 100644 --- a/re2c-migrate/regcomp.base/regtest.in.0 +++ b/re2c-migrate/regcomp.base/regtest.in.0 @@ -12,6 +12,16 @@ # 2 means regex compilation should fail (cleanly): 2:[:abc +# FIXME -- initial tests just for left-anchoring +0:: +0::abc +0:^abc:abc +0:^:abc +1:a^:abc +1:^abc:zabc +1:ab^c:abc +0:a*^b:b + # FIXME -- matching should not be left-anchored by default 0:abc:zabc diff --git a/re2c-migrate/stapregex.cxx b/re2c-migrate/stapregex.cxx index 78db7cf41..347367ac7 100644 --- a/re2c-migrate/stapregex.cxx +++ b/re2c-migrate/stapregex.cxx @@ -173,7 +173,8 @@ stapdfa::stapdfa (const string& func_name, const string& re) : orig_input(re), func_name(func_name) { if (!failRE) { - regex_parser p("[\\000-\\377]"); + //regex_parser p("[\\000-\\377]"); + regex_parser p(""); failRE = p.parse(); } if (!padRE) { @@ -186,6 +187,7 @@ stapdfa::stapdfa (const string& func_name, const string& re) // compile ast to DFA content = genCode (ast); + // cerr << content; content->prepare(); } @@ -439,9 +441,15 @@ regex_parser::parse_factor () result = parse_expr (); expect (')'); } - else if (c == '^' || c == '$') + else if (c == '^') { - parse_error(_F("FIXME -- '%c' not yet supported", c)); + result = new AnchorOp(); + } + else if (c == '$') + { + // TEMPORARY HACK + // parse_error(_F("FIXME -- '%c' not yet supported", c)); + result = sc->ranToRE(SubStr("[\\000]")); } else // escaped or ordinary character -- not yet swallowed { -- 2.43.5