]>
sourceware.org Git - systemtap.git/blob - stapregex-tree.h
2 // Copyright (C) 2012-2013 Red Hat Inc.
4 // This file is part of systemtap, and is free software. You can
5 // redistribute it and/or modify it under the terms of the GNU General
6 // Public License (GPL); either version 2, or (at your option) any
11 // This file incorporates code from the re2c project; please see
12 // the file README.stapregex for details.
14 #ifndef STAPREGEX_TREE_H
15 #define STAPREGEX_TREE_H
22 #include "stapregex-defines.h"
24 // Numbering scheme for tags representing start and end of n'th subexpression:
25 #define TAG_START(n) (2*(n))
26 #define TAG_END(n) (2*(n)+1)
30 typedef std::pair
<rchar
, rchar
> segment
;
33 std::deque
<segment
> segments
; // -- [lb, ub], XXX sorted ascending
35 range () {} // -- empty range
36 range (rchar lb
, rchar ub
); // -- a segment [lb, ub]
37 range (const std::string
& str
); // -- character class (no named entities)
39 void print(std::ostream
& o
) const;
42 std::ostream
& operator<< (std::ostream
&, const range
&);
43 std::ostream
& operator<< (std::ostream
&, const range
*);
45 // NB: be sure to deallocate the old ranges if they are no longer used
46 range
*range_union(range
*a
, range
*b
);
47 range
*range_invert(range
*ran
);
49 // ------------------------------------------------------------------------
51 /* For the NFA representation, re2c uses an assembler-like notation,
52 which should be easy enough to understand based on the meaning of
53 FORK. An NFA is considered to be a tagged-NFA if it uses the TAG opcode.
55 The only tricky thing here is instituting sensible defaults for
56 tagged-NFA transition priorities. This is done by setting i->param = 0
57 on a FORK instruction if the FORK-target has lower priority, and
58 i->param = 1 if it has higher priority. FORK is the only place
59 where discriminating between transitions needs to be done. */
61 /* Opcodes for the assembly notation: */
62 const unsigned CHAR
= 0; // -- match character set (one successful outcome)
63 const unsigned GOTO
= 1;
64 const unsigned FORK
= 2; // -- nondeterministic choice; param marks priority
65 const unsigned ACCEPT
= 3; // -- final states; param marks success/fail
66 const unsigned TAG
= 4; // -- subexpression tracking; param marks tag #
67 const unsigned INIT
= 5; // -- opcode for ^ operator
69 /* To represent an NFA, allocate a continuous array of these ins units: */
72 unsigned int tag
:3; // -- opcode
73 unsigned int marked
:1; // -- internal use; for algorithmic manipulation
74 unsigned int param
:8; // -- numerical operand, e.g. tag #
75 void *link
; // -- other instruction, e.g. FORK/GOTO target
78 /* For the CHAR opcodes, we follow the instruction with a sequence of
79 these special character-matching units, in ascending order: */
81 rchar value
; // -- character to match
82 unsigned short bump
; // -- relative address of success-outcome insn
86 inline bool marked(ins
*i
) { return i
->i
.marked
!= 0; }
87 inline void mark(ins
*i
) { i
->i
.marked
= 1; }
88 inline void unmark(ins
*i
) { i
->i
.marked
= 0; }
90 /* Helper function for printing out one ins element in a sequence: */
91 const ins
* show_ins(std::ostream
&o
, const ins
*i
, const ins
*base
);
93 /* Perform the obvious optimization/compression on an ins node
94 (namely, collapsing chained GOTOs).
96 NB: This function sets 'marked' flags on the node; the caller is
97 responsible for clearing these flags for subsequent use. */
98 void ins_optimize(ins
*i
);
100 // ------------------------------------------------------------------------
103 int num_tags
; // -- number of tag_op id's used in expression, -1 if unknown
104 int size
; // -- number of instructions required for ins representation
106 regexp () : num_tags(-1), size(-1) {}
107 virtual ~regexp () {}
109 virtual const std::string
type_of() const = 0;
111 /* Is regexp left-anchored? This function is used for optimization
112 purposes, so it's always safe to return false. */
113 virtual bool anchored() const { return false; }
115 /* Length of array needed for ins array representation: */
116 virtual void calc_size() = 0;
117 unsigned ins_size() { if (size
< 0) calc_size(); return size
; }
119 /* Compile to (part of) an already-allocated ins array: */
120 virtual void compile(ins
*i
) = 0;
122 /* Allocate a fresh ins array and compile: */
125 /* Print out, with a careful eye as to bracketing:
126 priority == 0 -- don't bracket anything
127 priority == 1 -- bracket alt_op, but not cat_op
128 priority == 2 -- bracket all compound operators */
129 virtual void print(std::ostream
& o
, unsigned priority
= 0) const = 0;
132 std::ostream
& operator << (std::ostream
&o
, const regexp
& re
);
133 std::ostream
& operator << (std::ostream
&o
, const regexp
* re
);
135 // ------------------------------------------------------------------------
137 struct null_op
: public regexp
{
138 const std::string
type_of() const { return "null_op"; }
140 void compile(ins
*i
);
142 void print (std::ostream
&o
, unsigned) const {
143 o
<< "{null}"; // XXX: pick a better pseudo-notation?
147 struct anchor_op
: public regexp
{
149 anchor_op (rchar type
);
150 const std::string
type_of() const { return "anchor_op"; }
151 bool anchored () const { return type
== '^'; }
153 void compile(ins
*i
);
155 void print (std::ostream
&o
, unsigned) const {
160 struct tag_op
: public regexp
{
162 tag_op (unsigned id
);
163 const std::string
type_of() const { return "tag_op"; }
165 void compile(ins
*i
);
167 void print (std::ostream
&o
, unsigned) const {
168 o
<< "{t_" << id
<< "}";
172 struct match_op
: public regexp
{
174 match_op (range
*ran
);
175 const std::string
type_of() const { return "match_op"; }
177 void compile(ins
*i
);
178 void print (std::ostream
&o
, unsigned) const { o
<< ran
; }
181 struct alt_op
: public regexp
{
184 alt_op (regexp
*a
, regexp
*b
, bool prefer_second
= false);
185 const std::string
type_of() const { return "alt_op"; }
186 bool anchored () const { return a
->anchored() && b
->anchored(); }
188 void compile(ins
*i
);
190 void print (std::ostream
&o
, unsigned priority
) const {
191 if (priority
>= 1) o
<< "(";
192 a
->print(o
, 0); o
<< "|"; b
->print(o
, 0);
193 if (priority
>= 1) o
<< ")";
197 struct cat_op
: public regexp
{
199 cat_op (regexp
*a
, regexp
*b
);
200 const std::string
type_of() const { return "cat_op"; }
201 bool anchored () const {
202 return a
->anchored(); // XXX: doesn't catch all cases, but that's all right
205 void compile(ins
*i
);
207 void print (std::ostream
&o
, unsigned priority
) const {
208 if (priority
>= 2) o
<< "(";
209 a
->print(o
, 1); b
->print(o
, 1);
210 if (priority
>= 2) o
<< ")";
214 struct close_op
: public regexp
{
217 close_op (regexp
*re
, bool prefer_shorter
= false);
218 const std::string
type_of() const { return "close_op"; }
219 bool anchored () const { return re
->anchored(); }
221 void compile(ins
*i
);
223 void print (std::ostream
&o
, unsigned) const {
224 re
->print(o
, 2); o
<< "+";
228 struct closev_op
: public regexp
{
230 int nmin
, nmax
; // -- use -1 to denote unboundedness in that direction
231 closev_op (regexp
*re
, int nmin
, int nmax
);
232 const std::string
type_of() const { return "closev_op"; }
233 bool anchored () const { return nmin
> 0 && re
->anchored(); }
235 void compile(ins
*i
);
237 void print (std::ostream
&o
, unsigned) const {
238 re
->print(o
, 2); o
<< "{" << nmin
<< "," << nmax
<< "}";
242 /* The following is somewhat generalized to allow implementing support
243 for multiple distinct success outcomes, like in the original re2c: */
244 struct rule_op
: public regexp
{
246 unsigned outcome
; // -- 0 -> failure; 1 -> success; prefer success outcomes
247 rule_op (regexp
*re
, unsigned outcome
);
248 const std::string
type_of() const { return "rule_op"; }
249 bool anchored () const { return re
->anchored(); }
251 void compile(ins
*i
);
253 void print (std::ostream
&o
, unsigned) const {
255 if (outcome
) o
<< "{success_" << outcome
<< "}";
256 else o
<< "{failure_0}";
260 // ------------------------------------------------------------------------
262 regexp
*str_to_re(const std::string
& str
);
264 regexp
*make_alt(regexp
* a
, regexp
* b
);
265 regexp
*make_dot(bool allow_zero
= false);
267 // ------------------------------------------------------------------------
269 struct regex_error
: public std::runtime_error
271 int pos
; // -1 denotes error at unknown/indeterminate position
272 regex_error (const std::string
& msg
):
273 runtime_error(msg
), pos(-1) {}
274 regex_error (const std::string
& msg
, int pos
):
275 runtime_error(msg
), pos(pos
) {}
276 ~regex_error () throw () {}
283 /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */
This page took 0.05228 seconds and 6 git commands to generate.