]> sourceware.org Git - systemtap.git/blob - stapregex-tree.h
stapregex PR15065 (8/8) :: add back re2c's mini-optimizer
[systemtap.git] / stapregex-tree.h
1 // -*- C++ -*-
2 // Copyright (C) 2012-2013 Red Hat Inc.
3 //
4 // This file is part of systemtap, and is free software. You can
5 // redistribute it and/or modify it under the terms of the GNU General
6 // Public License (GPL); either version 2, or (at your option) any
7 // later version.
8 //
9 // ---
10 //
11 // This file incorporates code from the re2c project; please see
12 // the file README.stapregex for details.
13
14 #ifndef STAPREGEX_TREE_H
15 #define STAPREGEX_TREE_H
16
17 #include <string>
18 #include <deque>
19 #include <utility>
20 #include <stdexcept>
21
22 #include "stapregex-defines.h"
23
24 // Numbering scheme for tags representing start and end of n'th subexpression:
25 #define TAG_START(n) (2*(n))
26 #define TAG_END(n) (2*(n)+1)
27
28 namespace stapregex {
29
30 typedef std::pair<rchar, rchar> segment;
31
32 struct range {
33 std::deque<segment> segments; // -- [lb, ub], XXX sorted ascending
34
35 range () {} // -- empty range
36 range (rchar lb, rchar ub); // -- a segment [lb, ub]
37 range (const std::string& str); // -- character class (no named entities)
38
39 void print(std::ostream& o) const;
40 };
41
42 std::ostream& operator<< (std::ostream&, const range&);
43 std::ostream& operator<< (std::ostream&, const range*);
44
45 // NB: be sure to deallocate the old ranges if they are no longer used
46 range *range_union(range *a, range *b);
47 range *range_invert(range *ran);
48
49 // ------------------------------------------------------------------------
50
51 /* For the NFA representation, re2c uses an assembler-like notation,
52 which should be easy enough to understand based on the meaning of
53 FORK. An NFA is considered to be a tagged-NFA if it uses the TAG opcode.
54
55 The only tricky thing here is instituting sensible defaults for
56 tagged-NFA transition priorities. This is done by setting i->param = 0
57 on a FORK instruction if the FORK-target has lower priority, and
58 i->param = 1 if it has higher priority. FORK is the only place
59 where discriminating between transitions needs to be done. */
60
61 /* Opcodes for the assembly notation: */
62 const unsigned CHAR = 0; // -- match character set (one successful outcome)
63 const unsigned GOTO = 1;
64 const unsigned FORK = 2; // -- nondeterministic choice; param marks priority
65 const unsigned ACCEPT = 3; // -- final states; param marks success/fail
66 const unsigned TAG = 4; // -- subexpression tracking; param marks tag #
67 const unsigned INIT = 5; // -- opcode for ^ operator
68
69 /* To represent an NFA, allocate a continuous array of these ins units: */
70 union ins {
71 struct {
72 unsigned int tag:3; // -- opcode
73 unsigned int marked:1; // -- internal use; for algorithmic manipulation
74 unsigned int param:8; // -- numerical operand, e.g. tag #
75 void *link; // -- other instruction, e.g. FORK/GOTO target
76 } i;
77
78 /* For the CHAR opcodes, we follow the instruction with a sequence of
79 these special character-matching units, in ascending order: */
80 struct {
81 rchar value; // -- character to match
82 unsigned short bump; // -- relative address of success-outcome insn
83 } c;
84 };
85
86 inline bool marked(ins *i) { return i->i.marked != 0; }
87 inline void mark(ins *i) { i->i.marked = 1; }
88 inline void unmark(ins *i) { i->i.marked = 0; }
89
90 /* Helper function for printing out one ins element in a sequence: */
91 const ins* show_ins(std::ostream &o, const ins *i, const ins *base);
92
93 /* Perform the obvious optimization/compression on an ins node
94 (namely, collapsing chained GOTOs).
95
96 NB: This function sets 'marked' flags on the node; the caller is
97 responsible for clearing these flags for subsequent use. */
98 void ins_optimize(ins *i);
99
100 // ------------------------------------------------------------------------
101
102 struct regexp {
103 int num_tags; // -- number of tag_op id's used in expression, -1 if unknown
104 int size; // -- number of instructions required for ins representation
105
106 regexp () : num_tags(-1), size(-1) {}
107 virtual ~regexp () {}
108
109 virtual const std::string type_of() const = 0;
110
111 /* Is regexp left-anchored? This function is used for optimization
112 purposes, so it's always safe to return false. */
113 virtual bool anchored() const { return false; }
114
115 /* Length of array needed for ins array representation: */
116 virtual void calc_size() = 0;
117 unsigned ins_size() { if (size < 0) calc_size(); return size; }
118
119 /* Compile to (part of) an already-allocated ins array: */
120 virtual void compile(ins *i) = 0;
121
122 /* Allocate a fresh ins array and compile: */
123 ins *compile();
124
125 /* Print out, with a careful eye as to bracketing:
126 priority == 0 -- don't bracket anything
127 priority == 1 -- bracket alt_op, but not cat_op
128 priority == 2 -- bracket all compound operators */
129 virtual void print(std::ostream& o, unsigned priority = 0) const = 0;
130 };
131
132 std::ostream& operator << (std::ostream &o, const regexp& re);
133 std::ostream& operator << (std::ostream &o, const regexp* re);
134
135 // ------------------------------------------------------------------------
136
137 struct null_op : public regexp {
138 const std::string type_of() const { return "null_op"; }
139 void calc_size();
140 void compile(ins *i);
141
142 void print (std::ostream &o, unsigned) const {
143 o << "{null}"; // XXX: pick a better pseudo-notation?
144 }
145 };
146
147 struct anchor_op : public regexp {
148 rchar type;
149 anchor_op (rchar type);
150 const std::string type_of() const { return "anchor_op"; }
151 bool anchored () const { return type == '^'; }
152 void calc_size();
153 void compile(ins *i);
154
155 void print (std::ostream &o, unsigned) const {
156 o << type;
157 }
158 };
159
160 struct tag_op : public regexp {
161 unsigned id;
162 tag_op (unsigned id);
163 const std::string type_of() const { return "tag_op"; }
164 void calc_size();
165 void compile(ins *i);
166
167 void print (std::ostream &o, unsigned) const {
168 o << "{t_" << id << "}";
169 }
170 };
171
172 struct match_op : public regexp {
173 range *ran;
174 match_op (range *ran);
175 const std::string type_of() const { return "match_op"; }
176 void calc_size();
177 void compile(ins *i);
178 void print (std::ostream &o, unsigned) const { o << ran; }
179 };
180
181 struct alt_op : public regexp {
182 regexp *a, *b;
183 bool prefer_second;
184 alt_op (regexp *a, regexp *b, bool prefer_second = false);
185 const std::string type_of() const { return "alt_op"; }
186 bool anchored () const { return a->anchored() && b->anchored(); }
187 void calc_size();
188 void compile(ins *i);
189
190 void print (std::ostream &o, unsigned priority) const {
191 if (priority >= 1) o << "(";
192 a->print(o, 0); o << "|"; b->print(o, 0);
193 if (priority >= 1) o << ")";
194 }
195 };
196
197 struct cat_op : public regexp {
198 regexp *a, *b;
199 cat_op (regexp *a, regexp *b);
200 const std::string type_of() const { return "cat_op"; }
201 bool anchored () const {
202 return a->anchored(); // XXX: doesn't catch all cases, but that's all right
203 }
204 void calc_size();
205 void compile(ins *i);
206
207 void print (std::ostream &o, unsigned priority) const {
208 if (priority >= 2) o << "(";
209 a->print(o, 1); b->print(o, 1);
210 if (priority >= 2) o << ")";
211 }
212 };
213
214 struct close_op : public regexp {
215 regexp *re;
216 bool prefer_shorter;
217 close_op (regexp *re, bool prefer_shorter = false);
218 const std::string type_of() const { return "close_op"; }
219 bool anchored () const { return re->anchored(); }
220 void calc_size();
221 void compile(ins *i);
222
223 void print (std::ostream &o, unsigned) const {
224 re->print(o, 2); o << "+";
225 }
226 };
227
228 struct closev_op : public regexp {
229 regexp *re;
230 int nmin, nmax; // -- use -1 to denote unboundedness in that direction
231 closev_op (regexp *re, int nmin, int nmax);
232 const std::string type_of() const { return "closev_op"; }
233 bool anchored () const { return nmin > 0 && re->anchored(); }
234 void calc_size();
235 void compile(ins *i);
236
237 void print (std::ostream &o, unsigned) const {
238 re->print(o, 2); o << "{" << nmin << "," << nmax << "}";
239 }
240 };
241
242 /* The following is somewhat generalized to allow implementing support
243 for multiple distinct success outcomes, like in the original re2c: */
244 struct rule_op : public regexp {
245 regexp *re;
246 unsigned outcome; // -- 0 -> failure; 1 -> success; prefer success outcomes
247 rule_op (regexp *re, unsigned outcome);
248 const std::string type_of() const { return "rule_op"; }
249 bool anchored () const { return re->anchored(); }
250 void calc_size();
251 void compile(ins *i);
252
253 void print (std::ostream &o, unsigned) const {
254 re->print(o, 1);
255 if (outcome) o << "{success_" << outcome << "}";
256 else o << "{failure_0}";
257 }
258 };
259
260 // ------------------------------------------------------------------------
261
262 regexp *str_to_re(const std::string& str);
263
264 regexp *make_alt(regexp* a, regexp* b);
265 regexp *make_dot(bool allow_zero = false);
266
267 // ------------------------------------------------------------------------
268
269 struct regex_error: public std::runtime_error
270 {
271 int pos; // -1 denotes error at unknown/indeterminate position
272 regex_error (const std::string& msg):
273 runtime_error(msg), pos(-1) {}
274 regex_error (const std::string& msg, int pos):
275 runtime_error(msg), pos(pos) {}
276 ~regex_error () throw () {}
277 };
278
279 };
280
281 #endif
282
283 /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */
This page took 0.05228 seconds and 6 git commands to generate.