]>
Commit | Line | Data |
---|---|---|
28a27de2 SM |
1 | // -*- C++ -*- |
2 | // Copyright (C) 2012-2013 Red Hat Inc. | |
3 | // | |
4 | // This file is part of systemtap, and is free software. You can | |
5 | // redistribute it and/or modify it under the terms of the GNU General | |
6 | // Public License (GPL); either version 2, or (at your option) any | |
7 | // later version. | |
8 | // | |
9 | // --- | |
10 | // | |
11 | // This file incorporates code from the re2c project; please see | |
12 | // the file README.stapregex for details. | |
13 | ||
14 | #include <string> | |
15 | #include <deque> | |
1acfc030 JS |
16 | #include <iterator> |
17 | #include <algorithm> | |
28a27de2 SM |
18 | #include <utility> |
19 | #include <cmath> | |
179688ce | 20 | #include <cassert> |
28a27de2 SM |
21 | |
22 | #include "stapregex-parse.h" | |
23 | #include "stapregex-tree.h" | |
24 | ||
25 | using namespace std; | |
26 | ||
27 | namespace stapregex { | |
28 | ||
248f3856 | 29 | range::range (char lb, char ub) |
28a27de2 | 30 | { |
248f3856 | 31 | segments.push_back(make_pair(lb,ub)); |
28a27de2 SM |
32 | } |
33 | ||
34 | range::range (const string& str) | |
35 | { | |
36 | cursor cur(&str); // no unescaping | |
37 | ||
38 | if (cur.finished) return; | |
39 | ||
40 | range *ran = stapregex_getrange(cur); | |
41 | ||
42 | while (!cur.finished) | |
43 | { | |
44 | range *add = stapregex_getrange(cur); | |
45 | range *new_ran = ( ran != NULL ? range_union(ran, add) : add ); | |
46 | delete ran; if (new_ran != add) delete add; | |
47 | ran = new_ran; | |
48 | } | |
49 | ||
50 | segments = ran->segments; | |
51 | delete ran; | |
52 | } | |
53 | ||
07777235 SM |
54 | void |
55 | range::print (std::ostream& o) const | |
56 | { | |
57 | if (segments.empty()) | |
58 | { | |
59 | o << "{none}"; // XXX: pick a better pseudo-notation? | |
60 | return; | |
61 | } | |
62 | ||
248f3856 | 63 | if (segments.size() == 1 && segments[0].first == segments[0].second) |
07777235 | 64 | { |
248f3856 | 65 | print_escaped (o, segments[0].first); |
07777235 SM |
66 | return; |
67 | } | |
68 | ||
69 | o << "["; | |
70 | for (deque<segment>::const_iterator it = segments.begin(); | |
71 | it != segments.end(); it++) | |
72 | { | |
73 | char lb = it->first; char ub = it->second; | |
248f3856 SM |
74 | if (lb == ub) |
75 | { | |
76 | print_escaped (o, lb); | |
77 | } | |
78 | else | |
79 | { | |
80 | print_escaped (o, lb); | |
81 | o << "-"; | |
82 | print_escaped (o, ub); | |
83 | } | |
07777235 SM |
84 | } |
85 | o << "]"; | |
86 | } | |
87 | ||
88 | std::ostream& | |
89 | operator << (std::ostream& o, const range& ran) | |
90 | { | |
91 | ran.print(o); | |
92 | return o; | |
93 | } | |
94 | ||
95 | std::ostream& | |
96 | operator << (std::ostream& o, const range* ran) | |
97 | { | |
98 | if (ran) | |
99 | o << *ran; | |
100 | else | |
101 | o << "{none}"; // XXX: pick a better pseudo-notation? | |
102 | return o; | |
103 | } | |
104 | ||
105 | // ------------------------------------------------------------------------ | |
106 | ||
28a27de2 SM |
107 | range * |
108 | range_union(range *old_a, range *old_b) | |
109 | { | |
179688ce | 110 | if (old_a == NULL && old_b == NULL) return NULL; |
1acfc030 JS |
111 | if (old_a == NULL) return new range(*old_b); |
112 | if (old_b == NULL) return new range(*old_a); | |
28a27de2 SM |
113 | |
114 | /* First, gather the segments from both ranges into one sorted pile: */ | |
115 | deque<segment> s; | |
1acfc030 JS |
116 | merge(old_a->segments.begin(), old_a->segments.end(), |
117 | old_b->segments.begin(), old_b->segments.end(), | |
118 | inserter(s, s.end())); | |
28a27de2 SM |
119 | |
120 | /* Now go through and merge overlapping segments. */ | |
121 | range *ran = new range; | |
122 | ||
123 | while (!s.empty()) | |
124 | { | |
125 | /* Merge adjacent overlapping segments. */ | |
248f3856 | 126 | while (s.size() >= 2 && s[0].second >= s[1].first) |
28a27de2 SM |
127 | { |
128 | segment merged = make_pair(min(s[0].first, s[1].first), | |
129 | max(s[0].second, s[1].second)); | |
130 | s.pop_front(); s.pop_front(); | |
131 | s.push_front(merged); | |
132 | } | |
133 | ||
134 | /* Place non-overlapping segment in range. */ | |
135 | ran->segments.push_back(s.front()); s.pop_front(); | |
136 | } | |
137 | ||
138 | return ran; | |
139 | } | |
140 | ||
141 | range * | |
142 | range_invert(range *old_ran) | |
143 | { | |
144 | range ran(*old_ran); | |
145 | range *new_ran = new range; | |
146 | ||
e5f8ae47 | 147 | char start = '\1'; // exclude '\0' |
28a27de2 SM |
148 | |
149 | while (!ran.segments.empty()) { | |
e5f8ae47 | 150 | char end = ran.segments.front().first - 1; |
28a27de2 | 151 | if (start <= end) new_ran->segments.push_back(make_pair(start, end)); |
248f3856 | 152 | start = ran.segments.front().second + 1; |
28a27de2 SM |
153 | ran.segments.pop_front(); |
154 | } | |
155 | ||
9d9595d6 | 156 | if ((unsigned)start < (unsigned)NUM_REAL_CHARS) |
e5f8ae47 | 157 | new_ran->segments.push_back(make_pair(start, NUM_REAL_CHARS-1)); |
28a27de2 SM |
158 | |
159 | return new_ran; | |
160 | } | |
161 | ||
162 | // ------------------------------------------------------------------------ | |
163 | ||
07777235 SM |
164 | const ins* |
165 | show_ins (std::ostream &o, const ins *i, const ins *base) | |
166 | { | |
167 | o.width(3); o << (i - base) << ": "; | |
168 | ||
169 | const ins *ret = &i[1]; | |
170 | ||
171 | switch (i->i.tag) | |
172 | { | |
173 | case CHAR: | |
174 | o << "match "; | |
175 | for (; ret < (ins *) i->i.link; ++ret) print_escaped(o, ret->c.value); | |
176 | break; | |
177 | ||
178 | case GOTO: | |
179 | o << "goto " << ((ins *) i->i.link - base); | |
180 | break; | |
181 | ||
182 | case FORK: | |
183 | o << "fork(" << ( i->i.param ? "prefer" : "avoid" ) << ") " | |
184 | << ((ins *) i->i.link - base); | |
185 | break; | |
186 | ||
187 | case ACCEPT: | |
188 | o << "accept(" << i->i.param << ")"; | |
189 | break; | |
190 | ||
191 | case TAG: | |
192 | o << "tag(" << i->i.param << ")"; | |
193 | break; | |
194 | ||
195 | case INIT: | |
196 | o << "init"; | |
197 | break; | |
198 | } | |
199 | ||
200 | return ret; | |
201 | } | |
202 | ||
203 | // ------------------------------------------------------------------------ | |
204 | ||
205 | ins * | |
206 | regexp::compile() | |
207 | { | |
248f3856 SM |
208 | unsigned k = ins_size(); |
209 | ||
210 | ins *i = new ins[k + 1]; | |
07777235 | 211 | compile(i); |
248f3856 SM |
212 | |
213 | // Append an infinite-loop GOTO to avoid edges going outside the array: | |
214 | i[k].i.tag = GOTO; | |
215 | i[k].i.link = &i[k]; | |
216 | ||
07777235 SM |
217 | return i; |
218 | } | |
219 | ||
220 | std::ostream& | |
221 | operator << (std::ostream &o, const regexp& re) | |
222 | { | |
223 | re.print (o); | |
224 | return o; | |
225 | } | |
226 | ||
227 | std::ostream& | |
228 | operator << (std::ostream &o, const regexp* re) | |
229 | { | |
230 | o << *re; | |
231 | return o; | |
232 | } | |
233 | ||
234 | // ------------------------------------------------------------------------ | |
235 | ||
236 | void | |
237 | null_op::calc_size() | |
238 | { | |
239 | size = 0; | |
240 | } | |
241 | ||
242 | void | |
243 | null_op::compile(ins *i) | |
244 | { | |
245 | ; | |
246 | } | |
247 | ||
28a27de2 SM |
248 | anchor_op::anchor_op(char type) : type(type) {} |
249 | ||
07777235 SM |
250 | void |
251 | anchor_op::calc_size() | |
252 | { | |
253 | size = ( type == '^' ? 1 : 2 ); | |
254 | } | |
255 | ||
256 | void | |
257 | anchor_op::compile(ins *i) | |
258 | { | |
259 | if (type == '^') | |
260 | { | |
261 | i->i.tag = INIT; | |
262 | i->i.link = &i[1]; | |
263 | } | |
264 | else // type == '$' | |
265 | { | |
266 | i->i.tag = CHAR; | |
267 | i->i.link = &i[2]; | |
268 | ins *j = &i[1]; | |
269 | j->c.value = '\0'; | |
270 | j->c.bump = 1; | |
271 | } | |
272 | } | |
273 | ||
28a27de2 SM |
274 | tag_op::tag_op(unsigned id) : id(id) {} |
275 | ||
07777235 SM |
276 | void |
277 | tag_op::calc_size() | |
278 | { | |
279 | size = 1; | |
280 | } | |
281 | ||
282 | void | |
283 | tag_op::compile(ins *i) | |
284 | { | |
285 | i->i.tag = TAG; | |
286 | i->i.param = id; | |
287 | } | |
288 | ||
28a27de2 SM |
289 | match_op::match_op(range *ran) : ran(ran) {} |
290 | ||
07777235 SM |
291 | void |
292 | match_op::calc_size() | |
293 | { | |
294 | size = 1; | |
295 | ||
296 | for (deque<segment>::iterator it = ran->segments.begin(); | |
297 | it != ran->segments.end(); it++) | |
298 | { | |
248f3856 | 299 | size += it->second - it->first + 1; |
07777235 SM |
300 | } |
301 | } | |
302 | ||
303 | void | |
304 | match_op::compile(ins *i) | |
305 | { | |
306 | unsigned bump = ins_size(); | |
307 | i->i.tag = CHAR; | |
308 | i->i.link = &i[bump]; // mark end of table | |
309 | ||
310 | ins *j = &i[1]; | |
311 | for (deque<segment>::iterator it = ran->segments.begin(); | |
312 | it != ran->segments.end(); it++) | |
313 | { | |
112e9e2b | 314 | for (unsigned c = it->first; c <= (unsigned) it->second; c++) |
07777235 SM |
315 | { |
316 | j->c.value = c; | |
317 | j->c.bump = --bump; // mark end of table | |
318 | j++; | |
319 | } | |
320 | } | |
321 | } | |
322 | ||
248f3856 SM |
323 | alt_op::alt_op(regexp *a, regexp *b, bool prefer_second) |
324 | : a(a), b(b), prefer_second(prefer_second) {} | |
28a27de2 | 325 | |
07777235 SM |
326 | void |
327 | alt_op::calc_size() | |
328 | { | |
329 | size = a->ins_size() + b->ins_size() + 2; | |
330 | } | |
331 | ||
332 | void | |
333 | alt_op::compile(ins *i) | |
334 | { | |
335 | i->i.tag = FORK; | |
248f3856 | 336 | i->i.param = prefer_second ? 1 : 0; // preferred alternative to match |
07777235 SM |
337 | ins *j = &i[a->ins_size() + 1]; |
338 | i->i.link = &j[1]; | |
339 | a->compile(&i[1]); | |
340 | j->i.tag = GOTO; | |
341 | j->i.link = &j[b->ins_size() + 1]; | |
342 | b->compile(&j[1]); | |
343 | } | |
344 | ||
28a27de2 SM |
345 | cat_op::cat_op(regexp *a, regexp *b) : a(a), b(b) {} |
346 | ||
07777235 SM |
347 | void |
348 | cat_op::calc_size() | |
349 | { | |
350 | size = a->ins_size() + b->ins_size(); | |
351 | } | |
352 | ||
353 | void | |
354 | cat_op::compile(ins *i) | |
355 | { | |
356 | a->compile(&i[0]); | |
357 | b->compile(&i[a->ins_size()]); | |
358 | } | |
359 | ||
248f3856 SM |
360 | close_op::close_op(regexp *re, bool prefer_shorter) |
361 | : re(re), prefer_shorter(prefer_shorter) {} | |
28a27de2 | 362 | |
07777235 SM |
363 | void |
364 | close_op::calc_size() | |
365 | { | |
366 | size = re->ins_size() + 1; | |
367 | } | |
368 | ||
369 | void | |
370 | close_op::compile(ins *i) | |
371 | { | |
372 | re->compile(&i[0]); | |
373 | i += re->ins_size(); | |
374 | i->i.tag = FORK; | |
248f3856 | 375 | i->i.param = prefer_shorter ? 0 : 1; // XXX: match greedily by default |
07777235 SM |
376 | i->i.link = i - re->ins_size(); |
377 | } | |
378 | ||
379 | closev_op::closev_op(regexp *re, int nmin, int nmax) | |
380 | : re(re), nmin(nmin), nmax(nmax) {} | |
381 | ||
382 | void | |
383 | closev_op::calc_size() | |
384 | { | |
385 | unsigned k = re->ins_size(); | |
386 | ||
387 | if (nmax >= 0) | |
388 | size = k * nmin + (nmax - nmin) * (1 + k); | |
389 | else | |
390 | size = k * nmin + 1; | |
391 | } | |
392 | ||
393 | void | |
394 | closev_op::compile(ins *i) | |
395 | { | |
396 | unsigned k = re->ins_size(); | |
397 | ||
398 | ins *jumppoint = i + ((nmax - nmin) * (1 + k)); | |
399 | ||
400 | for (int st = nmin; st < nmax; st++) | |
401 | { | |
402 | i->i.tag = FORK; | |
403 | i->i.param = 0; // XXX: this matches greedily | |
404 | i->i.link = jumppoint; | |
405 | i++; | |
406 | re->compile(&i[0]); | |
407 | i += k; | |
408 | } | |
409 | ||
410 | for (int st = 0; st < nmin; st++) | |
411 | { | |
412 | re->compile(&i[0]); | |
413 | i += k; | |
414 | ||
415 | if (nmax < 0 && st == 0) | |
416 | { | |
417 | i->i.tag = FORK; | |
418 | i->i.param = 1; // XXX: this matches greedily | |
419 | i->i.link = i - k; | |
420 | i++; | |
421 | } | |
422 | } | |
423 | } | |
424 | ||
425 | rule_op::rule_op(regexp *re, unsigned outcome) : re(re), outcome(outcome) {} | |
426 | ||
427 | void | |
428 | rule_op::calc_size() | |
429 | { | |
430 | size = re->ins_size() + 1; | |
431 | } | |
432 | ||
433 | void | |
434 | rule_op::compile(ins *i) | |
435 | { | |
436 | re->compile(&i[0]); | |
437 | i += re->ins_size(); | |
438 | i->i.tag = ACCEPT; | |
439 | i->i.param = outcome; | |
440 | } | |
28a27de2 SM |
441 | |
442 | // ------------------------------------------------------------------------ | |
443 | ||
444 | regexp * | |
445 | match_char(char c) | |
446 | { | |
447 | return new match_op(new range(c,c)); | |
448 | } | |
449 | ||
450 | regexp * | |
451 | str_to_re(const string& str) | |
452 | { | |
453 | if (str.empty()) return new null_op; | |
454 | ||
455 | regexp *re = match_char(str[0]); | |
456 | ||
457 | for (unsigned i = 1; i < str.length(); i++) | |
458 | re = new cat_op(re, match_char(str[i])); | |
459 | ||
460 | return re; | |
461 | } | |
462 | ||
463 | regexp * | |
464 | do_alt(regexp *a, regexp *b) | |
465 | { | |
466 | if (a == NULL) return b; | |
467 | if (b == NULL) return a; | |
468 | return new alt_op(a,b); | |
469 | } | |
470 | ||
471 | regexp * | |
472 | make_alt(regexp *a, regexp *b) | |
473 | { | |
474 | /* Optimize the case of building alternatives of match_op. */ | |
475 | regexp *e1 = NULL, *e2 = NULL; | |
476 | range *r1 = NULL, *r2 = NULL; | |
477 | ||
478 | if (a->type_of() == "alt_op") | |
479 | { | |
480 | alt_op *aa = (alt_op *)a; | |
481 | if (aa->a->type_of() == "match_op") | |
482 | { | |
483 | r1 = ((match_op *) aa->a)->ran; e1 = aa->b; | |
484 | } | |
485 | else | |
486 | e1 = a; | |
487 | } | |
488 | else if (a->type_of() == "match_op") | |
489 | { | |
490 | r1 = ((match_op *) a)->ran; e1 = NULL; | |
491 | } | |
179688ce SM |
492 | else |
493 | e1 = a; | |
28a27de2 SM |
494 | |
495 | if (b->type_of() == "alt_op") | |
496 | { | |
497 | alt_op *bb = (alt_op *)b; | |
498 | if (bb->a->type_of() == "match_op") | |
499 | { | |
500 | r2 = ((match_op *) bb->a)->ran; e2 = bb->b; | |
501 | } | |
502 | else | |
503 | e2 = b; | |
504 | } | |
505 | else if (b->type_of() == "match_op") | |
506 | { | |
507 | r2 = ((match_op *) b)->ran; e2 = NULL; | |
508 | } | |
179688ce SM |
509 | else |
510 | e2 = b; | |
28a27de2 SM |
511 | |
512 | range *u = range_union(r1, r2); | |
513 | delete r1; delete r2; | |
514 | ||
515 | match_op *m = u != NULL ? new match_op(u) : NULL; | |
516 | regexp *r = do_alt(m, do_alt(e1, e2)); | |
179688ce | 517 | assert (r != NULL); |
28a27de2 SM |
518 | return r; |
519 | } | |
520 | ||
521 | regexp * | |
248f3856 | 522 | make_dot(bool allow_zero) |
28a27de2 | 523 | { |
248f3856 | 524 | return new match_op(new range(allow_zero ? 0 : 1, NUM_REAL_CHARS-1)); |
28a27de2 SM |
525 | } |
526 | ||
527 | }; | |
528 | ||
529 | /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */ |