]>
Commit | Line | Data |
---|---|---|
28a27de2 SM |
1 | // -*- C++ -*- |
2 | // Copyright (C) 2012-2013 Red Hat Inc. | |
3 | // | |
4 | // This file is part of systemtap, and is free software. You can | |
5 | // redistribute it and/or modify it under the terms of the GNU General | |
6 | // Public License (GPL); either version 2, or (at your option) any | |
7 | // later version. | |
8 | // | |
9 | // --- | |
10 | // | |
11 | // This file incorporates code from the re2c project; please see | |
12 | // the file README.stapregex for details. | |
13 | ||
14 | #include <string> | |
15 | #include <deque> | |
1acfc030 JS |
16 | #include <iterator> |
17 | #include <algorithm> | |
28a27de2 SM |
18 | #include <utility> |
19 | #include <cmath> | |
179688ce | 20 | #include <cassert> |
28a27de2 SM |
21 | |
22 | #include "stapregex-parse.h" | |
23 | #include "stapregex-tree.h" | |
24 | ||
25 | using namespace std; | |
26 | ||
27 | namespace stapregex { | |
28 | ||
6c9b8d26 | 29 | range::range (rchar lb, rchar ub) |
28a27de2 | 30 | { |
248f3856 | 31 | segments.push_back(make_pair(lb,ub)); |
28a27de2 SM |
32 | } |
33 | ||
34 | range::range (const string& str) | |
35 | { | |
36 | cursor cur(&str); // no unescaping | |
37 | ||
38 | if (cur.finished) return; | |
39 | ||
40 | range *ran = stapregex_getrange(cur); | |
41 | ||
42 | while (!cur.finished) | |
43 | { | |
44 | range *add = stapregex_getrange(cur); | |
45 | range *new_ran = ( ran != NULL ? range_union(ran, add) : add ); | |
46 | delete ran; if (new_ran != add) delete add; | |
47 | ran = new_ran; | |
48 | } | |
49 | ||
50 | segments = ran->segments; | |
51 | delete ran; | |
52 | } | |
53 | ||
07777235 SM |
54 | void |
55 | range::print (std::ostream& o) const | |
56 | { | |
57 | if (segments.empty()) | |
58 | { | |
59 | o << "{none}"; // XXX: pick a better pseudo-notation? | |
60 | return; | |
61 | } | |
62 | ||
248f3856 | 63 | if (segments.size() == 1 && segments[0].first == segments[0].second) |
07777235 | 64 | { |
248f3856 | 65 | print_escaped (o, segments[0].first); |
07777235 SM |
66 | return; |
67 | } | |
68 | ||
69 | o << "["; | |
70 | for (deque<segment>::const_iterator it = segments.begin(); | |
71 | it != segments.end(); it++) | |
72 | { | |
6c9b8d26 | 73 | rchar lb = it->first; rchar ub = it->second; |
248f3856 SM |
74 | if (lb == ub) |
75 | { | |
76 | print_escaped (o, lb); | |
77 | } | |
78 | else | |
79 | { | |
80 | print_escaped (o, lb); | |
81 | o << "-"; | |
82 | print_escaped (o, ub); | |
83 | } | |
07777235 SM |
84 | } |
85 | o << "]"; | |
86 | } | |
87 | ||
88 | std::ostream& | |
89 | operator << (std::ostream& o, const range& ran) | |
90 | { | |
91 | ran.print(o); | |
92 | return o; | |
93 | } | |
94 | ||
95 | std::ostream& | |
96 | operator << (std::ostream& o, const range* ran) | |
97 | { | |
98 | if (ran) | |
99 | o << *ran; | |
100 | else | |
101 | o << "{none}"; // XXX: pick a better pseudo-notation? | |
102 | return o; | |
103 | } | |
104 | ||
105 | // ------------------------------------------------------------------------ | |
106 | ||
28a27de2 SM |
107 | range * |
108 | range_union(range *old_a, range *old_b) | |
109 | { | |
179688ce | 110 | if (old_a == NULL && old_b == NULL) return NULL; |
1acfc030 JS |
111 | if (old_a == NULL) return new range(*old_b); |
112 | if (old_b == NULL) return new range(*old_a); | |
28a27de2 SM |
113 | |
114 | /* First, gather the segments from both ranges into one sorted pile: */ | |
115 | deque<segment> s; | |
1acfc030 JS |
116 | merge(old_a->segments.begin(), old_a->segments.end(), |
117 | old_b->segments.begin(), old_b->segments.end(), | |
118 | inserter(s, s.end())); | |
28a27de2 SM |
119 | |
120 | /* Now go through and merge overlapping segments. */ | |
121 | range *ran = new range; | |
122 | ||
123 | while (!s.empty()) | |
124 | { | |
125 | /* Merge adjacent overlapping segments. */ | |
248f3856 | 126 | while (s.size() >= 2 && s[0].second >= s[1].first) |
28a27de2 SM |
127 | { |
128 | segment merged = make_pair(min(s[0].first, s[1].first), | |
129 | max(s[0].second, s[1].second)); | |
130 | s.pop_front(); s.pop_front(); | |
131 | s.push_front(merged); | |
132 | } | |
133 | ||
134 | /* Place non-overlapping segment in range. */ | |
135 | ran->segments.push_back(s.front()); s.pop_front(); | |
136 | } | |
137 | ||
138 | return ran; | |
139 | } | |
140 | ||
141 | range * | |
142 | range_invert(range *old_ran) | |
143 | { | |
144 | range ran(*old_ran); | |
145 | range *new_ran = new range; | |
146 | ||
6c9b8d26 | 147 | rchar start = '\1'; // exclude '\0' |
28a27de2 SM |
148 | |
149 | while (!ran.segments.empty()) { | |
6c9b8d26 | 150 | rchar end = ran.segments.front().first - 1; |
28a27de2 | 151 | if (start <= end) new_ran->segments.push_back(make_pair(start, end)); |
248f3856 | 152 | start = ran.segments.front().second + 1; |
28a27de2 SM |
153 | ran.segments.pop_front(); |
154 | } | |
155 | ||
9d9595d6 | 156 | if ((unsigned)start < (unsigned)NUM_REAL_CHARS) |
e5f8ae47 | 157 | new_ran->segments.push_back(make_pair(start, NUM_REAL_CHARS-1)); |
28a27de2 SM |
158 | |
159 | return new_ran; | |
160 | } | |
161 | ||
162 | // ------------------------------------------------------------------------ | |
163 | ||
20d6e21f SM |
164 | void |
165 | ins_optimize (ins *i) | |
166 | { | |
167 | while (!marked(i)) | |
168 | { | |
169 | mark (i); // -- aka "this node has already been optimized" | |
170 | ||
171 | if (i->i.tag == CHAR) | |
172 | { | |
173 | i = (ins *) i->i.link; // -- skip node | |
174 | } | |
175 | else if (i->i.tag == GOTO || i->i.tag == FORK) | |
176 | { | |
177 | ins *target = (ins *) i->i.link; | |
178 | ins_optimize(target); | |
179 | ||
180 | if (target->i.tag == GOTO) | |
181 | i->i.link = target->i.link == target ? i : target; | |
182 | ||
183 | if (i->i.tag == FORK) | |
184 | { | |
185 | ins *follow = (ins *) &i[1]; | |
186 | ins_optimize(follow); | |
187 | ||
188 | if (follow->i.tag == GOTO && follow->i.link == follow) | |
189 | { | |
190 | i->i.tag = GOTO; | |
191 | } | |
192 | else if (i->i.link == i) | |
193 | { | |
194 | i->i.tag = GOTO; | |
195 | i->i.link = follow; | |
196 | } | |
197 | } | |
198 | } | |
199 | else | |
200 | ++i; // -- skip node | |
201 | } | |
202 | } | |
203 | ||
204 | // ------------------------------------------------------------------------ | |
205 | ||
07777235 SM |
206 | const ins* |
207 | show_ins (std::ostream &o, const ins *i, const ins *base) | |
208 | { | |
209 | o.width(3); o << (i - base) << ": "; | |
210 | ||
211 | const ins *ret = &i[1]; | |
212 | ||
213 | switch (i->i.tag) | |
214 | { | |
215 | case CHAR: | |
216 | o << "match "; | |
217 | for (; ret < (ins *) i->i.link; ++ret) print_escaped(o, ret->c.value); | |
218 | break; | |
219 | ||
220 | case GOTO: | |
221 | o << "goto " << ((ins *) i->i.link - base); | |
222 | break; | |
223 | ||
224 | case FORK: | |
225 | o << "fork(" << ( i->i.param ? "prefer" : "avoid" ) << ") " | |
226 | << ((ins *) i->i.link - base); | |
227 | break; | |
228 | ||
229 | case ACCEPT: | |
230 | o << "accept(" << i->i.param << ")"; | |
231 | break; | |
232 | ||
233 | case TAG: | |
234 | o << "tag(" << i->i.param << ")"; | |
235 | break; | |
236 | ||
237 | case INIT: | |
238 | o << "init"; | |
239 | break; | |
240 | } | |
241 | ||
242 | return ret; | |
243 | } | |
244 | ||
245 | // ------------------------------------------------------------------------ | |
246 | ||
247 | ins * | |
248 | regexp::compile() | |
249 | { | |
248f3856 SM |
250 | unsigned k = ins_size(); |
251 | ||
252 | ins *i = new ins[k + 1]; | |
20d6e21f SM |
253 | // XXX Keep Valgrind from complaining in ins_optimize(): |
254 | for (unsigned ix = 0; ix <= k; ix++) i[ix].i.marked = 0; | |
07777235 | 255 | compile(i); |
20d6e21f | 256 | |
248f3856 SM |
257 | // Append an infinite-loop GOTO to avoid edges going outside the array: |
258 | i[k].i.tag = GOTO; | |
259 | i[k].i.link = &i[k]; | |
260 | ||
07777235 SM |
261 | return i; |
262 | } | |
263 | ||
264 | std::ostream& | |
265 | operator << (std::ostream &o, const regexp& re) | |
266 | { | |
267 | re.print (o); | |
268 | return o; | |
269 | } | |
270 | ||
271 | std::ostream& | |
272 | operator << (std::ostream &o, const regexp* re) | |
273 | { | |
274 | o << *re; | |
275 | return o; | |
276 | } | |
277 | ||
278 | // ------------------------------------------------------------------------ | |
279 | ||
280 | void | |
281 | null_op::calc_size() | |
282 | { | |
283 | size = 0; | |
284 | } | |
285 | ||
286 | void | |
dabd71bb | 287 | null_op::compile(ins *) |
07777235 SM |
288 | { |
289 | ; | |
290 | } | |
291 | ||
6c9b8d26 | 292 | anchor_op::anchor_op(rchar type) : type(type) {} |
28a27de2 | 293 | |
07777235 SM |
294 | void |
295 | anchor_op::calc_size() | |
296 | { | |
297 | size = ( type == '^' ? 1 : 2 ); | |
298 | } | |
299 | ||
300 | void | |
301 | anchor_op::compile(ins *i) | |
302 | { | |
303 | if (type == '^') | |
304 | { | |
305 | i->i.tag = INIT; | |
306 | i->i.link = &i[1]; | |
307 | } | |
308 | else // type == '$' | |
309 | { | |
310 | i->i.tag = CHAR; | |
311 | i->i.link = &i[2]; | |
312 | ins *j = &i[1]; | |
313 | j->c.value = '\0'; | |
314 | j->c.bump = 1; | |
315 | } | |
316 | } | |
317 | ||
28a27de2 SM |
318 | tag_op::tag_op(unsigned id) : id(id) {} |
319 | ||
07777235 SM |
320 | void |
321 | tag_op::calc_size() | |
322 | { | |
323 | size = 1; | |
324 | } | |
325 | ||
326 | void | |
327 | tag_op::compile(ins *i) | |
328 | { | |
329 | i->i.tag = TAG; | |
330 | i->i.param = id; | |
331 | } | |
332 | ||
28a27de2 SM |
333 | match_op::match_op(range *ran) : ran(ran) {} |
334 | ||
07777235 SM |
335 | void |
336 | match_op::calc_size() | |
337 | { | |
338 | size = 1; | |
339 | ||
340 | for (deque<segment>::iterator it = ran->segments.begin(); | |
341 | it != ran->segments.end(); it++) | |
342 | { | |
248f3856 | 343 | size += it->second - it->first + 1; |
07777235 SM |
344 | } |
345 | } | |
346 | ||
347 | void | |
348 | match_op::compile(ins *i) | |
349 | { | |
350 | unsigned bump = ins_size(); | |
351 | i->i.tag = CHAR; | |
352 | i->i.link = &i[bump]; // mark end of table | |
353 | ||
354 | ins *j = &i[1]; | |
355 | for (deque<segment>::iterator it = ran->segments.begin(); | |
356 | it != ran->segments.end(); it++) | |
357 | { | |
112e9e2b | 358 | for (unsigned c = it->first; c <= (unsigned) it->second; c++) |
07777235 SM |
359 | { |
360 | j->c.value = c; | |
361 | j->c.bump = --bump; // mark end of table | |
362 | j++; | |
363 | } | |
364 | } | |
365 | } | |
366 | ||
248f3856 SM |
367 | alt_op::alt_op(regexp *a, regexp *b, bool prefer_second) |
368 | : a(a), b(b), prefer_second(prefer_second) {} | |
28a27de2 | 369 | |
07777235 SM |
370 | void |
371 | alt_op::calc_size() | |
372 | { | |
373 | size = a->ins_size() + b->ins_size() + 2; | |
374 | } | |
375 | ||
376 | void | |
377 | alt_op::compile(ins *i) | |
378 | { | |
379 | i->i.tag = FORK; | |
248f3856 | 380 | i->i.param = prefer_second ? 1 : 0; // preferred alternative to match |
07777235 SM |
381 | ins *j = &i[a->ins_size() + 1]; |
382 | i->i.link = &j[1]; | |
383 | a->compile(&i[1]); | |
384 | j->i.tag = GOTO; | |
385 | j->i.link = &j[b->ins_size() + 1]; | |
386 | b->compile(&j[1]); | |
387 | } | |
388 | ||
28a27de2 SM |
389 | cat_op::cat_op(regexp *a, regexp *b) : a(a), b(b) {} |
390 | ||
07777235 SM |
391 | void |
392 | cat_op::calc_size() | |
393 | { | |
394 | size = a->ins_size() + b->ins_size(); | |
395 | } | |
396 | ||
397 | void | |
398 | cat_op::compile(ins *i) | |
399 | { | |
400 | a->compile(&i[0]); | |
401 | b->compile(&i[a->ins_size()]); | |
402 | } | |
403 | ||
248f3856 SM |
404 | close_op::close_op(regexp *re, bool prefer_shorter) |
405 | : re(re), prefer_shorter(prefer_shorter) {} | |
28a27de2 | 406 | |
07777235 SM |
407 | void |
408 | close_op::calc_size() | |
409 | { | |
410 | size = re->ins_size() + 1; | |
411 | } | |
412 | ||
413 | void | |
414 | close_op::compile(ins *i) | |
415 | { | |
416 | re->compile(&i[0]); | |
417 | i += re->ins_size(); | |
418 | i->i.tag = FORK; | |
248f3856 | 419 | i->i.param = prefer_shorter ? 0 : 1; // XXX: match greedily by default |
07777235 SM |
420 | i->i.link = i - re->ins_size(); |
421 | } | |
422 | ||
423 | closev_op::closev_op(regexp *re, int nmin, int nmax) | |
424 | : re(re), nmin(nmin), nmax(nmax) {} | |
425 | ||
426 | void | |
427 | closev_op::calc_size() | |
428 | { | |
429 | unsigned k = re->ins_size(); | |
430 | ||
431 | if (nmax >= 0) | |
432 | size = k * nmin + (nmax - nmin) * (1 + k); | |
433 | else | |
434 | size = k * nmin + 1; | |
435 | } | |
436 | ||
437 | void | |
438 | closev_op::compile(ins *i) | |
439 | { | |
440 | unsigned k = re->ins_size(); | |
441 | ||
442 | ins *jumppoint = i + ((nmax - nmin) * (1 + k)); | |
443 | ||
444 | for (int st = nmin; st < nmax; st++) | |
445 | { | |
446 | i->i.tag = FORK; | |
447 | i->i.param = 0; // XXX: this matches greedily | |
448 | i->i.link = jumppoint; | |
449 | i++; | |
450 | re->compile(&i[0]); | |
451 | i += k; | |
452 | } | |
453 | ||
454 | for (int st = 0; st < nmin; st++) | |
455 | { | |
456 | re->compile(&i[0]); | |
457 | i += k; | |
458 | ||
459 | if (nmax < 0 && st == 0) | |
460 | { | |
461 | i->i.tag = FORK; | |
462 | i->i.param = 1; // XXX: this matches greedily | |
463 | i->i.link = i - k; | |
464 | i++; | |
465 | } | |
466 | } | |
467 | } | |
468 | ||
469 | rule_op::rule_op(regexp *re, unsigned outcome) : re(re), outcome(outcome) {} | |
470 | ||
471 | void | |
472 | rule_op::calc_size() | |
473 | { | |
474 | size = re->ins_size() + 1; | |
475 | } | |
476 | ||
477 | void | |
478 | rule_op::compile(ins *i) | |
479 | { | |
480 | re->compile(&i[0]); | |
481 | i += re->ins_size(); | |
482 | i->i.tag = ACCEPT; | |
483 | i->i.param = outcome; | |
484 | } | |
28a27de2 SM |
485 | |
486 | // ------------------------------------------------------------------------ | |
487 | ||
488 | regexp * | |
6c9b8d26 | 489 | match_char(rchar c) |
28a27de2 SM |
490 | { |
491 | return new match_op(new range(c,c)); | |
492 | } | |
493 | ||
494 | regexp * | |
495 | str_to_re(const string& str) | |
496 | { | |
497 | if (str.empty()) return new null_op; | |
498 | ||
499 | regexp *re = match_char(str[0]); | |
500 | ||
501 | for (unsigned i = 1; i < str.length(); i++) | |
502 | re = new cat_op(re, match_char(str[i])); | |
503 | ||
504 | return re; | |
505 | } | |
506 | ||
507 | regexp * | |
508 | do_alt(regexp *a, regexp *b) | |
509 | { | |
510 | if (a == NULL) return b; | |
511 | if (b == NULL) return a; | |
512 | return new alt_op(a,b); | |
513 | } | |
514 | ||
515 | regexp * | |
516 | make_alt(regexp *a, regexp *b) | |
517 | { | |
518 | /* Optimize the case of building alternatives of match_op. */ | |
519 | regexp *e1 = NULL, *e2 = NULL; | |
520 | range *r1 = NULL, *r2 = NULL; | |
521 | ||
522 | if (a->type_of() == "alt_op") | |
523 | { | |
524 | alt_op *aa = (alt_op *)a; | |
525 | if (aa->a->type_of() == "match_op") | |
526 | { | |
527 | r1 = ((match_op *) aa->a)->ran; e1 = aa->b; | |
528 | } | |
529 | else | |
530 | e1 = a; | |
531 | } | |
532 | else if (a->type_of() == "match_op") | |
533 | { | |
534 | r1 = ((match_op *) a)->ran; e1 = NULL; | |
535 | } | |
179688ce SM |
536 | else |
537 | e1 = a; | |
28a27de2 SM |
538 | |
539 | if (b->type_of() == "alt_op") | |
540 | { | |
541 | alt_op *bb = (alt_op *)b; | |
542 | if (bb->a->type_of() == "match_op") | |
543 | { | |
544 | r2 = ((match_op *) bb->a)->ran; e2 = bb->b; | |
545 | } | |
546 | else | |
547 | e2 = b; | |
548 | } | |
549 | else if (b->type_of() == "match_op") | |
550 | { | |
551 | r2 = ((match_op *) b)->ran; e2 = NULL; | |
552 | } | |
179688ce SM |
553 | else |
554 | e2 = b; | |
28a27de2 SM |
555 | |
556 | range *u = range_union(r1, r2); | |
557 | delete r1; delete r2; | |
558 | ||
559 | match_op *m = u != NULL ? new match_op(u) : NULL; | |
560 | regexp *r = do_alt(m, do_alt(e1, e2)); | |
179688ce | 561 | assert (r != NULL); |
28a27de2 SM |
562 | return r; |
563 | } | |
564 | ||
565 | regexp * | |
248f3856 | 566 | make_dot(bool allow_zero) |
28a27de2 | 567 | { |
248f3856 | 568 | return new match_op(new range(allow_zero ? 0 : 1, NUM_REAL_CHARS-1)); |
28a27de2 SM |
569 | } |
570 | ||
571 | }; | |
572 | ||
573 | /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */ |