]> sourceware.org Git - systemtap.git/blame - stapregex-dfa.cxx
stapregex REWRITE -- further bug fixes (esp. dealing with infinite loops)
[systemtap.git] / stapregex-dfa.cxx
CommitLineData
07777235
SM
1// -*- C++ -*-
2// Copyright (C) 2012-2013 Red Hat Inc.
3//
4// This file is part of systemtap, and is free software. You can
5// redistribute it and/or modify it under the terms of the GNU General
6// Public License (GPL); either version 2, or (at your option) any
7// later version.
8//
9// ---
10//
11// This file incorporates code from the re2c project; please see
12// the file README.stapregex for details.
13
14#include <string>
15#include <iostream>
54f92ee4 16#include <sstream>
07777235
SM
17#include <set>
18#include <list>
19#include <map>
20#include <vector>
21#include <queue>
22#include <utility>
23
24#include "translator-output.h"
25
26#include "stapregex-parse.h"
27#include "stapregex-tree.h"
28#include "stapregex-dfa.h"
29
54f92ee4
SM
30// Uncomment to show result of ins (NFA) compilation:
31//#define STAPREGEX_DEBUG_INS
32// Uncomment to display result of DFA compilation in a compact format:
33//#define STAPREGEX_DEBUG_DFA
34
07777235
SM
35using namespace std;
36
37namespace stapregex {
38
39regexp *pad_re = NULL;
40regexp *fail_re = NULL;
41
42dfa *
43stapregex_compile (regexp *re, const std::string& match_snippet,
44 const std::string& fail_snippet)
45{
46 if (pad_re == NULL) {
47 // build regexp for ".*"
48 pad_re = make_dot ();
248f3856
SM
49 pad_re = new close_op (pad_re, true); // -- prefer shorter match
50 pad_re = new alt_op (pad_re, new null_op, true); // -- prefer second match
07777235
SM
51 }
52 if (fail_re == NULL) {
9f232afb 53 // build regexp for ".*$", but allow '\0' and support fail outcome
9b7e0f3d 54 fail_re = make_dot (true); // -- allow '\0'
248f3856
SM
55 fail_re = new close_op (fail_re, true); // -- prefer shorter match
56 fail_re = new alt_op (fail_re, new null_op, true); // -- prefer second match
9f232afb 57 fail_re = new cat_op (fail_re, new anchor_op('$'));
248f3856 58 fail_re = new rule_op(fail_re, 0);
9f232afb
SM
59 // XXX: this approach creates one extra spurious-but-safe state
60 // (safe because the matching procedure stops after encountering '\0')
07777235
SM
61 }
62
63 vector<string> outcomes(2);
64 outcomes[0] = fail_snippet;
65 outcomes[1] = match_snippet;
66
67 int num_tags = re->num_tags;
68
69 // Pad & wrap re in appropriate rule_ops to control match behaviour:
8d2eb09e
SM
70 bool anchored = re->anchored ();
71 if (!anchored) re = new cat_op(pad_re, re); // -- left-padding
07777235
SM
72 re = new rule_op(re, 1);
73 re = new alt_op(re, fail_re);
74
248f3856
SM
75#ifdef STAPREGEX_DEBUG_INS
76 cerr << "RESULTING INS FROM REGEX " << re << ":" << endl;
77#endif
78
07777235 79 ins *i = re->compile();
248f3856
SM
80
81#ifdef STAPREGEX_DEBUG_INS
82 for (const ins *j = i; (j - i) < re->ins_size() + 1; )
83 {
84 j = show_ins(cerr, j, i); cerr << endl;
85 }
86 cerr << endl;
87#endif
88
89 // TODOXXX optimize ins as in re2c
90
07777235
SM
91 dfa *d = new dfa(i, num_tags, outcomes);
92
8d2eb09e
SM
93 // Carefully deallocate temporary scaffolding:
94 if (!anchored) delete ((rule_op*) ((alt_op*) re)->a)->re; // -- new cat_op
07777235
SM
95 delete ((alt_op*) re)->a; // -- new rule_op
96 delete re; // -- new alt_op
97 // NB: deleting a regular expression DOES NOT deallocate its
98 // children. The original re parameter is presumed to be retained
99 // indefinitely as part of a stapdfa table, or such....
100
101 return d;
102}
103
104// ------------------------------------------------------------------------
105
106/* Now follows the heart of the tagged-DFA algorithm. This is a basic
107 implementation of the algorithm described in Ville Laurikari's
108 Masters thesis and summarized in the paper "NFAs with Tagged
109 Transitions, their Conversion to Deterministic Automata and
110 Application to Regular Expressions"
111 (http://laurikari.net/ville/spire2000-tnfa.pdf).
112
113 HERE BE DRAGONS (and not the friendly kind) */
114
07777235
SM
115/* Functions to deal with relative transition priorities: */
116
117arc_priority
118refine_higher(const arc_priority& a)
119{
120 return make_pair(2 * a.first + 1, a.second + 1);
121}
122
123arc_priority
124refine_lower (const arc_priority& a)
125{
126 return make_pair(2 * a.first, a.second + 1);
127}
128
129int
130arc_compare (const arc_priority& a, const arc_priority& b)
131{
132 unsigned long x = a.first;
133 unsigned long y = b.first;
134
135 if (a.second > b.second)
136 x = x << (a.second - b.second);
137 else if (a.second < b.second)
138 y = y << (b.second - a.second);
139
140 return ( x == y ? 0 : x < y ? -1 : 1 );
141}
142
143/* Manage the linked list of states in a DFA: */
144
145state::state (state_kernel *kernel)
146 : next(NULL), kernel(kernel), accepts(false), accept_outcome(0) {}
147
148state *
149dfa::add_state (state *s)
150{
151 s->label = nstates++;
152
153 if (last == NULL)
154 {
155 last = s;
156 first = last;
157 }
158 else
159 {
160 // append to the end
161 last->next = s;
162 last = last->next;
163 }
164
165 return last;
166}
167
168/* Operations to build a simple kernel prior to taking closure: */
169
9b7e0f3d
SM
170void
171add_kernel (state_kernel *kernel, ins *i)
07777235
SM
172{
173 kernel_point point;
174 point.i = i;
175 point.priority = make_pair(0,0);
176 // NB: point->map_items is empty
177
178 kernel->push_back(point);
179}
180
181state_kernel *
182make_kernel (ins *i)
183{
184 state_kernel *kernel = new state_kernel;
185 add_kernel (kernel, i);
186 return kernel;
187}
188
189/* Compute the set of kernel_points that are 'tag-wise unambiguously
190 reachable' from a given initial set of points. Absent tagging, this
191 becomes a bog-standard NFA e_closure construction. */
192state_kernel *
193te_closure (state_kernel *start, int ntags, bool is_initial = false)
194{
195 state_kernel *closure = new state_kernel(*start);
196 queue<kernel_point> worklist;
197
198 /* To avoid searching through closure incessantly when retrieving
199 information about existing elements, the following caches are
200 needed: */
201 vector<unsigned> max_tags (ntags, 0);
202 map<ins *, list<kernel_point> > closure_map;
203
204 /* Reset priorities and cache initial elements of closure: */
205 for (state_kernel::iterator it = closure->begin();
206 it != closure->end(); it++)
207 {
208 it->priority = make_pair(0,0);
209 worklist.push(*it);
210
211 // Store the element in relevant caches:
212
213 for (list<map_item>::const_iterator jt = it->map_items.begin();
214 jt != it->map_items.end(); jt++)
215 max_tags[jt->first] = max(jt->second, max_tags[jt->first]);
216
217 closure_map[it->i].push_back(*it);
218 }
219
220 while (!worklist.empty())
221 {
222 kernel_point point = worklist.front(); worklist.pop();
223
224 // Identify e-transitions depending on the opcode.
225 // There are at most two e-transitions emerging from an insn.
226 // If we have two e-transitions, the 'other' has higher priority.
227
228 ins *target = NULL; int tag = -1;
229 ins *other_target = NULL; int other_tag = -1;
230
9b7e0f3d
SM
231 // TODOXXX line-by-line proceeds below
232
07777235
SM
233 bool do_split = false;
234
235 if (point.i->i.tag == TAG)
236 {
237 target = &point.i[1];
238 tag = (int) point.i->i.param;
239 }
179688ce
SM
240 else if (point.i->i.tag == FORK && point.i == (ins *) point.i->i.link)
241 {
242 /* Workaround for a FORK that points to itself: */
243 target = &point.i[1];
244 }
07777235
SM
245 else if (point.i->i.tag == FORK)
246 {
247 do_split = true;
248 // Relative priority of two e-transitions depends on param:
249 if (point.i->i.param)
250 {
251 // Prefer jumping to link.
252 target = &point.i[1];
253 other_target = (ins *) point.i->i.link;
254 }
255 else
256 {
257 // Prefer stepping to next instruction.
258 target = (ins *) point.i->i.link;
259 other_target = &point.i[1];
260 }
261 }
262 else if (point.i->i.tag == GOTO)
263 {
264 target = (ins *) point.i->i.link;
265 }
266 else if (point.i->i.tag == INIT && is_initial)
267 {
268 target = &point.i[1];
269 }
270
179688ce
SM
271 bool already_found;
272
07777235
SM
273 // Data for the endpoint of the first transition:
274 kernel_point next;
275 next.i = target;
276 next.priority = do_split ? refine_lower(point.priority) : point.priority;
277 next.map_items = point.map_items;
278
9b7e0f3d
SM
279 // Date for the endpoint of the second transition:
280 kernel_point other_next;
281 other_next.i = other_target;
282 other_next.priority = do_split ? refine_higher(point.priority) : point.priority;
283 other_next.map_items = point.map_items;
284
179688ce
SM
285 // Do infinite-loop-check:
286 other_next.parents = point.parents;
287 if (point.parents.find(other_next.i) != point.parents.end())
288 {
289 other_target = NULL;
290 other_tag = -1;
291 }
292 other_next.parents.insert(other_next.i);
293
294 next.parents = point.parents;
295 if (point.parents.find(next.i) != point.parents.end())
296 {
297 target = NULL;
298 tag = -1;
299 goto next_target;
300 }
301 next.parents.insert(next.i);
302
07777235 303 another_transition:
9b7e0f3d
SM
304 if (target == NULL)
305 continue;
07777235
SM
306
307 // Deal with the current e-transition:
308
309 if (tag >= 0)
310 {
311 /* Delete all existing next.map_items of the form m[tag,x]. */
312 for (list<map_item>::iterator it = next.map_items.begin();
313 it != next.map_items.end(); it++)
314 if (it->first == (unsigned) tag)
315 next.map_items.erase (it);
316
317 /* Add m[tag,x] to next.map_items, where x is the smallest
318 nonnegative integer such that m[tag,x] does not occur
319 anywhere in closure. Then update the cache. */
320 unsigned x = max_tags[tag];
321 next.map_items.push_back(make_pair(tag, ++x));
322 max_tags[tag] = x;
323 }
324
179688ce 325 already_found = false;
07777235
SM
326
327 /* Deal with similar transitions that have a different priority. */
328 for (list<kernel_point>::iterator it = closure_map[next.i].begin();
248f3856 329 it != closure_map[next.i].end(); )
07777235
SM
330 {
331 int result = arc_compare(it->priority, next.priority);
179688ce 332
07777235 333 if (result > 0) {
248f3856
SM
334 // obnoxious shuffle to avoid iterator invalidation
335 list<kernel_point>::iterator old_it = it;
336 it++;
337 closure_map[next.i].erase(old_it);
338 continue;
07777235
SM
339 } else if (result == 0) {
340 already_found = true;
341 }
248f3856 342 it++;
07777235
SM
343 }
344
345 if (!already_found) {
07777235
SM
346 // Store the element in relevant caches:
347
248f3856
SM
348 closure_map[next.i].push_back(next);
349
07777235
SM
350 for (list<map_item>::iterator jt = next.map_items.begin();
351 jt != next.map_items.end(); jt++)
352 max_tags[jt->first] = max(jt->second, max_tags[jt->first]);
353
248f3856
SM
354 // Store the element in closure:
355 closure->push_back(next);
356 worklist.push(next);
07777235
SM
357 }
358
179688ce 359 next_target:
07777235 360 // Now move to dealing with the second e-transition, if any.
07777235
SM
361 target = other_target; other_target = NULL;
362 tag = other_tag; other_tag = -1;
9b7e0f3d 363 next = other_next;
07777235
SM
364
365 goto another_transition;
366 }
367
368 return closure;
369}
370
371/* Find the set of reordering commands (if any) that will get us from
372 state s to some existing state in the dfa (returns the state in
373 question, appends reordering commands to r). Returns NULL is no
374 suitable state is found. */
375state *
376dfa::find_equivalent (state *s, tdfa_action &r)
377{
378 state *answer = NULL;
379
380 for (state_kernel::iterator it = s->kernel->begin();
381 it != s->kernel->end(); it++)
9b7e0f3d 382 mark(it->i);
8d2eb09e 383
07777235
SM
384 /* Check kernels of existing states for size equivalence and for
385 unmarked items (similar to re2c's original algorithm): */
386 unsigned n = s->kernel->size();
387 for (state *t = first; t != NULL; t = t->next)
388 {
389 if (t->kernel->size() == n)
390 {
391 for (state_kernel::iterator it = t->kernel->begin();
392 it != t->kernel->end(); it++)
8d2eb09e
SM
393 if (!marked(it->i))
394 goto next_state;
07777235 395
8d2eb09e
SM
396 // TODOXXX check for existence of reordering r
397 answer = t;
398 goto cleanup;
07777235 399 }
8d2eb09e
SM
400 next_state:
401 ;
07777235
SM
402 }
403
404 cleanup:
405 for (state_kernel::iterator it = s->kernel->begin();
406 it != s->kernel->end(); it++)
407 unmark(it->i);
408
409 return answer;
410}
411
412
413dfa::dfa (ins *i, int ntags, vector<string>& outcome_snippets)
414 : orig_nfa(i), nstates(0), ntags(ntags), outcome_snippets(outcome_snippets)
415{
416 /* Initialize empty linked list of states: */
417 first = last = NULL;
418
419 ins *start = &i[0];
9f232afb 420 state_kernel *initial_kernel = te_closure(make_kernel(start), ntags, true);
248f3856
SM
421 state *initial = add_state(new state(initial_kernel));
422 queue<state *> worklist; worklist.push(initial);
07777235
SM
423
424 while (!worklist.empty())
425 {
248f3856 426 state *curr = worklist.front(); worklist.pop();
07777235
SM
427
428 vector<list<ins *> > edges(NUM_REAL_CHARS);
429
430 /* Using the CHAR instructions in kernel, build the initial
431 table of spans for curr. Also check for final states. */
432
248f3856
SM
433 for (list<kernel_point>::iterator it = curr->kernel->begin();
434 it != curr->kernel->end(); it++)
07777235
SM
435 {
436 if (it->i->i.tag == CHAR)
437 {
438 for (ins *j = &it->i[1]; j < (ins *) it->i->i.link; j++)
439 edges[j->c.value].push_back((ins *) it->i->i.link);
440 }
441 else if (it->i->i.tag == ACCEPT)
442 {
443 /* Always prefer the highest numbered outcome: */
444 curr->accepts = true;
445 curr->accept_outcome = max(it->i->i.param, curr->accept_outcome);
446 }
447 }
448
aaed700d 449 for (unsigned c = 0; c < NUM_REAL_CHARS; )
07777235
SM
450 {
451 list <ins *> e = edges[c];
452 assert (!e.empty()); // XXX: ensured by fail_re in stapregex_compile
453
454 span s;
455
456 s.lb = c;
457
458 while (++c < NUM_REAL_CHARS && edges[c] == e) ;
459
248f3856 460 s.ub = c - 1;
07777235
SM
461
462 s.reach_pairs = new state_kernel;
463
464 for (list<ins *>::iterator it = e.begin();
465 it != e.end(); it++)
466 add_kernel (s.reach_pairs, *it);
467
468 curr->spans.push_back(s);
469 }
470
471 /* For each of the spans in curr, determine the reachable
472 points assuming a character in the span. */
473 for (list<span>::iterator it = curr->spans.begin();
474 it != curr->spans.end(); it++)
475 {
476 state_kernel *reach_pairs = it->reach_pairs;
477
478 /* Set up candidate target state: */
479 state_kernel *u_pairs = te_closure(reach_pairs, ntags);
480 state *target = new state(u_pairs);
481 tdfa_action c;
482
483 /* Generate position-save commands for any map items
248f3856 484 that do not appear in curr->kernel: */
07777235
SM
485
486 set<map_item> all_items;
248f3856
SM
487 for (state_kernel::const_iterator jt = curr->kernel->begin();
488 jt != curr->kernel->end(); jt++)
07777235
SM
489 for (list<map_item>::const_iterator kt = jt->map_items.begin();
490 kt != jt->map_items.end(); jt++)
491 all_items.insert(*kt);
492
493 list<map_item> store_items;
494 for (state_kernel::const_iterator jt = u_pairs->begin();
495 jt != u_pairs->end(); jt++)
496 for (list<map_item>::const_iterator kt = jt->map_items.begin();
497 kt != jt->map_items.end(); kt++)
498 if (all_items.find(*kt) == all_items.end())
499 store_items.push_back(*kt);
500
501 for (list<map_item>::iterator jt = store_items.begin();
502 jt != store_items.end(); jt++)
503 {
504 // append m[i,n] <- <curr position> to c
505 tdfa_insn insn;
506 insn.to = *jt;
507 insn.save_pos = true;
508 c.push_back(insn);
509 }
510
511 /* If there is a state t_prime in states such that some
512 sequence of reordering commands r produces t_prime
513 from target, use t_prime as the target state,
514 appending the reordering commands to c. */
515 state *t_prime = find_equivalent(target, c);
9b7e0f3d 516 if (t_prime != NULL)
248f3856 517 {
07777235
SM
518 delete target;
519 }
520 else
521 {
522 /* We need to actually add target to the dfa: */
523 t_prime = target;
524 add_state(t_prime);
248f3856 525 worklist.push(t_prime);
07777235
SM
526
527 if (t_prime->accepts)
528 {
529 // TODOXXX set the finisher of t_prime
530 }
531 }
532
533 /* Set the transition: */
534 it->to = t_prime;
535 it->action = c;
536 }
537 }
538}
539
540dfa::~dfa ()
541{
542 state * s;
543 while ((s = first))
544 {
545 first = s->next;
546 delete s;
547 }
548
549 delete orig_nfa;
550}
551
552// ------------------------------------------------------------------------
553
54f92ee4
SM
554// TODOXXX add emission instructions for tag_ops
555
556void
557span::emit_jump (translator_output *o, const dfa *d) const
558{
559 o->newline () << "printf(\" --> GOTO yystate%d\\n\", " << to->label << ");";
560
561 // TODOXXX tags feature allows proper longest-match priority
562 if (to->accepts)
563 {
564 emit_final(o, d);
565 }
566 else
567 {
568 o->newline () << "YYCURSOR++;";
569 o->newline () << "goto yystate" << to->label << ";";
570 }
571}
572
573/* Assuming the target DFA of the span is a final state, emit code to
574 (TODOXXX) cleanup tags and exit with a final answer. */
575void
576span::emit_final (translator_output *o, const dfa *d) const
577{
578 assert (to->accepts); // XXX: must guarantee correct usage of emit_final()
579 o->newline() << d->outcome_snippets[to->accept_outcome];
580 o->newline() << "goto yyfinish;";
581}
582
583string c_char(char c)
584{
585 stringstream o;
586 o << "'";
587 print_escaped(o, c);
588 o << "'";
589 return o.str();
590}
591
592void
593state::emit (translator_output *o, const dfa *d) const
594{
595 o->newline() << "yystate" << label << ": ";
596 o->newline () << "printf(\"READ '%s' %c\", cur, *YYCURSOR);";
597 o->newline() << "switch (*YYCURSOR) {";
598 o->indent(1);
599 for (list<span>::const_iterator it = spans.begin();
600 it != spans.end(); it++)
601 {
602 // If we see a '\0', go immediately into an accept state:
603 if (it->lb == '\0')
604 {
605 o->newline() << "case " << c_char('\0') << ":";
606 it->emit_final(o, d); // TODOXXX extra function may be unneeded
607 }
608
609 // Emit labels to handle all the other elements of the span:
610 for (unsigned c = max('\1', it->lb); c <= it->ub; c++) {
611 o->newline() << "case " << c_char((char) c) << ":";
612 }
613 it->emit_jump(o, d);
614
615 // TODOXXX handle a 'default' set of characters for the largest span...
616 // TODOXXX optimize by accepting before end of string whenever possible... (also necessary for proper first-matched-substring selection)
617 }
618 o->newline(-1) << "}";
619}
620
07777235
SM
621void
622dfa::emit (translator_output *o) const
623{
54f92ee4 624#ifdef STAPREGEX_DEBUG_DFA
07777235 625 print(o);
54f92ee4
SM
626#else
627 o->newline() << "{";
628 o->newline(1);
629 for (state *s = first; s; s = s->next)
630 s->emit(o, this);
631 o->newline() << "yyfinish: ;";
632 o->newline(-1) << "}";
633#endif
07777235
SM
634}
635
636void
637dfa::emit_tagsave (translator_output *o, std::string tag_states,
638 std::string tag_vals, std::string tag_count) const
639{
640 // TODOXXX implement after testing the preceding algorithms
641}
642
643// ------------------------------------------------------------------------
644
645std::ostream&
646operator << (std::ostream &o, const tdfa_action& a)
647{
648 for (list<tdfa_insn>::const_iterator it = a.begin();
649 it != a.end(); it++)
650 {
651 if (it != a.begin()) o << "; ";
652
653 o << "m[" << it->to.first << "," << it->to.second << "] <- ";
654
655 if (it->save_pos)
656 o << "p";
657 else
658 o << "m[" << it->from.first << "," << it->from.second << "]";
659 }
660
661 return o;
662}
663
248f3856
SM
664std::ostream&
665operator << (std::ostream &o, const arc_priority& p)
666{
667 o << p.first << "/" << (1 << p.second);
668 return o;
669}
670
07777235
SM
671void
672state::print (translator_output *o) const
673{
674 o->line() << "state " << label;
675 if (accepts)
676 o->line() << " accepts " << accept_outcome;
677 if (!finalizer.empty())
678 o->line() << " [" << finalizer << "]";
679
680 o->indent(1);
681 for (list<span>::const_iterator it = spans.begin();
682 it != spans.end(); it++)
683 {
248f3856 684 o->newline() << "'";
07777235 685 if (it->lb == it->ub)
248f3856
SM
686 {
687 print_escaped (o->line(), it->lb);
688 o->line() << " ";
689 }
07777235 690 else
248f3856
SM
691 {
692 print_escaped (o->line(), it->lb);
693 o->line() << "-";
694 print_escaped (o->line(), it->ub);
695 }
07777235 696
248f3856 697 o->line() << "' -> " << it->to->label;
07777235
SM
698
699 if (!it->action.empty())
700 o->line() << " [" << it->action << "]";
701 }
702 o->newline(-1);
703}
704
705void
706dfa::print (std::ostream& o) const
707{
708 translator_output to(o); print(&to);
709}
710
711void
712dfa::print (translator_output *o) const
713{
248f3856 714 o->newline();
07777235
SM
715 for (state *s = first; s; s = s->next)
716 {
717 s->print(o);
718 o->newline();
719 }
248f3856 720 o->newline();
07777235
SM
721}
722
723std::ostream&
724operator << (std::ostream& o, const dfa& d)
725{
726 d.print(o);
727 return o;
728}
729
730std::ostream&
731operator << (std::ostream &o, const dfa *d)
732{
733 o << *d;
734 return o;
735}
736
737};
738
739/* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */
This page took 0.09336 seconds and 5 git commands to generate.