]> sourceware.org Git - systemtap.git/blob - stapregex-dfa.cxx
Fix PR19954 by avoiding "suspicious RCU usage" message.
[systemtap.git] / stapregex-dfa.cxx
1 // -*- C++ -*-
2 // Copyright (C) 2012-2013 Red Hat Inc.
3 //
4 // This file is part of systemtap, and is free software. You can
5 // redistribute it and/or modify it under the terms of the GNU General
6 // Public License (GPL); either version 2, or (at your option) any
7 // later version.
8 //
9 // ---
10 //
11 // This file incorporates code from the re2c project; please see
12 // the file README.stapregex for details.
13
14 #include <string>
15 #include <iostream>
16 #include <sstream>
17 #include <set>
18 #include <list>
19 #include <map>
20 #include <vector>
21 #include <queue>
22 #include <utility>
23
24 #include "translator-output.h"
25
26 #include "stapregex-parse.h"
27 #include "stapregex-tree.h"
28 #include "stapregex-dfa.h"
29
30 // Uncomment to show result of ins (NFA) compilation:
31 //#define STAPREGEX_DEBUG_INS
32 // Uncomment to display result of DFA compilation in a compact format:
33 //#define STAPREGEX_DEBUG_DFA
34 // Uncomment to have the generated engine do a trace of visited states:
35 //#define STAPREGEX_DEBUG_MATCH
36
37 using namespace std;
38
39 namespace stapregex {
40
41 regexp *pad_re = NULL;
42 regexp *fail_re = NULL;
43
44 dfa *
45 stapregex_compile (regexp *re, const std::string& match_snippet,
46 const std::string& fail_snippet)
47 {
48 if (pad_re == NULL) {
49 // build regexp for ".*"
50 pad_re = make_dot ();
51 pad_re = new close_op (pad_re, true); // -- prefer shorter match
52 pad_re = new alt_op (pad_re, new null_op, true); // -- prefer second match
53 }
54 if (fail_re == NULL) {
55 // build regexp for ".*$", but allow '\0' and support fail outcome
56 fail_re = make_dot (true); // -- allow '\0'
57 fail_re = new close_op (fail_re, true); // -- prefer shorter match
58 fail_re = new alt_op (fail_re, new null_op, true); // -- prefer second match
59 fail_re = new cat_op (fail_re, new anchor_op('$'));
60 fail_re = new rule_op(fail_re, 0);
61 // XXX: this approach creates one extra spurious-but-safe state
62 // (safe because the matching procedure stops after encountering '\0')
63 }
64
65 vector<string> outcomes(2);
66 outcomes[0] = fail_snippet;
67 outcomes[1] = match_snippet;
68
69 int num_tags = re->num_tags;
70
71 // Pad & wrap re in appropriate rule_ops to control match behaviour:
72 bool anchored = re->anchored ();
73 if (!anchored) re = new cat_op(pad_re, re); // -- left-padding
74 re = new rule_op(re, 1);
75 re = new alt_op(re, fail_re);
76
77 #ifdef STAPREGEX_DEBUG_INS
78 cerr << "RESULTING INS FROM REGEX " << re << ":" << endl;
79 #endif
80
81 ins *i = re->compile();
82
83 #ifdef STAPREGEX_DEBUG_INS
84 for (const ins *j = i; (j - i) < re->ins_size() + 1; )
85 {
86 j = show_ins(cerr, j, i); cerr << endl;
87 }
88 cerr << endl;
89 #endif
90
91 // TODOXXX optimize ins as in re2c
92
93 dfa *d = new dfa(i, num_tags, outcomes);
94
95 // Carefully deallocate temporary scaffolding:
96 if (!anchored) delete ((rule_op*) ((alt_op*) re)->a)->re; // -- new cat_op
97 delete ((alt_op*) re)->a; // -- new rule_op
98 delete re; // -- new alt_op
99 // NB: deleting a regular expression DOES NOT deallocate its
100 // children. The original re parameter is presumed to be retained
101 // indefinitely as part of a stapdfa table, or such....
102
103 return d;
104 }
105
106 // ------------------------------------------------------------------------
107
108 /* Now follows the heart of the tagged-DFA algorithm. This is a basic
109 implementation of the algorithm described in Ville Laurikari's
110 Masters thesis and summarized in the paper "NFAs with Tagged
111 Transitions, their Conversion to Deterministic Automata and
112 Application to Regular Expressions"
113 (http://laurikari.net/ville/spire2000-tnfa.pdf).
114
115 TODOXXX: The following does not contain a fully working
116 implementation of the tagging support, but only of the regex
117 matching.
118
119 HERE BE DRAGONS (and not the friendly kind) */
120
121 /* Functions to deal with relative transition priorities: */
122
123 arc_priority
124 refine_higher(const arc_priority& a)
125 {
126 return make_pair(2 * a.first + 1, a.second + 1);
127 }
128
129 arc_priority
130 refine_lower (const arc_priority& a)
131 {
132 return make_pair(2 * a.first, a.second + 1);
133 }
134
135 int
136 arc_compare (const arc_priority& a, const arc_priority& b)
137 {
138 unsigned long x = a.first;
139 unsigned long y = b.first;
140
141 if (a.second > b.second)
142 x = x << (a.second - b.second);
143 else if (a.second < b.second)
144 y = y << (b.second - a.second);
145
146 return ( x == y ? 0 : x < y ? -1 : 1 );
147 }
148
149 /* Manage the linked list of states in a DFA: */
150
151 state::state (state_kernel *kernel)
152 : label(~0), next(NULL), kernel(kernel), accepts(false), accept_outcome(0) {}
153
154 state *
155 dfa::add_state (state *s)
156 {
157 s->label = nstates++;
158
159 if (last == NULL)
160 {
161 last = s;
162 first = last;
163 }
164 else
165 {
166 // append to the end
167 last->next = s;
168 last = last->next;
169 }
170
171 return last;
172 }
173
174 /* Operations to build a simple kernel prior to taking closure: */
175
176 void
177 add_kernel (state_kernel *kernel, ins *i)
178 {
179 kernel_point point;
180 point.i = i;
181 point.priority = make_pair(0,0);
182 // NB: point->map_items is empty
183
184 kernel->push_back(point);
185 }
186
187 state_kernel *
188 make_kernel (ins *i)
189 {
190 state_kernel *kernel = new state_kernel;
191 add_kernel (kernel, i);
192 return kernel;
193 }
194
195 /* Compute the set of kernel_points that are 'tag-wise unambiguously
196 reachable' from a given initial set of points. Absent tagging, this
197 becomes a bog-standard NFA e_closure construction. */
198 state_kernel *
199 te_closure (state_kernel *start, int ntags, bool is_initial = false)
200 {
201 state_kernel *closure = new state_kernel(*start);
202 queue<kernel_point> worklist;
203
204 /* To avoid searching through closure incessantly when retrieving
205 information about existing elements, the following caches are
206 needed: */
207 vector<unsigned> max_tags (ntags, 0);
208 map<ins *, list<kernel_point> > closure_map;
209
210 /* Reset priorities and cache initial elements of closure: */
211 for (state_kernel::iterator it = closure->begin();
212 it != closure->end(); it++)
213 {
214 it->priority = make_pair(0,0);
215 worklist.push(*it);
216
217 // Store the element in relevant caches:
218
219 for (list<map_item>::const_iterator jt = it->map_items.begin();
220 jt != it->map_items.end(); jt++)
221 max_tags[jt->first] = max(jt->second, max_tags[jt->first]);
222
223 closure_map[it->i].push_back(*it);
224 }
225
226 while (!worklist.empty())
227 {
228 kernel_point point = worklist.front(); worklist.pop();
229
230 // Identify e-transitions depending on the opcode.
231 // There are at most two e-transitions emerging from an insn.
232 // If we have two e-transitions, the 'other' has higher priority.
233
234 ins *target = NULL; int tag = -1;
235 ins *other_target = NULL; int other_tag = -1;
236
237 // TODOXXX line-by-line proceeds below
238
239 bool do_split = false;
240
241 if (point.i->i.tag == TAG)
242 {
243 target = &point.i[1];
244 tag = (int) point.i->i.param;
245 }
246 else if (point.i->i.tag == FORK && point.i == (ins *) point.i->i.link)
247 {
248 /* Workaround for a FORK that points to itself: */
249 target = &point.i[1];
250 }
251 else if (point.i->i.tag == FORK)
252 {
253 do_split = true;
254 // Relative priority of two e-transitions depends on param:
255 if (point.i->i.param)
256 {
257 // Prefer jumping to link.
258 target = &point.i[1];
259 other_target = (ins *) point.i->i.link;
260 }
261 else
262 {
263 // Prefer stepping to next instruction.
264 target = (ins *) point.i->i.link;
265 other_target = &point.i[1];
266 }
267 }
268 else if (point.i->i.tag == GOTO)
269 {
270 target = (ins *) point.i->i.link;
271 }
272 else if (point.i->i.tag == INIT && is_initial)
273 {
274 target = &point.i[1];
275 }
276
277 bool already_found;
278
279 // Data for the endpoint of the first transition:
280 kernel_point next;
281 next.i = target;
282 next.priority = do_split ? refine_lower(point.priority) : point.priority;
283 next.map_items = point.map_items;
284
285 // Date for the endpoint of the second transition:
286 kernel_point other_next;
287 other_next.i = other_target;
288 other_next.priority = do_split ? refine_higher(point.priority) : point.priority;
289 other_next.map_items = point.map_items;
290
291 // Do infinite-loop-check:
292 other_next.parents = point.parents;
293 if (point.parents.find(other_next.i) != point.parents.end())
294 {
295 other_target = NULL;
296 other_tag = -1;
297 }
298 other_next.parents.insert(other_next.i);
299
300 next.parents = point.parents;
301 if (point.parents.find(next.i) != point.parents.end())
302 {
303 // target = NULL;
304 // tag = -1;
305 // ^^^ these are overwritten from other_target / other_tag immediately
306 goto next_target;
307 }
308 next.parents.insert(next.i);
309
310 another_transition:
311 if (target == NULL)
312 continue;
313
314 // Deal with the current e-transition:
315
316 if (tag >= 0)
317 {
318 /* Delete all existing next.map_items of the form m[tag,x]. */
319 for (list<map_item>::iterator it = next.map_items.begin();
320 it != next.map_items.end(); )
321 if (it->first == (unsigned) tag)
322 {
323 list<map_item>::iterator next_it = it;
324 next_it++;
325 next.map_items.erase (it);
326 it = next_it;
327 }
328 else it++;
329
330 /* Add m[tag,x] to next.map_items, where x is the smallest
331 nonnegative integer such that m[tag,x] does not occur
332 anywhere in closure. Then update the cache. */
333 unsigned x = max_tags[tag];
334 next.map_items.push_back(make_pair(tag, ++x));
335 max_tags[tag] = x;
336 }
337
338 already_found = false;
339
340 /* Deal with similar transitions that have a different priority. */
341 for (list<kernel_point>::iterator it = closure_map[next.i].begin();
342 it != closure_map[next.i].end(); )
343 {
344 int result = arc_compare(it->priority, next.priority);
345
346 if (result > 0) {
347 // obnoxious shuffle to avoid iterator invalidation
348 list<kernel_point>::iterator old_it = it;
349 it++;
350 closure_map[next.i].erase(old_it);
351 continue;
352 } else if (result == 0) {
353 already_found = true;
354 }
355 it++;
356 }
357
358 if (!already_found) {
359 // Store the element in relevant caches:
360
361 closure_map[next.i].push_back(next);
362
363 for (list<map_item>::iterator jt = next.map_items.begin();
364 jt != next.map_items.end(); jt++)
365 max_tags[jt->first] = max(jt->second, max_tags[jt->first]);
366
367 // Store the element in closure:
368 closure->push_back(next);
369 worklist.push(next);
370 }
371
372 next_target:
373 // Now move to dealing with the second e-transition, if any.
374 target = other_target; other_target = NULL;
375 tag = other_tag; other_tag = -1;
376 next = other_next;
377
378 goto another_transition;
379 }
380
381 return closure;
382 }
383
384 /* Find the set of reordering commands (if any) that will get us from
385 state s to some existing state in the dfa (returns the state in
386 question, appends reordering commands to r). Returns NULL is no
387 suitable state is found. */
388 state *
389 dfa::find_equivalent (state *s, tdfa_action &)
390 {
391 state *answer = NULL;
392
393 for (state_kernel::iterator it = s->kernel->begin();
394 it != s->kernel->end(); it++)
395 mark(it->i);
396
397 /* Check kernels of existing states for size equivalence and for
398 unmarked items (similar to re2c's original algorithm): */
399 unsigned n = s->kernel->size();
400 for (state *t = first; t != NULL; t = t->next)
401 {
402 if (t->kernel->size() == n)
403 {
404 for (state_kernel::iterator it = t->kernel->begin();
405 it != t->kernel->end(); it++)
406 if (!marked(it->i))
407 goto next_state;
408
409 // TODOXXX check for existence of reordering tdfa_action r
410 answer = t;
411 goto cleanup;
412 }
413 next_state:
414 ;
415 }
416
417 cleanup:
418 for (state_kernel::iterator it = s->kernel->begin();
419 it != s->kernel->end(); it++)
420 unmark(it->i);
421
422 return answer;
423 }
424
425
426 dfa::dfa (ins *i, int ntags, vector<string>& outcome_snippets)
427 : orig_nfa(i), nstates(0), ntags(ntags), outcome_snippets(outcome_snippets)
428 {
429 /* Initialize empty linked list of states: */
430 first = last = NULL;
431
432 ins *start = &i[0];
433 state_kernel *seed_kernel = make_kernel(start);
434 state_kernel *initial_kernel = te_closure(seed_kernel, ntags, true);
435 delete seed_kernel;
436 state *initial = add_state(new state(initial_kernel));
437 queue<state *> worklist; worklist.push(initial);
438
439 while (!worklist.empty())
440 {
441 state *curr = worklist.front(); worklist.pop();
442
443 vector<list<ins *> > edges(NUM_REAL_CHARS);
444
445 /* Using the CHAR instructions in kernel, build the initial
446 table of spans for curr. Also check for final states. */
447
448 for (list<kernel_point>::iterator it = curr->kernel->begin();
449 it != curr->kernel->end(); it++)
450 {
451 if (it->i->i.tag == CHAR)
452 {
453 for (ins *j = &it->i[1]; j < (ins *) it->i->i.link; j++)
454 edges[j->c.value].push_back((ins *) it->i->i.link);
455 }
456 else if (it->i->i.tag == ACCEPT)
457 {
458 /* Always prefer the highest numbered outcome: */
459 curr->accepts = true;
460 curr->accept_outcome = max(it->i->i.param, curr->accept_outcome);
461 }
462 }
463
464 for (unsigned c = 0; c < NUM_REAL_CHARS; )
465 {
466 list <ins *> e = edges[c];
467 assert (!e.empty()); // XXX: ensured by fail_re in stapregex_compile
468
469 span s;
470
471 s.lb = c;
472
473 while (++c < NUM_REAL_CHARS && edges[c] == e) ;
474
475 s.ub = c - 1;
476
477 s.reach_pairs = new state_kernel;
478
479 for (list<ins *>::iterator it = e.begin();
480 it != e.end(); it++)
481 add_kernel (s.reach_pairs, *it);
482
483 curr->spans.push_back(s);
484 }
485
486 /* For each of the spans in curr, determine the reachable
487 points assuming a character in the span. */
488 for (list<span>::iterator it = curr->spans.begin();
489 it != curr->spans.end(); it++)
490 {
491 state_kernel *reach_pairs = it->reach_pairs;
492
493 /* Set up candidate target state: */
494 state_kernel *u_pairs = te_closure(reach_pairs, ntags);
495 state *target = new state(u_pairs);
496 tdfa_action c;
497
498 /* Generate position-save commands for any map items
499 that do not appear in curr->kernel: */
500
501 set<map_item> all_items;
502 for (state_kernel::const_iterator jt = curr->kernel->begin();
503 jt != curr->kernel->end(); jt++)
504 for (list<map_item>::const_iterator kt = jt->map_items.begin();
505 kt != jt->map_items.end(); jt++)
506 all_items.insert(*kt);
507
508 list<map_item> store_items;
509 for (state_kernel::const_iterator jt = u_pairs->begin();
510 jt != u_pairs->end(); jt++)
511 for (list<map_item>::const_iterator kt = jt->map_items.begin();
512 kt != jt->map_items.end(); kt++)
513 if (all_items.find(*kt) == all_items.end())
514 store_items.push_back(*kt);
515
516 for (list<map_item>::iterator jt = store_items.begin();
517 jt != store_items.end(); jt++)
518 {
519 // append m[i,n] <- <curr position> to c
520 tdfa_insn insn;
521 insn.to = *jt;
522 insn.save_pos = true;
523 c.push_back(insn);
524 }
525
526 /* If there is a state t_prime in states such that some
527 sequence of reordering commands r produces t_prime
528 from target, use t_prime as the target state,
529 appending the reordering commands to c. */
530 state *t_prime = find_equivalent(target, c);
531 if (t_prime != NULL)
532 {
533 delete target;
534 }
535 else
536 {
537 /* We need to actually add target to the dfa: */
538 t_prime = target;
539 add_state(t_prime);
540 worklist.push(t_prime);
541
542 if (t_prime->accepts)
543 {
544 // TODOXXX set the finisher of t_prime
545 }
546 }
547
548 /* Set the transition: */
549 it->to = t_prime;
550 it->action = c;
551 }
552 }
553 }
554
555 dfa::~dfa ()
556 {
557 state * s;
558 while ((s = first))
559 {
560 first = s->next;
561 delete s;
562 }
563
564 delete orig_nfa;
565 }
566
567 // ------------------------------------------------------------------------
568
569 // TODOXXX add emission instructions for tag_ops
570
571 void
572 span::emit_jump (translator_output *o, const dfa *d) const
573 {
574 #ifdef STAPREGEX_DEBUG_MATCH
575 o->newline () << "printf(\" --> GOTO yystate%d\\n\", " << to->label << ");";
576 #endif
577
578 // TODOXXX tags feature allows proper longest-match priority
579 if (to->accepts)
580 {
581 emit_final(o, d);
582 }
583 else
584 {
585 o->newline () << "YYCURSOR++;";
586 o->newline () << "goto yystate" << to->label << ";";
587 }
588 }
589
590 /* Assuming the target DFA of the span is a final state, emit code to
591 (TODOXXX) cleanup tags and exit with a final answer. */
592 void
593 span::emit_final (translator_output *o, const dfa *d) const
594 {
595 assert (to->accepts); // XXX: must guarantee correct usage of emit_final()
596 o->newline() << d->outcome_snippets[to->accept_outcome];
597 o->newline() << "goto yyfinish;";
598 }
599
600 string c_char(char c)
601 {
602 stringstream o;
603 o << "'";
604 print_escaped(o, c);
605 o << "'";
606 return o.str();
607 }
608
609 void
610 state::emit (translator_output *o, const dfa *d) const
611 {
612 o->newline() << "yystate" << label << ": ";
613 #ifdef STAPREGEX_DEBUG_MATCH
614 o->newline () << "printf(\"READ '%s' %c\", cur, *YYCURSOR);";
615 #endif
616 o->newline() << "switch (*YYCURSOR) {";
617 o->indent(1);
618 for (list<span>::const_iterator it = spans.begin();
619 it != spans.end(); it++)
620 {
621 // If we see a '\0', go immediately into an accept state:
622 if (it->lb == '\0')
623 {
624 o->newline() << "case " << c_char('\0') << ":";
625 it->emit_final(o, d); // TODOXXX extra function may be unneeded
626 }
627
628 // Emit labels to handle all the other elements of the span:
629 for (unsigned c = max('\1', it->lb); c <= (unsigned) it->ub; c++) {
630 o->newline() << "case " << c_char((char) c) << ":";
631 }
632 it->emit_jump(o, d);
633
634 // TODOXXX handle a 'default' set of characters for the largest span...
635 // TODOXXX optimize by accepting before end of string whenever possible... (also necessary for proper first-matched-substring selection)
636 }
637 o->newline(-1) << "}";
638 }
639
640 void
641 dfa::emit (translator_output *o) const
642 {
643 #ifdef STAPREGEX_DEBUG_DFA
644 print(o);
645 #else
646 o->newline() << "{";
647 o->newline(1);
648
649 // XXX: workaround for empty regex
650 if (first->accepts)
651 {
652 o->newline() << outcome_snippets[first->accept_outcome];
653 o->newline() << "goto yyfinish;";
654 }
655
656 for (state *s = first; s; s = s->next)
657 s->emit(o, this);
658
659 o->newline() << "yyfinish: ;";
660 o->newline(-1) << "}";
661 #endif
662 }
663
664 void
665 dfa::emit_tagsave (translator_output *, std::string,
666 std::string, std::string) const
667 {
668 // TODOXXX implement after testing the preceding algorithms
669 }
670
671 // ------------------------------------------------------------------------
672
673 std::ostream&
674 operator << (std::ostream &o, const tdfa_action& a)
675 {
676 for (list<tdfa_insn>::const_iterator it = a.begin();
677 it != a.end(); it++)
678 {
679 if (it != a.begin()) o << "; ";
680
681 o << "m[" << it->to.first << "," << it->to.second << "] <- ";
682
683 if (it->save_pos)
684 o << "p";
685 else
686 o << "m[" << it->from.first << "," << it->from.second << "]";
687 }
688
689 return o;
690 }
691
692 std::ostream&
693 operator << (std::ostream &o, const arc_priority& p)
694 {
695 o << p.first << "/" << (1 << p.second);
696 return o;
697 }
698
699 void
700 state::print (translator_output *o) const
701 {
702 o->line() << "state " << label;
703 if (accepts)
704 o->line() << " accepts " << accept_outcome;
705 if (!finalizer.empty())
706 o->line() << " [" << finalizer << "]";
707
708 o->indent(1);
709 for (list<span>::const_iterator it = spans.begin();
710 it != spans.end(); it++)
711 {
712 o->newline() << "'";
713 if (it->lb == it->ub)
714 {
715 print_escaped (o->line(), it->lb);
716 o->line() << " ";
717 }
718 else
719 {
720 print_escaped (o->line(), it->lb);
721 o->line() << "-";
722 print_escaped (o->line(), it->ub);
723 }
724
725 o->line() << "' -> " << it->to->label;
726
727 if (!it->action.empty())
728 o->line() << " [" << it->action << "]";
729 }
730 o->newline(-1);
731 }
732
733 void
734 dfa::print (std::ostream& o) const
735 {
736 translator_output to(o); print(&to);
737 }
738
739 void
740 dfa::print (translator_output *o) const
741 {
742 o->newline();
743 for (state *s = first; s; s = s->next)
744 {
745 s->print(o);
746 o->newline();
747 }
748 o->newline();
749 }
750
751 std::ostream&
752 operator << (std::ostream& o, const dfa& d)
753 {
754 d.print(o);
755 return o;
756 }
757
758 std::ostream&
759 operator << (std::ostream &o, const dfa *d)
760 {
761 o << *d;
762 return o;
763 }
764
765 };
766
767 /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */
This page took 0.069927 seconds and 5 git commands to generate.