]> sourceware.org Git - systemtap.git/blob - stapregex-dfa.cxx
add recent AUTHORS
[systemtap.git] / stapregex-dfa.cxx
1 // -*- C++ -*-
2 // Copyright (C) 2012-2018 Red Hat Inc.
3 //
4 // This file is part of systemtap, and is free software. You can
5 // redistribute it and/or modify it under the terms of the GNU General
6 // Public License (GPL); either version 2, or (at your option) any
7 // later version.
8 //
9 // ---
10 //
11 // This file incorporates code from the re2c project; please see
12 // the file README.stapregex for details.
13
14 #include <string>
15 #include <iostream>
16 #include <sstream>
17 #include <set>
18 #include <list>
19 #include <map>
20 #include <vector>
21 #include <stack>
22 #include <queue>
23 #include <utility>
24 #include <climits>
25
26 #include "translator-output.h"
27 #include "util.h"
28
29 #include "stapregex-parse.h"
30 #include "stapregex-tree.h"
31 #include "stapregex-dfa.h"
32
33 // Uncomment to show result of ins (NFA) compilation:
34 //#define STAPREGEX_DEBUG_INS
35 // Uncomment to emit DFA in a non-working compact format (use with -p3):
36 //#define STAPREGEX_DEBUG_DFA
37 // Uncomment to have the generated engine do a trace of visited states
38 // (only when testing using the standalone regtest module):
39 //#define STAPREGEX_DEBUG_MATCH
40
41 // Uncomment for a detailed walkthrough of the tagged-NFA conversion:
42 //#define STAPREGEX_DEBUG_TNFA
43
44 using namespace std;
45
46 namespace stapregex {
47
48 regexp *pad_re = NULL;
49 regexp *fail_re = NULL;
50
51 dfa *
52 stapregex_compile (regexp *re, const std::string& match_snippet,
53 const std::string& fail_snippet)
54 {
55 if (pad_re == NULL) {
56 // build regexp for ".*"
57 pad_re = make_dot ();
58 pad_re = new close_op (pad_re, true); // -- prefer shorter match
59 pad_re = new alt_op (pad_re, new null_op, true); // -- prefer second match
60 }
61 if (fail_re == NULL) {
62 // build regexp for ".*$", but allow '\0' and support fail outcome
63 fail_re = make_dot (true); // -- allow '\0'
64 fail_re = new close_op (fail_re, true); // -- prefer shorter match
65 fail_re = new alt_op (fail_re, new null_op, true); // -- prefer second match
66 fail_re = new cat_op (fail_re, new anchor_op('$'));
67 fail_re = new rule_op(fail_re, 0);
68 // XXX: this approach creates one extra spurious-but-safe state
69 // (safe because the matching procedure stops after encountering '\0')
70 }
71
72 vector<string> outcomes(2);
73 outcomes[0] = fail_snippet;
74 outcomes[1] = match_snippet;
75
76 int num_tags = re->num_tags;
77
78 // Pad & wrap re in appropriate rule_ops to control match behaviour:
79 bool anchored = re->anchored ();
80 if (!anchored) re = new cat_op(pad_re, re); // -- left-padding
81 re = new rule_op(re, 1);
82 re = new alt_op(re, fail_re);
83
84 #ifdef STAPREGEX_DEBUG_INS
85 cerr << "RESULTING INS FROM REGEX " << re << ":" << endl;
86 #endif
87
88 ins *i = re->compile();
89
90 #ifdef STAPREGEX_DEBUG_INS
91 for (const ins *j = i; (j - i) < (int)re->ins_size() + 1; )
92 {
93 j = show_ins(cerr, j, i); cerr << endl;
94 }
95 cerr << endl;
96 #endif
97
98 ins_optimize(i);
99 for (ins *j = i; (j - i) < (int)re->ins_size() + 1; )
100 {
101 unmark(j);
102 if (j->i.tag == CHAR)
103 j = (ins *) j->i.link;
104 else
105 j++;
106 }
107
108 #ifdef STAPREGEX_DEBUG_INS
109 cerr << "OPTIMIZED INS FROM THE SAME REGEX" << endl;
110 for (const ins *j = i; (j - i) < (int)re->ins_size() + 1; )
111 {
112 j = show_ins(cerr, j, i); cerr << endl;
113 }
114 cerr << endl;
115 #endif
116
117 dfa *d = new dfa(i, num_tags, outcomes);
118
119 // Carefully deallocate temporary scaffolding:
120 if (!anchored) delete ((rule_op*) ((alt_op*) re)->a)->re; // -- new cat_op
121 delete ((alt_op*) re)->a; // -- new rule_op
122 delete re; // -- new alt_op
123 // NB: deleting a regular expression DOES NOT deallocate its
124 // children. The original re parameter is presumed to be retained
125 // indefinitely as part of a stapdfa table, or such....
126
127 return d;
128 }
129
130 // ------------------------------------------------------------------------
131
132 /* Now follows the heart of the tagged-DFA algorithm. This is a basic
133 implementation of the algorithm described in Ville Laurikari's
134 Masters thesis and summarized in the paper "NFAs with Tagged
135 Transitions, their Conversion to Deterministic Automata and
136 Application to Regular Expressions"
137 (http://laurikari.net/ville/spire2000-tnfa.pdf).
138
139 HERE BE DRAGONS (and not the friendly kind) */
140
141 /* Functions to deal with relative transition priorities: */
142
143 arc_priority
144 refine_higher(const arc_priority& a)
145 {
146 if (a.first > ULLONG_MAX/4) // detect overflow
147 throw regex_error(_("arc_priority overflow due to excessive branching factor"), 0);
148 return make_pair(2 * a.first + 1, a.second + 1);
149 }
150
151 arc_priority
152 refine_lower (const arc_priority& a)
153 {
154 if (a.first > ULLONG_MAX/4) // detect overflow
155 throw regex_error(_("arc_priority overflow due to excessive branching factor"), 0);
156 return make_pair(2 * a.first, a.second + 1);
157 }
158
159 int
160 arc_compare (const arc_priority& a, const arc_priority& b)
161 {
162 unsigned long x = a.first;
163 unsigned long y = b.first;
164
165 if (a.second > b.second)
166 y = y << (a.second - b.second);
167 else if (a.second < b.second)
168 x = x << (b.second - a.second);
169
170 // Special case: 0/n </> 0/m iff m </> n.
171 // This is because such priorities are obtained by refine_lower().
172 if (x == 0 && y == 0)
173 return ( a.second == b.second ? 0 : a.second < b.second ? 1 : -1 );
174
175 return ( x == y ? 0 : x < y ? -1 : 1 );
176 }
177
178 /* Manage the linked list of states in a DFA: */
179
180 state::state (dfa *owner, state_kernel *kernel)
181 : owner(owner), label(~0), next(NULL), kernel(kernel),
182 accepts(false), accept_outcome(0) {}
183
184 void
185 dfa::add_map_item (const map_item &m)
186 {
187 // TODOXXX: later, compute a mapping into a single-level tag_states array
188 // TODOXXX: could drop the +1 and instead subtract 1 in YYTAG macro
189 nmapitems = max(nmapitems, m.second) + 1;
190 }
191
192 state *
193 dfa::add_state (state *s)
194 {
195 s->label = nstates++;
196
197 if (last == NULL)
198 {
199 last = s;
200 first = last;
201 }
202 else
203 {
204 // append to the end
205 last->next = s;
206 last = last->next;
207 }
208
209 return last;
210 }
211
212 /* Operations to build a simple kernel prior to taking closure: */
213
214 /* Create a new kernel_point in kernel with empty map items. */
215 void
216 add_kernel (state_kernel *kernel, ins *i)
217 {
218 kernel_point point;
219 point.i = i;
220 point.priority = MAKE_START_PRIORITY;
221 // NB: point->map_items is empty
222
223 kernel->push_back(point);
224 }
225
226 state_kernel *
227 make_kernel (ins *i)
228 {
229 state_kernel *kernel = new state_kernel;
230 add_kernel (kernel, i);
231 return kernel;
232 }
233
234 struct sort_priorities {
235 bool operator ()(const arc_priority &a, const arc_priority &b)
236 {
237 return arc_compare(a,b) > 0;
238 }
239 };
240
241 struct sort_denominator {
242 bool operator ()(const arc_priority &a, const arc_priority &b)
243 {
244 return a.second > b.second;
245 }
246 };
247
248 struct sort_kernel_points {
249 bool operator ()(const kernel_point &k, const kernel_point &l)
250 {
251 return arc_compare(k.priority, l.priority) > 0;
252 }
253 };
254
255 // Move points from worklist to new_worklist while rebalancing priorities:
256 void
257 rebalance_priorities(stack<kernel_point> &worklist, stack<kernel_point> &new_worklist)
258 {
259 // Sort worklist in order of priority:
260 priority_queue<kernel_point, vector<kernel_point>, sort_kernel_points> sorted_worklist;
261 while (!worklist.empty())
262 {
263 kernel_point point = worklist.top(); worklist.pop();
264 sorted_worklist.push(point);
265 }
266
267 // Generate a 'clean' set of priorities:
268 priority_queue<arc_priority, vector<arc_priority>, sort_priorities> sorted_priorities;
269 priority_queue<arc_priority, vector<arc_priority>, sort_denominator> new_priorities;
270 new_priorities.push(MAKE_START_PRIORITY);
271 while (new_priorities.size() < sorted_worklist.size())
272 {
273 arc_priority new_priority = new_priorities.top(); new_priorities.pop();
274 new_priorities.push(refine_higher(new_priority));
275 new_priorities.push(refine_lower(new_priority));
276 }
277 for (unsigned i = 0; i < sorted_worklist.size(); i++)
278 {
279 arc_priority new_priority = new_priorities.top(); new_priorities.pop();
280 sorted_priorities.push(new_priority);
281 }
282
283 while (!sorted_worklist.empty())
284 {
285 kernel_point point = sorted_worklist.top(); sorted_worklist.pop();
286 arc_priority new_priority = sorted_priorities.top(); sorted_priorities.pop();
287 point.priority = new_priority;
288 new_worklist.push(point);
289 }
290 }
291
292 /* Compute the set of kernel_points that are 'tag-wise unambiguously
293 reachable' from a given initial set of points. Absent tagging, this
294 becomes a bog-standard NFA e_closure construction. */
295 state_kernel *
296 te_closure (dfa *dfa, state_kernel *start, int ntags, bool is_initial = false)
297 {
298 state_kernel *closure = new state_kernel(*start);
299 stack<kernel_point> base_worklist; // -- with old priorities
300 stack<kernel_point> worklist; // -- with rebalanced priorities
301 // XXX: state_kernel is a list<kernel_point> so we avoid iterator
302 // invalidation and make a new copy of each kernel_point from start
303
304 /* To avoid searching through closure incessantly when retrieving
305 information about existing elements, the following caches are
306 needed: */
307 vector<unsigned> max_tags (ntags, 0);
308 map<ins *, list<list<kernel_point>::iterator> > closure_map;
309
310 /* Cache initial elements of closure: */
311 for (state_kernel::iterator it = closure->begin();
312 it != closure->end(); it++)
313 {
314 #if 0
315 cerr << "**DEBUG** initial closure point ";
316 it->print(cerr, dfa->orig_nfa);
317 cerr << endl;
318 #endif
319
320 base_worklist.push(*it); // -- push with existing priority, rebalance later
321
322 // Store the element in relevant caches:
323
324 for (list<map_item>::const_iterator jt = it->map_items.begin();
325 jt != it->map_items.end(); jt++)
326 max_tags[jt->first] = max(jt->second, max_tags[jt->first]);
327
328 closure_map[it->i].push_back(it);
329 }
330
331 // PR23608: Retaining the priority from the previous state has the
332 // potential to overflow the arc_priority representation with large
333 // numbers when there are many distinct DFA states. This should
334 // cause an explicit assertion failure if it occurs in practice (see
335 // refine_*()), e.g. with long non-branching regexes such as
336 // "aaaa...aaaaa".
337 //
338 // Fixed by adding an explicit step to rebalance priorities:
339 rebalance_priorities(base_worklist, worklist);
340
341 while (!worklist.empty())
342 {
343 kernel_point point = worklist.top(); worklist.pop();
344
345 // Identify e-transitions depending on the opcode.
346 // There are at most two e-transitions emerging from an insn.
347 // If we have two e-transitions, the 'other' has higher priority.
348
349 ins *target = NULL; int tag = -1;
350 ins *other_target = NULL; int other_tag = -1;
351
352 bool do_split = false;
353
354 if (point.i->i.tag == TAG)
355 {
356 target = &point.i[1];
357 tag = (int) point.i->i.param;
358 }
359 else if (point.i->i.tag == FORK && point.i == (ins *) point.i->i.link)
360 {
361 /* Workaround for a FORK that points to itself: */
362 target = &point.i[1];
363 }
364 else if (point.i->i.tag == FORK)
365 {
366 do_split = true;
367 // Relative priority of two e-transitions depends on param:
368 if (point.i->i.param)
369 {
370 // Prefer jumping to link.
371 target = &point.i[1];
372 other_target = (ins *) point.i->i.link;
373 }
374 else
375 {
376 // Prefer stepping to next instruction.
377 target = (ins *) point.i->i.link;
378 other_target = &point.i[1];
379 }
380 }
381 else if (point.i->i.tag == GOTO)
382 {
383 target = (ins *) point.i->i.link;
384 }
385 else if (point.i->i.tag == INIT && is_initial)
386 {
387 target = &point.i[1];
388 }
389
390 bool already_found;
391
392 // Data for the endpoint of the first transition:
393 kernel_point next;
394 next.i = target;
395 next.priority = do_split ? refine_lower(point.priority) : point.priority;
396 next.map_items = point.map_items;
397
398 // Date for the endpoint of the second transition:
399 kernel_point other_next;
400 other_next.i = other_target;
401 other_next.priority = do_split ? refine_higher(point.priority) : point.priority;
402 other_next.map_items = point.map_items;
403
404 // Do infinite-loop-check:
405 other_next.parents = point.parents;
406 if (point.parents.find(other_next.i) != point.parents.end())
407 {
408 other_target = NULL;
409 other_tag = -1;
410 }
411 other_next.parents.insert(other_next.i);
412
413 next.parents = point.parents;
414 if (point.parents.find(next.i) != point.parents.end())
415 {
416 // target = NULL;
417 // tag = -1;
418 // <- XXX will be overwritten by other_target / other_tag immediately
419 goto next_target;
420 }
421 next.parents.insert(next.i);
422
423 another_transition:
424 if (target == NULL)
425 continue;
426
427 // Deal with the current e-transition:
428
429 if (tag >= 0)
430 {
431 /* Delete all existing next.map_items of the form m[tag,x]. */
432 for (list<map_item>::iterator it = next.map_items.begin();
433 it != next.map_items.end(); )
434 if (it->first == (unsigned) tag)
435 {
436 list<map_item>::iterator next_it = it;
437 next_it++;
438 next.map_items.erase (it);
439 it = next_it;
440 }
441 else it++;
442
443 /* Add m[tag,x] to next.map_items, where x is the smallest
444 nonnegative integer such that m[tag,x] does not occur
445 anywhere in closure. Then update the cache. */
446 unsigned x = max_tags[tag];
447 next.map_items.push_back(make_pair(tag, ++x));
448 max_tags[tag] = x;
449 }
450
451 /* Deal with similar transitions that have a different priority: */
452 already_found = false;
453 for (list<list<kernel_point>::iterator>::iterator it
454 = closure_map[next.i].begin();
455 it != closure_map[next.i].end(); )
456 {
457 // NB: it is an iterator into closure_map[next.i],
458 // while *it is an iterator into closure
459
460 int result = arc_compare(next.priority, (*it)->priority);
461 if (result == 0)
462 {
463 ins *base = dfa->orig_nfa;
464 cerr << "stapregex **UNEXPECTED** -- identical arc_priorities for ";
465 (*it)->print(cerr, base);
466 cerr << " and ";
467 next.print(cerr, base);
468 cerr << endl;
469 }
470 #if 0
471 // XXX This is an experimental solution which did not work correctly.
472 if (result == 0 && (*it)->i == next.i)
473 {
474 // Reached the same kernel_point via two alternate
475 // (equal priority) paths. Merge map_items from next into *it:
476 cerr << "**DEBUG** (merging paths for same ins)" << endl;
477 for (list<map_item>::iterator jt = next.map_items.begin();
478 jt != next.map_items.end(); jt++)
479 (*it)->map_items.push_back(*jt);
480 }
481 else
482 #endif
483 assert (result != 0); // Expect this to fail.
484
485 if (result > 0) { // i.e. next.priority > (*it)->priority
486 #if 0
487 ins *base = dfa->orig_nfa;
488 cerr << "**DEBUG** erasing ";
489 (*it)->print(cerr, base);
490 cerr << " in favour of ";
491 next.print(cerr, base);
492 cerr << endl;
493 #endif
494
495 // next.priority is higher, delete existing element
496 closure->erase(*it);
497
498 // obnoxious shuffle to avoid iterator invalidation
499 list<list<kernel_point>::iterator>::iterator old_it = it;
500 it++;
501 closure_map[next.i].erase(old_it);
502 continue;
503 } else { // result <= 0
504 // next.priority is lower, skip adding next
505 already_found = true;
506 }
507
508 it++;
509 }
510
511 if (!already_found) {
512 #if 0
513 cerr << "**DEBUG** added to closure: ";
514 next.print(cerr, dfa->orig_nfa);
515 cerr << endl;
516 #endif
517
518 // Store the element in closure:
519 closure->push_back(next);
520 worklist.push(next);
521
522 // Store the element in relevant caches:
523
524 list<kernel_point>::iterator next_it = closure->end();
525 next_it --; // XXX rewind to just-pushed element
526 closure_map[next.i].push_back(next_it);
527
528 for (list<map_item>::iterator jt = next.map_items.begin();
529 jt != next.map_items.end(); jt++)
530 max_tags[jt->first] = max(jt->second, max_tags[jt->first]);
531
532 }
533
534 next_target:
535 // Now move to dealing with the second e-transition, if any.
536 target = other_target; other_target = NULL;
537 tag = other_tag; other_tag = -1;
538 next = other_next;
539
540 goto another_transition;
541 }
542
543 return closure;
544 }
545
546 /* Helpers for constructing span table: */
547
548 bool
549 same_ins(list<kernel_point> &e1, list<kernel_point> &e2)
550 {
551 set<ins *> s1;
552 for (list<kernel_point>::iterator it = e1.begin();
553 it != e1.end(); it++)
554 s1.insert(it->i);
555 set<ins *> s2;
556 for (list<kernel_point>::iterator it = e2.begin();
557 it != e2.end(); it++)
558 s2.insert(it->i);
559 return s1 == s2;
560 }
561
562 /* Helpers for constructing TDFA actions: */
563
564 /* Find the set of reordering commands (if any) that will get us from
565 state s to some existing state in the dfa (returns the state in
566 question, appends reordering commands to r). Returns NULL is no
567 suitable state is found. */
568 state *
569 dfa::find_equivalent (state *s, tdfa_action &action)
570 {
571 state *answer = NULL;
572
573 for (state_kernel::iterator it = s->kernel->begin();
574 it != s->kernel->end(); it++)
575 mark(it->i);
576
577 /* Check kernels of existing states for size equivalence and for
578 unmarked items (similar to re2c's original algorithm): */
579 unsigned n = s->kernel->size();
580 map<map_item, map_item> shift_map;
581 map<map_item, map_item> shift_back;
582 for (state *t = first; t != NULL; t = t->next)
583 {
584 if (t->kernel->size() == n)
585 {
586 for (state_kernel::iterator it = t->kernel->begin();
587 it != t->kernel->end(); it++)
588 if (!marked(it->i))
589 goto next_state;
590
591 // Check for existence of a reordering tdfa_action r that will
592 // produce identical kernel_points with identical map values.
593
594 // XXX In the below code, we search for more-or-less an
595 // arbitrary permutation of map values.
596 //
597 // To simplify the algorithm, we could instead only check
598 // where lower-index map values are missing from s and
599 // replace them with higher-index map values. The paper
600 // claims this leads to only a slight penalty in number of
601 // TDFA states.
602
603 // Mapping must be one-to-one; check consistency in both directions:
604 shift_map.clear(); // map item of s -> map item of t
605 shift_back.clear(); // map item of t -> map item of s
606
607 for (state_kernel::iterator it = s->kernel->begin();
608 it != s->kernel->end(); it++)
609 {
610 kernel_point *kp1 = &*it;
611 kernel_point *kp2 = 0;
612
613 // Find matching kernel_point in t:
614 bool found_kp = false;
615 for (state_kernel::iterator jt = t->kernel->begin();
616 jt != t->kernel->end(); jt++)
617 if (kp1->i == jt->i)
618 {
619 // XXX check that ins appears only once
620 assert (!found_kp);
621 kp2 = &*jt; // TODO found matching point
622 found_kp = true;
623 }
624 assert(found_kp);
625
626 set<int> seen_tags;
627 for (list<map_item>::iterator jt = kp1->map_items.begin();
628 jt != kp1->map_items.end(); jt++)
629 {
630 map_item mt1 = *jt;
631 map_item mt2;
632
633 // XXX check that tag appears only once
634 assert (seen_tags.count(mt1.first) == 0);
635 seen_tags.insert(mt1.first);
636
637 // Find matching map_item in kp2
638 bool found_tag = false;
639 for (list<map_item>::iterator kt = kp2->map_items.begin();
640 kt != kp2->map_items.end(); kt++)
641 if (mt1.first == kt->first)
642 {
643 // XXX check that tag appears only once
644 assert (!found_tag);
645 mt2 = *kt;
646 found_tag = true;
647 }
648
649 if (!found_tag) // if no matching tag, can't use this state
650 goto next_state;
651 if (shift_map.count(mt1) != 0
652 && shift_map[mt1] != mt2) // if contradiction
653 goto next_state;
654 if (shift_back.count(mt2) != 0
655 && shift_back[mt2] != mt1) // if contradiction
656 goto next_state;
657
658 shift_map[mt1] = mt2;
659 shift_back[mt2] = mt1;
660 }
661
662 // XXX check that every tag in kp2 appears in seen_tag
663 for (list<map_item>::iterator jt = kp2->map_items.begin();
664 jt != kp2->map_items.end(); jt++)
665 {
666 int t2 = jt->first;
667 if (seen_tags.count(t2) == 0)
668 goto next_state;
669 }
670 }
671
672 // #ifdef STAPREGEX_DEBUG_TNFA
673 // cerr << " -*- PRE CYCLE CHECK DEBUG obtained valid reorder ";
674 // for (map<map_item, map_item>::iterator it = shift_map.begin();
675 // it != shift_map.end(); it++)
676 // if (it->first != it->second)
677 // cerr << it->first << "=>" << it->second << " ";
678 // cerr << "to existing state " << t->label << endl;
679 // #endif
680
681 #if 1
682 // Check for cyclical dependencies in the resulting reorder.
683 // XXX: If we find a cycle, just create a new state. We could
684 // also break the cycle with a temporary variable.
685 set<map_item> cycle_okay; cycle_okay.clear();
686 set<map_item> cycle_seen; cycle_seen.clear();
687 for (map<map_item, map_item>::iterator it = shift_map.begin();
688 it != shift_map.end(); it++)
689 {
690 map_item m = it->first;
691 if (cycle_okay.count(m) != 0)
692 continue; // -- already checked for cycle
693
694 while (shift_map.count(m) != 0 && shift_map[m] != m)
695 {
696 if (cycle_okay.count(shift_map[m]) != 0)
697 break; // -- found not-cycle
698 if (cycle_seen.count(shift_map[m]) != 0)
699 goto next_state; // -- found cycle
700 cycle_seen.insert(m);
701 m = shift_map[m];
702 }
703
704 // If we reach the end of the chain, or find a map item
705 // where shift_map[m] == m, this is not considered a
706 // cycle, and therefore none of the map items leading to
707 // here are in cycles:
708 cycle_okay.insert(m);
709 for (set<map_item>::iterator jt = cycle_seen.begin();
710 jt != cycle_seen.end(); jt++)
711 cycle_okay.insert(*jt);
712 cycle_seen.clear();
713 }
714 #endif
715
716 #ifdef STAPREGEX_DEBUG_TNFA
717 cerr << " -*- obtained valid reorder ";
718 for (map<map_item, map_item>::iterator it = shift_map.begin();
719 it != shift_map.end(); it++)
720 if (it->first != it->second)
721 cerr << it->first << "=>" << it->second << " ";
722 cerr << "to existing state " << t->label << endl;
723 #endif
724
725 // Generate reordering command based on the contents of shift_map:
726 tdfa_action r;
727 set<map_item> saved; saved.clear(); // <- elts safe to overwite
728 queue<map_item> to_shift;
729 for (map<map_item, map_item>::iterator it = shift_back.begin();
730 it != shift_back.end(); it++)
731 if (it->first != it->second) // skip trivial shifts
732 to_shift.push(it->first);
733 while (!to_shift.empty())
734 {
735 map_item elt = to_shift.front(); to_shift.pop();
736 if (shift_map.count(elt) != 0 && saved.count(elt) == 0)
737 {
738 // Need to save it first -- put back on queue:
739 to_shift.push(elt);
740 continue;
741 }
742
743 tdfa_insn insn;
744 insn.to = elt;
745 insn.from = shift_back[elt];
746 insn.save_tag = false;
747 insn.save_pos = false;
748 r.push_back(insn);
749
750 // shift_back[elt] is now safe to overwrite
751 saved.insert(shift_back[elt]);
752 }
753
754 answer = t;
755 action.insert(action.end(), r.begin(), r.end()); // XXX append
756 goto cleanup;
757 }
758 next_state:
759 ;
760 }
761
762 cleanup:
763 for (state_kernel::iterator it = s->kernel->begin();
764 it != s->kernel->end(); it++)
765 unmark(it->i);
766
767 return answer;
768 }
769
770 /* Generate position-save commands for any map items in new_k that do
771 not appear in old_k (old_k can be NULL). */
772 tdfa_action
773 dfa::compute_action (state_kernel *old_k, state_kernel *new_k)
774 {
775 tdfa_action c;
776
777 set<map_item> old_items;
778 if (old_k != NULL)
779 for (state_kernel::const_iterator it = old_k->begin();
780 it != old_k->end(); it++)
781 for (list<map_item>::const_iterator jt = it->map_items.begin();
782 jt != it->map_items.end(); jt++)
783 old_items.insert(*jt);
784
785 // XXX: use a set, since we only need one position-save per new map item
786 set<map_item> store_items;
787 for (state_kernel::const_iterator it = new_k->begin();
788 it != new_k->end(); it++)
789 for (list<map_item>::const_iterator jt = it->map_items.begin();
790 jt != it->map_items.end(); jt++)
791 if (old_items.find(*jt) == old_items.end())
792 store_items.insert(*jt);
793
794 for (set<map_item>::iterator it = store_items.begin();
795 it != store_items.end(); it++)
796 {
797 // ensure room for m[i,n] is present in tag_states:
798 add_map_item(*it);
799
800 // append m[i,n] <- <curr position> to c
801 tdfa_insn insn;
802 insn.to = *it;
803 insn.save_tag = false;
804 insn.save_pos = true;
805 c.push_back(insn);
806 }
807
808 return c;
809 }
810
811 tdfa_action
812 dfa::compute_finalizer (state *s)
813 {
814 // TODO VERIFY THAT THIS WORKS -- CAN THERE BE CONFLICTS?
815 tdfa_action c;
816 assert (s->accept_kp != NULL);
817
818 // iterate map items m[i,j]
819 for (list<map_item>::iterator it = s->accept_kp->map_items.begin();
820 it != s->accept_kp->map_items.end(); it++)
821 {
822 // append t[i] <- m[i,j] to c
823 tdfa_insn insn;
824 insn.from = *it;
825 insn.save_tag = true;
826 insn.save_pos = false;
827 c.push_back(insn);
828 }
829
830 return c;
831 }
832
833 /* The main DFA-construction algorithm: */
834
835 dfa::dfa (ins *i, int ntags, vector<string>& outcome_snippets,
836 int accept_outcome)
837 : orig_nfa(i), nstates(0), nmapitems(0), ntags(ntags),
838 outcome_snippets(outcome_snippets), success_outcome(accept_outcome)
839 {
840 #ifdef STAPREGEX_DEBUG_TNFA
841 cerr << "DFA CONSTRUCTION (ntags=" << ntags << "):" << endl;
842 #endif
843
844 // XXX: Longest-match priority requires one success and one failure outcome:
845 if (ntags > 0)
846 {
847 assert(outcome_snippets.size() == 2);
848 assert(success_outcome == 1);
849 fail_outcome = 0;
850 }
851
852 /* Initialize empty linked list of states: */
853 first = last = NULL;
854
855 ins *start = &i[0];
856 state_kernel *seed_kernel = make_kernel(start);
857 state_kernel *initial_kernel = te_closure(this, seed_kernel, ntags, true);
858 delete seed_kernel;
859 state *initial = add_state(new state(this, initial_kernel));
860 queue<state *> worklist; worklist.push(initial);
861
862 initializer = compute_action(NULL, initial_kernel);
863 #ifdef STAPREGEX_DEBUG_TNFA
864 cerr << " - constructed initializer " << initializer << endl << endl;
865 #endif
866
867 while (!worklist.empty())
868 {
869 state *curr = worklist.front(); worklist.pop();
870
871 // Kernel points before and after each edge:
872 vector<list<kernel_point> > edge_begin(NUM_REAL_CHARS);
873 vector<list<kernel_point> > edge_end(NUM_REAL_CHARS);
874
875 /* Using the CHAR instructions in kernel, build the initial
876 table of spans for curr. Also check for final states. */
877
878 for (list<kernel_point>::iterator it = curr->kernel->begin();
879 it != curr->kernel->end(); it++)
880 {
881 if (it->i->i.tag == CHAR)
882 {
883 // Add a new kernel_point for each targeted insn:
884 for (ins *j = &it->i[1]; j < (ins *) it->i->i.link; j++)
885 {
886 // XXX: deallocate together with span table
887 kernel_point point;
888 point.i = (ins *) it->i->i.link;
889 point.priority = it->priority;
890 point.map_items = it->map_items; // copy map items
891
892 edge_begin[j->c.value].push_back(*it);
893 edge_end[j->c.value].push_back(point);
894 }
895 }
896 else if (it->i->i.tag == ACCEPT)
897 {
898 /* In case of multiple accepting NFA states,
899 prefer the highest numbered outcome.
900
901 XXX: A possible refinement (commented-out).
902 In case of NFA states with identical outcomes
903 pick the one with the highest arc_priority. */
904 if (!curr->accepts || it->i->i.param > curr->accept_outcome
905 /* || arc_compare(it->priority, curr->accept_kp->priority) > 0 */)
906 {
907 curr->accept_kp = &*it;
908 curr->accept_outcome = it->i->i.param;
909 }
910 curr->accepts = true;
911 }
912 }
913
914 /* If the state was marked as accepting, add a finalizer: */
915 if (curr->accepts)
916 {
917 assert(curr->finalizer.empty()); // XXX: only process a state once
918 curr->finalizer = compute_finalizer(curr);
919 }
920
921 for (unsigned c = 0; c < NUM_REAL_CHARS; )
922 {
923 list<kernel_point> eb = edge_begin[c];
924 list<kernel_point> ee = edge_end[c];
925 assert (!ee.empty()); // XXX: ensured by fail_re in stapregex_compile
926
927 span s;
928
929 s.lb = c;
930
931 while (++c < NUM_REAL_CHARS && same_ins(edge_end[c], ee)) ;
932
933 s.ub = c - 1;
934
935 s.reach_pairs = new state_kernel;
936 s.jump_pairs = new state_kernel;
937
938 for (list<kernel_point>::iterator it = eb.begin();
939 it != eb.end(); it++)
940 s.jump_pairs->push_back(*it);
941 for (list<kernel_point>::iterator it = ee.begin();
942 it != ee.end(); it++)
943 s.reach_pairs->push_back(*it);
944
945 curr->spans.push_back(s);
946 }
947
948 /* For each of the spans in curr, determine the reachable
949 points assuming a character in the span. */
950 #ifdef STAPREGEX_DEBUG_TNFA
951 cerr << "building transitions for state " << curr->label << ":" << endl;
952 #endif
953 for (list<span>::iterator it = curr->spans.begin();
954 it != curr->spans.end(); it++)
955 {
956 /* Set up candidate target state: */
957 state_kernel *u_pairs = te_closure(this, it->reach_pairs, ntags);
958 state *target = new state(this, u_pairs);
959
960 /* Generate position-save commands for any map items
961 that do not appear in the edge: */
962 tdfa_action c = compute_action(it->jump_pairs, u_pairs);
963
964 /* If there is a state t_prime in states such that some
965 sequence of reordering commands r produces t_prime
966 from target, use t_prime as the target state,
967 appending the reordering commands to c. */
968 state *t_prime = find_equivalent(target, c);
969 if (t_prime != NULL)
970 {
971 assert (t_prime != target);
972 delete target;
973 }
974 else
975 {
976 /* We need to actually add target to the dfa: */
977 t_prime = target;
978 add_state(t_prime);
979 worklist.push(t_prime);
980 #ifdef STAPREGEX_DEBUG_TNFA
981 cerr << " -*- add new state " << t_prime->label << endl;
982 #endif
983 }
984
985 /* Set the transition: */
986 it->to = t_prime;
987 it->action = c;
988 }
989 #ifdef STAPREGEX_DEBUG_TNFA
990 cerr << " -> constructed " << curr << endl;
991 #endif
992 }
993 #ifdef STAPREGEX_DEBUG_TNFA
994 cerr << endl;
995 #endif
996 }
997
998 dfa::~dfa ()
999 {
1000 state * s;
1001 while ((s = first))
1002 {
1003 first = s->next;
1004 delete s;
1005 }
1006
1007 delete orig_nfa;
1008 }
1009
1010 // ------------------------------------------------------------------------
1011
1012 void
1013 span::emit_jump (translator_output *o, const dfa *d) const
1014 {
1015 #ifdef STAPREGEX_DEBUG_MATCH
1016 o->newline () << "_stp_printf(\" --> @%ld GOTO yystate%d\\n\", "
1017 << "YYLENGTH, " << to->label << ");";
1018 o->newline () << "_stp_print_flush();";
1019 #endif
1020
1021 if (to->accepts)
1022 {
1023 emit_final(o, d);
1024 return;
1025 }
1026
1027 // We record map_items *after* consuming YYCURSOR:
1028 o->newline () << "YYCURSOR++;";
1029 d->emit_action(o, action);
1030 o->newline () << "goto yystate" << to->label << ";";
1031 }
1032
1033 /* Assuming the target DFA state of the span is a final state, emit code to
1034 cleanup tags and (if appropriate) exit with a final answer. */
1035 void
1036 span::emit_final (translator_output *o, const dfa *d) const
1037 {
1038 assert (to->accepts); // XXX: must guarantee correct usage of emit_final()
1039
1040 // We record map_items *after* consuming YYCURSOR:
1041 o->newline () << "YYCURSOR++;";
1042 d->emit_action(o, action);
1043
1044 // XXX: Note that condition to->finalizer.empty() is only
1045 // appropriate for the two-outcome scheme with one outcome being a
1046 // failure.
1047 if (d->ntags == 0 || to->finalizer.empty()) // terminate immediately
1048 {
1049 d->emit_action(o, to->finalizer);
1050 if (d->ntags == 0)
1051 {
1052 o->newline() << d->outcome_snippets[to->accept_outcome];
1053 o->newline() << "goto yyfinish;";
1054 }
1055 else
1056 {
1057 // Need to return the outcome associated with the longest match:
1058 o->newline() << "if ( YYFINAL(0) >= 0 ) {";
1059 o->newline(1) << d->outcome_snippets[d->success_outcome];
1060 o->newline(-1) << "} else {";
1061 o->newline(1) << d->outcome_snippets[d->fail_outcome];
1062 o->newline(-1) << "}";
1063 o->newline() << "goto yyfinish;";
1064 }
1065 }
1066 else
1067 {
1068 // Ensure longest-match priority by comparing length + start coord:
1069 map_item new_tag_0; bool found = false;
1070 for (tdfa_action::iterator it = to->finalizer.begin();
1071 it != to->finalizer.end(); it++)
1072 // TODOXXX: Only works if finalizer only contains reordering commands
1073 // (perhaps make that into an explicitly checked condition?)
1074 if (it->save_tag && it->from.first == 0)
1075 {
1076 new_tag_0 = it->from; // the map_item saved to tag 0
1077 found = true;
1078 }
1079 assert(found);
1080 #define NEW_TAG_0 "YYTAG(" << new_tag_0.first << "," << new_tag_0.second << ")"
1081 // if (new_tag_0 == old_tag_0 && new_length > old_length) emit action;
1082 o->newline() << "if ( YYFINAL(0) < 0 || "
1083 << "(" << NEW_TAG_0 << " == YYFINAL(0) &&";
1084 o->newline() << " (YYLENGTH - " << NEW_TAG_0 << ")"
1085 << " > (YYFINAL(1) - YYFINAL(0)))) {";
1086 o->newline(1); d->emit_action(o, to->finalizer);
1087 o->indent(-1);
1088 o->newline() << "}";
1089
1090 o->newline () << "goto yystate" << to->label << ";";
1091 }
1092 }
1093
1094 string c_char(rchar c)
1095 {
1096 stringstream o;
1097 o << "'";
1098 print_escaped(o, c);
1099 o << "'";
1100 return o.str();
1101 }
1102
1103 void
1104 state::emit (translator_output *o, const dfa *d) const
1105 {
1106 o->newline() << "yystate" << label << ": ";
1107 #ifdef STAPREGEX_DEBUG_MATCH
1108 o->newline () << "_stp_printf(\"@%ld READ '%s' %c\", "
1109 << "YYLENGTH, cur, *YYCURSOR);";
1110 o->newline () << "_stp_print_flush();";
1111 #endif
1112 o->newline() << "switch (*YYCURSOR) {";
1113 o->indent(1);
1114 const span *default_span = NULL;
1115 for (list<span>::const_iterator it = spans.begin();
1116 it != spans.end(); it++)
1117 {
1118 // If we see a '\0', go immediately into an accept state:
1119 if (it->lb == '\0')
1120 {
1121 o->newline() << "case " << c_char('\0') << ":";
1122 it->emit_final(o, d);
1123 }
1124
1125 // Emit labels to handle all the other elements of the span:
1126 for (unsigned c = max((rchar) '\1', it->lb);
1127 c <= (unsigned) it->ub; c++) {
1128 if (c > 127)
1129 {
1130 default_span = &(*it);
1131 continue; // XXX: not an ASCII char, needs special handling
1132 }
1133 o->newline() << "case " << c_char((rchar) c) << ":";
1134 }
1135 it->emit_jump(o, d);
1136
1137 // TODOXXX 'default' option should handle the largest span
1138 // TODOXXX optimize by accepting before end of string whenever possible
1139 }
1140 if (default_span)
1141 {
1142 // Handle a non-ASCII (unknown) char:
1143 o->newline() << "default:";
1144 default_span->emit_jump(o, d);
1145 }
1146 o->newline(-1) << "}";
1147 }
1148
1149 void
1150 dfa::emit (translator_output *o) const
1151 {
1152 #ifdef STAPREGEX_DEBUG_DFA
1153 print(o);
1154 #else
1155 o->newline() << "{";
1156 o->newline(1);
1157
1158 // Initialize tags:
1159 if (ntags > 0)
1160 {
1161 o->newline() << "unsigned int i;";
1162 o->newline() << "for (i = 0; i < STAPREGEX_MAX_TAG; i++)";
1163 o->newline(1) << "YYFINAL(i) = -1;";
1164 o->indent(-1);
1165 }
1166
1167 emit_action(o, initializer);
1168
1169 if (first->accepts)
1170 {
1171 emit_action(o, first->finalizer);
1172 }
1173 if (first->accepts && ntags == 0) // XXX workaround for empty regex
1174 {
1175 o->newline() << outcome_snippets[first->accept_outcome];
1176 o->newline() << "goto yyfinish;";
1177 }
1178
1179 for (state *s = first; s; s = s->next)
1180 s->emit(o, this);
1181
1182 o->newline() << "yyfinish: ;";
1183 o->newline(-1) << "}";
1184 #endif
1185 }
1186
1187 void
1188 dfa::emit_action (translator_output *o, const tdfa_action &act) const
1189 {
1190 #ifdef STAPREGEX_DEBUG_MATCH
1191 o->newline () << "_stp_printf(\" --> @%ld, SET_TAG %s\\n\", "
1192 << "YYLENGTH, \"" << act << "\");";
1193 o->newline () << "_stp_print_flush();";
1194 #endif
1195 for (tdfa_action::const_iterator it = act.begin(); it != act.end(); it++)
1196 {
1197 if (it->save_tag)
1198 o->newline() << "YYFINAL(" << it->from.first << ") = ";
1199 else
1200 o->newline() << "YYTAG(" << it->to.first
1201 << "," << it->to.second << ") = ";
1202 if (it->save_pos)
1203 o->line() << "YYLENGTH";
1204 else
1205 o->line() << "YYTAG(" << it->from.first
1206 << "," << it->from.second << ")";
1207 o->line() << ";";
1208 }
1209 }
1210
1211 void
1212 dfa::emit_tagsave (translator_output *o, std::string,
1213 std::string, std::string num_final_tags) const
1214 {
1215 // TODOXXX: ignoring other two snippets (tag_states and tag_vals),
1216 // which are handled by the earlier code in the actual matcher.
1217 o->newline() << num_final_tags << " = " << ntags << ";";
1218 }
1219
1220 // ------------------------------------------------------------------------
1221
1222 std::ostream&
1223 operator << (std::ostream &o, const map_item& m)
1224 {
1225 o << "m[" << m.first << "," << m.second << "]";
1226 return o;
1227 }
1228
1229 std::ostream&
1230 operator << (std::ostream &o, const tdfa_action& a)
1231 {
1232 for (list<tdfa_insn>::const_iterator it = a.begin();
1233 it != a.end(); it++)
1234 {
1235 if (it != a.begin()) o << "; ";
1236
1237 if (it->save_tag)
1238 o << "t[" << it->from.first << "] <- ";
1239 else
1240 o << it->to << " <- ";
1241
1242 if (it->save_pos)
1243 o << "p";
1244 else
1245 o << it->from;
1246 }
1247
1248 return o;
1249 }
1250
1251 std::ostream&
1252 operator << (std::ostream &o, const arc_priority& p)
1253 {
1254 o << p.first << "/" << (1 << p.second);
1255 return o;
1256 }
1257
1258 void
1259 kernel_point::print (std::ostream &o, ins *base) const
1260 {
1261 o << (i - base);
1262 o << "[" << priority << "]";
1263 if (!map_items.empty())
1264 {
1265 o << ":";
1266 for (list<map_item>::const_iterator it = map_items.begin();
1267 it != map_items.end(); it++)
1268 {
1269 if (it != map_items.begin()) o << ",";
1270 o << *it;
1271 }
1272 }
1273 }
1274
1275 void
1276 state::print (translator_output *o) const
1277 {
1278 o->line() << "state " << label;
1279
1280 #ifdef STAPREGEX_DEBUG_TNFA
1281 // For debugging, also show the kernel:
1282 ins *base = owner->orig_nfa;
1283 o->line() << " w/kernel {";
1284 for (state_kernel::iterator it = kernel->begin();
1285 it != kernel->end(); it++)
1286 {
1287 if (it != kernel->begin()) o->line() << "; ";
1288 it->print(o->line(), base);
1289 }
1290 o->line() << "}";
1291
1292 // Also print information for constructing reorderings:
1293 set<map_item> all_items;
1294 for (state_kernel::iterator it = kernel->begin();
1295 it != kernel->end(); it++)
1296 for (list<map_item>::iterator jt = it->map_items.begin();
1297 jt != it->map_items.end(); jt++)
1298 all_items.insert(*jt);
1299 if (!all_items.empty())
1300 {
1301 o->newline() << " ";
1302 o->line() << " with map_items ";
1303 for (set<map_item>::iterator it = all_items.begin();
1304 it != all_items.end(); it++)
1305 o->line() << *it << " ";
1306 }
1307
1308 if (accepts || !finalizer.empty())
1309 o->newline() << " ";
1310 #endif
1311
1312 if (accepts)
1313 o->line() << " accepts " << accept_outcome;
1314 if (!finalizer.empty())
1315 o->line() << " with finalizer {" << finalizer << "}";
1316
1317 // TODOXXX: factor this out to span::print()
1318 o->indent(1);
1319 for (list<span>::const_iterator it = spans.begin();
1320 it != spans.end(); it++)
1321 {
1322 o->newline() << "'";
1323 if (it->lb == it->ub)
1324 {
1325 print_escaped (o->line(), it->lb);
1326 o->line() << " ";
1327 }
1328 else
1329 {
1330 print_escaped (o->line(), it->lb);
1331 o->line() << "-";
1332 print_escaped (o->line(), it->ub);
1333 }
1334
1335 if (it->to != NULL)
1336 o->line() << "' -> " << it->to->label;
1337 else
1338 o->line() << "' -> <none>";
1339
1340 if (!it->action.empty())
1341 o->line() << " {" << it->action << "}";
1342 }
1343 o->newline(-1);
1344 }
1345
1346 void
1347 state::print (std::ostream &o) const
1348 {
1349 translator_output to(o); print(&to);
1350 }
1351
1352 std::ostream&
1353 operator << (std::ostream &o, const state *s)
1354 {
1355 s->print(o);
1356 return o;
1357 }
1358
1359 void
1360 dfa::print (translator_output *o) const
1361 {
1362 o->newline();
1363 for (state *s = first; s; s = s->next)
1364 {
1365 s->print(o);
1366 o->newline();
1367 }
1368 o->newline();
1369 }
1370
1371 void
1372 dfa::print (std::ostream& o) const
1373 {
1374 translator_output to(o); print(&to);
1375 }
1376
1377 std::ostream&
1378 operator << (std::ostream& o, const dfa& d)
1379 {
1380 d.print(o);
1381 return o;
1382 }
1383
1384 std::ostream&
1385 operator << (std::ostream &o, const dfa *d)
1386 {
1387 o << *d;
1388 return o;
1389 }
1390
1391 };
1392
1393 /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */
This page took 0.101404 seconds and 5 git commands to generate.