2 // Copyright (C) 2012-2018 Red Hat Inc.
4 // This file is part of systemtap, and is free software. You can
5 // redistribute it and/or modify it under the terms of the GNU General
6 // Public License (GPL); either version 2, or (at your option) any
11 // This file incorporates code from the re2c project; please see
12 // the file README.stapregex for details.
26 #include "translator-output.h"
29 #include "stapregex-parse.h"
30 #include "stapregex-tree.h"
31 #include "stapregex-dfa.h"
33 // Uncomment to show result of ins (NFA) compilation:
34 //#define STAPREGEX_DEBUG_INS
35 // Uncomment to emit DFA in a non-working compact format (use with -p3):
36 //#define STAPREGEX_DEBUG_DFA
37 // Uncomment to have the generated engine do a trace of visited states
38 // (only when testing using the standalone regtest module):
39 //#define STAPREGEX_DEBUG_MATCH
41 // Uncomment for a detailed walkthrough of the tagged-NFA conversion:
42 //#define STAPREGEX_DEBUG_TNFA
48 regexp
*pad_re
= NULL
;
49 regexp
*fail_re
= NULL
;
52 stapregex_compile (regexp
*re
, const std::string
& match_snippet
,
53 const std::string
& fail_snippet
)
56 // build regexp for ".*"
58 pad_re
= new close_op (pad_re
, true); // -- prefer shorter match
59 pad_re
= new alt_op (pad_re
, new null_op
, true); // -- prefer second match
61 if (fail_re
== NULL
) {
62 // build regexp for ".*$", but allow '\0' and support fail outcome
63 fail_re
= make_dot (true); // -- allow '\0'
64 fail_re
= new close_op (fail_re
, true); // -- prefer shorter match
65 fail_re
= new alt_op (fail_re
, new null_op
, true); // -- prefer second match
66 fail_re
= new cat_op (fail_re
, new anchor_op('$'));
67 fail_re
= new rule_op(fail_re
, 0);
68 // XXX: this approach creates one extra spurious-but-safe state
69 // (safe because the matching procedure stops after encountering '\0')
72 vector
<string
> outcomes(2);
73 outcomes
[0] = fail_snippet
;
74 outcomes
[1] = match_snippet
;
76 int num_tags
= re
->num_tags
;
78 // Pad & wrap re in appropriate rule_ops to control match behaviour:
79 bool anchored
= re
->anchored ();
80 if (!anchored
) re
= new cat_op(pad_re
, re
); // -- left-padding
81 re
= new rule_op(re
, 1);
82 re
= new alt_op(re
, fail_re
);
84 #ifdef STAPREGEX_DEBUG_INS
85 cerr
<< "RESULTING INS FROM REGEX " << re
<< ":" << endl
;
88 ins
*i
= re
->compile();
90 #ifdef STAPREGEX_DEBUG_INS
91 for (const ins
*j
= i
; (j
- i
) < (int)re
->ins_size() + 1; )
93 j
= show_ins(cerr
, j
, i
); cerr
<< endl
;
99 for (ins
*j
= i
; (j
- i
) < (int)re
->ins_size() + 1; )
102 if (j
->i
.tag
== CHAR
)
103 j
= (ins
*) j
->i
.link
;
108 #ifdef STAPREGEX_DEBUG_INS
109 cerr
<< "OPTIMIZED INS FROM THE SAME REGEX" << endl
;
110 for (const ins
*j
= i
; (j
- i
) < (int)re
->ins_size() + 1; )
112 j
= show_ins(cerr
, j
, i
); cerr
<< endl
;
117 dfa
*d
= new dfa(i
, num_tags
, outcomes
);
119 // Carefully deallocate temporary scaffolding:
120 if (!anchored
) delete ((rule_op
*) ((alt_op
*) re
)->a
)->re
; // -- new cat_op
121 delete ((alt_op
*) re
)->a
; // -- new rule_op
122 delete re
; // -- new alt_op
123 // NB: deleting a regular expression DOES NOT deallocate its
124 // children. The original re parameter is presumed to be retained
125 // indefinitely as part of a stapdfa table, or such....
130 // ------------------------------------------------------------------------
132 /* Now follows the heart of the tagged-DFA algorithm. This is a basic
133 implementation of the algorithm described in Ville Laurikari's
134 Masters thesis and summarized in the paper "NFAs with Tagged
135 Transitions, their Conversion to Deterministic Automata and
136 Application to Regular Expressions"
137 (http://laurikari.net/ville/spire2000-tnfa.pdf).
139 HERE BE DRAGONS (and not the friendly kind) */
141 /* Functions to deal with relative transition priorities: */
144 refine_higher(const arc_priority
& a
)
146 if (a
.first
> ULLONG_MAX
/4) // detect overflow
147 throw regex_error(_("arc_priority overflow due to excessive branching factor"), 0);
148 return make_pair(2 * a
.first
+ 1, a
.second
+ 1);
152 refine_lower (const arc_priority
& a
)
154 if (a
.first
> ULLONG_MAX
/4) // detect overflow
155 throw regex_error(_("arc_priority overflow due to excessive branching factor"), 0);
156 return make_pair(2 * a
.first
, a
.second
+ 1);
160 arc_compare (const arc_priority
& a
, const arc_priority
& b
)
162 unsigned long x
= a
.first
;
163 unsigned long y
= b
.first
;
165 if (a
.second
> b
.second
)
166 y
= y
<< (a
.second
- b
.second
);
167 else if (a
.second
< b
.second
)
168 x
= x
<< (b
.second
- a
.second
);
170 // Special case: 0/n </> 0/m iff m </> n.
171 // This is because such priorities are obtained by refine_lower().
172 if (x
== 0 && y
== 0)
173 return ( a
.second
== b
.second
? 0 : a
.second
< b
.second
? 1 : -1 );
175 return ( x
== y
? 0 : x
< y
? -1 : 1 );
178 /* Manage the linked list of states in a DFA: */
180 state::state (dfa
*owner
, state_kernel
*kernel
)
181 : owner(owner
), label(~0), next(NULL
), kernel(kernel
),
182 accepts(false), accept_outcome(0) {}
185 dfa::add_map_item (const map_item
&m
)
187 // TODOXXX: later, compute a mapping into a single-level tag_states array
188 // TODOXXX: could drop the +1 and instead subtract 1 in YYTAG macro
189 nmapitems
= max(nmapitems
, m
.second
) + 1;
193 dfa::add_state (state
*s
)
195 s
->label
= nstates
++;
212 /* Operations to build a simple kernel prior to taking closure: */
214 /* Create a new kernel_point in kernel with empty map items. */
216 add_kernel (state_kernel
*kernel
, ins
*i
)
220 point
.priority
= MAKE_START_PRIORITY
;
221 // NB: point->map_items is empty
223 kernel
->push_back(point
);
229 state_kernel
*kernel
= new state_kernel
;
230 add_kernel (kernel
, i
);
234 struct sort_priorities
{
235 bool operator ()(const arc_priority
&a
, const arc_priority
&b
)
237 return arc_compare(a
,b
) > 0;
241 struct sort_denominator
{
242 bool operator ()(const arc_priority
&a
, const arc_priority
&b
)
244 return a
.second
> b
.second
;
248 struct sort_kernel_points
{
249 bool operator ()(const kernel_point
&k
, const kernel_point
&l
)
251 return arc_compare(k
.priority
, l
.priority
) > 0;
255 // Move points from worklist to new_worklist while rebalancing priorities:
257 rebalance_priorities(stack
<kernel_point
> &worklist
, stack
<kernel_point
> &new_worklist
)
259 // Sort worklist in order of priority:
260 priority_queue
<kernel_point
, vector
<kernel_point
>, sort_kernel_points
> sorted_worklist
;
261 while (!worklist
.empty())
263 kernel_point point
= worklist
.top(); worklist
.pop();
264 sorted_worklist
.push(point
);
267 // Generate a 'clean' set of priorities:
268 priority_queue
<arc_priority
, vector
<arc_priority
>, sort_priorities
> sorted_priorities
;
269 priority_queue
<arc_priority
, vector
<arc_priority
>, sort_denominator
> new_priorities
;
270 new_priorities
.push(MAKE_START_PRIORITY
);
271 while (new_priorities
.size() < sorted_worklist
.size())
273 arc_priority new_priority
= new_priorities
.top(); new_priorities
.pop();
274 new_priorities
.push(refine_higher(new_priority
));
275 new_priorities
.push(refine_lower(new_priority
));
277 for (unsigned i
= 0; i
< sorted_worklist
.size(); i
++)
279 arc_priority new_priority
= new_priorities
.top(); new_priorities
.pop();
280 sorted_priorities
.push(new_priority
);
283 while (!sorted_worklist
.empty())
285 kernel_point point
= sorted_worklist
.top(); sorted_worklist
.pop();
286 arc_priority new_priority
= sorted_priorities
.top(); sorted_priorities
.pop();
287 point
.priority
= new_priority
;
288 new_worklist
.push(point
);
292 /* Compute the set of kernel_points that are 'tag-wise unambiguously
293 reachable' from a given initial set of points. Absent tagging, this
294 becomes a bog-standard NFA e_closure construction. */
296 te_closure (dfa
*dfa
, state_kernel
*start
, int ntags
, bool is_initial
= false)
298 state_kernel
*closure
= new state_kernel(*start
);
299 stack
<kernel_point
> base_worklist
; // -- with old priorities
300 stack
<kernel_point
> worklist
; // -- with rebalanced priorities
301 // XXX: state_kernel is a list<kernel_point> so we avoid iterator
302 // invalidation and make a new copy of each kernel_point from start
304 /* To avoid searching through closure incessantly when retrieving
305 information about existing elements, the following caches are
307 vector
<unsigned> max_tags (ntags
, 0);
308 map
<ins
*, list
<list
<kernel_point
>::iterator
> > closure_map
;
310 /* Cache initial elements of closure: */
311 for (state_kernel::iterator it
= closure
->begin();
312 it
!= closure
->end(); it
++)
315 cerr
<< "**DEBUG** initial closure point ";
316 it
->print(cerr
, dfa
->orig_nfa
);
320 base_worklist
.push(*it
); // -- push with existing priority, rebalance later
322 // Store the element in relevant caches:
324 for (list
<map_item
>::const_iterator jt
= it
->map_items
.begin();
325 jt
!= it
->map_items
.end(); jt
++)
326 max_tags
[jt
->first
] = max(jt
->second
, max_tags
[jt
->first
]);
328 closure_map
[it
->i
].push_back(it
);
331 // PR23608: Retaining the priority from the previous state has the
332 // potential to overflow the arc_priority representation with large
333 // numbers when there are many distinct DFA states. This should
334 // cause an explicit assertion failure if it occurs in practice (see
335 // refine_*()), e.g. with long non-branching regexes such as
338 // Fixed by adding an explicit step to rebalance priorities:
339 rebalance_priorities(base_worklist
, worklist
);
341 while (!worklist
.empty())
343 kernel_point point
= worklist
.top(); worklist
.pop();
345 // Identify e-transitions depending on the opcode.
346 // There are at most two e-transitions emerging from an insn.
347 // If we have two e-transitions, the 'other' has higher priority.
349 ins
*target
= NULL
; int tag
= -1;
350 ins
*other_target
= NULL
; int other_tag
= -1;
352 bool do_split
= false;
354 if (point
.i
->i
.tag
== TAG
)
356 target
= &point
.i
[1];
357 tag
= (int) point
.i
->i
.param
;
359 else if (point
.i
->i
.tag
== FORK
&& point
.i
== (ins
*) point
.i
->i
.link
)
361 /* Workaround for a FORK that points to itself: */
362 target
= &point
.i
[1];
364 else if (point
.i
->i
.tag
== FORK
)
367 // Relative priority of two e-transitions depends on param:
368 if (point
.i
->i
.param
)
370 // Prefer jumping to link.
371 target
= &point
.i
[1];
372 other_target
= (ins
*) point
.i
->i
.link
;
376 // Prefer stepping to next instruction.
377 target
= (ins
*) point
.i
->i
.link
;
378 other_target
= &point
.i
[1];
381 else if (point
.i
->i
.tag
== GOTO
)
383 target
= (ins
*) point
.i
->i
.link
;
385 else if (point
.i
->i
.tag
== INIT
&& is_initial
)
387 target
= &point
.i
[1];
392 // Data for the endpoint of the first transition:
395 next
.priority
= do_split
? refine_lower(point
.priority
) : point
.priority
;
396 next
.map_items
= point
.map_items
;
398 // Date for the endpoint of the second transition:
399 kernel_point other_next
;
400 other_next
.i
= other_target
;
401 other_next
.priority
= do_split
? refine_higher(point
.priority
) : point
.priority
;
402 other_next
.map_items
= point
.map_items
;
404 // Do infinite-loop-check:
405 other_next
.parents
= point
.parents
;
406 if (point
.parents
.find(other_next
.i
) != point
.parents
.end())
411 other_next
.parents
.insert(other_next
.i
);
413 next
.parents
= point
.parents
;
414 if (point
.parents
.find(next
.i
) != point
.parents
.end())
418 // <- XXX will be overwritten by other_target / other_tag immediately
421 next
.parents
.insert(next
.i
);
427 // Deal with the current e-transition:
431 /* Delete all existing next.map_items of the form m[tag,x]. */
432 for (list
<map_item
>::iterator it
= next
.map_items
.begin();
433 it
!= next
.map_items
.end(); )
434 if (it
->first
== (unsigned) tag
)
436 list
<map_item
>::iterator next_it
= it
;
438 next
.map_items
.erase (it
);
443 /* Add m[tag,x] to next.map_items, where x is the smallest
444 nonnegative integer such that m[tag,x] does not occur
445 anywhere in closure. Then update the cache. */
446 unsigned x
= max_tags
[tag
];
447 next
.map_items
.push_back(make_pair(tag
, ++x
));
451 /* Deal with similar transitions that have a different priority: */
452 already_found
= false;
453 for (list
<list
<kernel_point
>::iterator
>::iterator it
454 = closure_map
[next
.i
].begin();
455 it
!= closure_map
[next
.i
].end(); )
457 // NB: it is an iterator into closure_map[next.i],
458 // while *it is an iterator into closure
460 int result
= arc_compare(next
.priority
, (*it
)->priority
);
463 ins
*base
= dfa
->orig_nfa
;
464 cerr
<< "stapregex **UNEXPECTED** -- identical arc_priorities for ";
465 (*it
)->print(cerr
, base
);
467 next
.print(cerr
, base
);
471 // XXX This is an experimental solution which did not work correctly.
472 if (result
== 0 && (*it
)->i
== next
.i
)
474 // Reached the same kernel_point via two alternate
475 // (equal priority) paths. Merge map_items from next into *it:
476 cerr
<< "**DEBUG** (merging paths for same ins)" << endl
;
477 for (list
<map_item
>::iterator jt
= next
.map_items
.begin();
478 jt
!= next
.map_items
.end(); jt
++)
479 (*it
)->map_items
.push_back(*jt
);
483 assert (result
!= 0); // Expect this to fail.
485 if (result
> 0) { // i.e. next.priority > (*it)->priority
487 ins
*base
= dfa
->orig_nfa
;
488 cerr
<< "**DEBUG** erasing ";
489 (*it
)->print(cerr
, base
);
490 cerr
<< " in favour of ";
491 next
.print(cerr
, base
);
495 // next.priority is higher, delete existing element
498 // obnoxious shuffle to avoid iterator invalidation
499 list
<list
<kernel_point
>::iterator
>::iterator old_it
= it
;
501 closure_map
[next
.i
].erase(old_it
);
503 } else { // result <= 0
504 // next.priority is lower, skip adding next
505 already_found
= true;
511 if (!already_found
) {
513 cerr
<< "**DEBUG** added to closure: ";
514 next
.print(cerr
, dfa
->orig_nfa
);
518 // Store the element in closure:
519 closure
->push_back(next
);
522 // Store the element in relevant caches:
524 list
<kernel_point
>::iterator next_it
= closure
->end();
525 next_it
--; // XXX rewind to just-pushed element
526 closure_map
[next
.i
].push_back(next_it
);
528 for (list
<map_item
>::iterator jt
= next
.map_items
.begin();
529 jt
!= next
.map_items
.end(); jt
++)
530 max_tags
[jt
->first
] = max(jt
->second
, max_tags
[jt
->first
]);
535 // Now move to dealing with the second e-transition, if any.
536 target
= other_target
; other_target
= NULL
;
537 tag
= other_tag
; other_tag
= -1;
540 goto another_transition
;
546 /* Helpers for constructing span table: */
549 same_ins(list
<kernel_point
> &e1
, list
<kernel_point
> &e2
)
552 for (list
<kernel_point
>::iterator it
= e1
.begin();
553 it
!= e1
.end(); it
++)
556 for (list
<kernel_point
>::iterator it
= e2
.begin();
557 it
!= e2
.end(); it
++)
562 /* Helpers for constructing TDFA actions: */
564 /* Find the set of reordering commands (if any) that will get us from
565 state s to some existing state in the dfa (returns the state in
566 question, appends reordering commands to r). Returns NULL is no
567 suitable state is found. */
569 dfa::find_equivalent (state
*s
, tdfa_action
&action
)
571 state
*answer
= NULL
;
573 for (state_kernel::iterator it
= s
->kernel
->begin();
574 it
!= s
->kernel
->end(); it
++)
577 /* Check kernels of existing states for size equivalence and for
578 unmarked items (similar to re2c's original algorithm): */
579 unsigned n
= s
->kernel
->size();
580 map
<map_item
, map_item
> shift_map
;
581 map
<map_item
, map_item
> shift_back
;
582 for (state
*t
= first
; t
!= NULL
; t
= t
->next
)
584 if (t
->kernel
->size() == n
)
586 for (state_kernel::iterator it
= t
->kernel
->begin();
587 it
!= t
->kernel
->end(); it
++)
591 // Check for existence of a reordering tdfa_action r that will
592 // produce identical kernel_points with identical map values.
594 // XXX In the below code, we search for more-or-less an
595 // arbitrary permutation of map values.
597 // To simplify the algorithm, we could instead only check
598 // where lower-index map values are missing from s and
599 // replace them with higher-index map values. The paper
600 // claims this leads to only a slight penalty in number of
603 // Mapping must be one-to-one; check consistency in both directions:
604 shift_map
.clear(); // map item of s -> map item of t
605 shift_back
.clear(); // map item of t -> map item of s
607 for (state_kernel::iterator it
= s
->kernel
->begin();
608 it
!= s
->kernel
->end(); it
++)
610 kernel_point
*kp1
= &*it
;
611 kernel_point
*kp2
= 0;
613 // Find matching kernel_point in t:
614 bool found_kp
= false;
615 for (state_kernel::iterator jt
= t
->kernel
->begin();
616 jt
!= t
->kernel
->end(); jt
++)
619 // XXX check that ins appears only once
621 kp2
= &*jt
; // TODO found matching point
627 for (list
<map_item
>::iterator jt
= kp1
->map_items
.begin();
628 jt
!= kp1
->map_items
.end(); jt
++)
633 // XXX check that tag appears only once
634 assert (seen_tags
.count(mt1
.first
) == 0);
635 seen_tags
.insert(mt1
.first
);
637 // Find matching map_item in kp2
638 bool found_tag
= false;
639 for (list
<map_item
>::iterator kt
= kp2
->map_items
.begin();
640 kt
!= kp2
->map_items
.end(); kt
++)
641 if (mt1
.first
== kt
->first
)
643 // XXX check that tag appears only once
649 if (!found_tag
) // if no matching tag, can't use this state
651 if (shift_map
.count(mt1
) != 0
652 && shift_map
[mt1
] != mt2
) // if contradiction
654 if (shift_back
.count(mt2
) != 0
655 && shift_back
[mt2
] != mt1
) // if contradiction
658 shift_map
[mt1
] = mt2
;
659 shift_back
[mt2
] = mt1
;
662 // XXX check that every tag in kp2 appears in seen_tag
663 for (list
<map_item
>::iterator jt
= kp2
->map_items
.begin();
664 jt
!= kp2
->map_items
.end(); jt
++)
667 if (seen_tags
.count(t2
) == 0)
672 // #ifdef STAPREGEX_DEBUG_TNFA
673 // cerr << " -*- PRE CYCLE CHECK DEBUG obtained valid reorder ";
674 // for (map<map_item, map_item>::iterator it = shift_map.begin();
675 // it != shift_map.end(); it++)
676 // if (it->first != it->second)
677 // cerr << it->first << "=>" << it->second << " ";
678 // cerr << "to existing state " << t->label << endl;
682 // Check for cyclical dependencies in the resulting reorder.
683 // XXX: If we find a cycle, just create a new state. We could
684 // also break the cycle with a temporary variable.
685 set
<map_item
> cycle_okay
; cycle_okay
.clear();
686 set
<map_item
> cycle_seen
; cycle_seen
.clear();
687 for (map
<map_item
, map_item
>::iterator it
= shift_map
.begin();
688 it
!= shift_map
.end(); it
++)
690 map_item m
= it
->first
;
691 if (cycle_okay
.count(m
) != 0)
692 continue; // -- already checked for cycle
694 while (shift_map
.count(m
) != 0 && shift_map
[m
] != m
)
696 if (cycle_okay
.count(shift_map
[m
]) != 0)
697 break; // -- found not-cycle
698 if (cycle_seen
.count(shift_map
[m
]) != 0)
699 goto next_state
; // -- found cycle
700 cycle_seen
.insert(m
);
704 // If we reach the end of the chain, or find a map item
705 // where shift_map[m] == m, this is not considered a
706 // cycle, and therefore none of the map items leading to
707 // here are in cycles:
708 cycle_okay
.insert(m
);
709 for (set
<map_item
>::iterator jt
= cycle_seen
.begin();
710 jt
!= cycle_seen
.end(); jt
++)
711 cycle_okay
.insert(*jt
);
716 #ifdef STAPREGEX_DEBUG_TNFA
717 cerr
<< " -*- obtained valid reorder ";
718 for (map
<map_item
, map_item
>::iterator it
= shift_map
.begin();
719 it
!= shift_map
.end(); it
++)
720 if (it
->first
!= it
->second
)
721 cerr
<< it
->first
<< "=>" << it
->second
<< " ";
722 cerr
<< "to existing state " << t
->label
<< endl
;
725 // Generate reordering command based on the contents of shift_map:
727 set
<map_item
> saved
; saved
.clear(); // <- elts safe to overwite
728 queue
<map_item
> to_shift
;
729 for (map
<map_item
, map_item
>::iterator it
= shift_back
.begin();
730 it
!= shift_back
.end(); it
++)
731 if (it
->first
!= it
->second
) // skip trivial shifts
732 to_shift
.push(it
->first
);
733 while (!to_shift
.empty())
735 map_item elt
= to_shift
.front(); to_shift
.pop();
736 if (shift_map
.count(elt
) != 0 && saved
.count(elt
) == 0)
738 // Need to save it first -- put back on queue:
745 insn
.from
= shift_back
[elt
];
746 insn
.save_tag
= false;
747 insn
.save_pos
= false;
750 // shift_back[elt] is now safe to overwrite
751 saved
.insert(shift_back
[elt
]);
755 action
.insert(action
.end(), r
.begin(), r
.end()); // XXX append
763 for (state_kernel::iterator it
= s
->kernel
->begin();
764 it
!= s
->kernel
->end(); it
++)
770 /* Generate position-save commands for any map items in new_k that do
771 not appear in old_k (old_k can be NULL). */
773 dfa::compute_action (state_kernel
*old_k
, state_kernel
*new_k
)
777 set
<map_item
> old_items
;
779 for (state_kernel::const_iterator it
= old_k
->begin();
780 it
!= old_k
->end(); it
++)
781 for (list
<map_item
>::const_iterator jt
= it
->map_items
.begin();
782 jt
!= it
->map_items
.end(); jt
++)
783 old_items
.insert(*jt
);
785 // XXX: use a set, since we only need one position-save per new map item
786 set
<map_item
> store_items
;
787 for (state_kernel::const_iterator it
= new_k
->begin();
788 it
!= new_k
->end(); it
++)
789 for (list
<map_item
>::const_iterator jt
= it
->map_items
.begin();
790 jt
!= it
->map_items
.end(); jt
++)
791 if (old_items
.find(*jt
) == old_items
.end())
792 store_items
.insert(*jt
);
794 for (set
<map_item
>::iterator it
= store_items
.begin();
795 it
!= store_items
.end(); it
++)
797 // ensure room for m[i,n] is present in tag_states:
800 // append m[i,n] <- <curr position> to c
803 insn
.save_tag
= false;
804 insn
.save_pos
= true;
812 dfa::compute_finalizer (state
*s
)
814 // TODO VERIFY THAT THIS WORKS -- CAN THERE BE CONFLICTS?
816 assert (s
->accept_kp
!= NULL
);
818 // iterate map items m[i,j]
819 for (list
<map_item
>::iterator it
= s
->accept_kp
->map_items
.begin();
820 it
!= s
->accept_kp
->map_items
.end(); it
++)
822 // append t[i] <- m[i,j] to c
825 insn
.save_tag
= true;
826 insn
.save_pos
= false;
833 /* The main DFA-construction algorithm: */
835 dfa::dfa (ins
*i
, int ntags
, vector
<string
>& outcome_snippets
,
837 : orig_nfa(i
), nstates(0), nmapitems(0), ntags(ntags
),
838 outcome_snippets(outcome_snippets
), success_outcome(accept_outcome
)
840 #ifdef STAPREGEX_DEBUG_TNFA
841 cerr
<< "DFA CONSTRUCTION (ntags=" << ntags
<< "):" << endl
;
844 // XXX: Longest-match priority requires one success and one failure outcome:
847 assert(outcome_snippets
.size() == 2);
848 assert(success_outcome
== 1);
852 /* Initialize empty linked list of states: */
856 state_kernel
*seed_kernel
= make_kernel(start
);
857 state_kernel
*initial_kernel
= te_closure(this, seed_kernel
, ntags
, true);
859 state
*initial
= add_state(new state(this, initial_kernel
));
860 queue
<state
*> worklist
; worklist
.push(initial
);
862 initializer
= compute_action(NULL
, initial_kernel
);
863 #ifdef STAPREGEX_DEBUG_TNFA
864 cerr
<< " - constructed initializer " << initializer
<< endl
<< endl
;
867 while (!worklist
.empty())
869 state
*curr
= worklist
.front(); worklist
.pop();
871 // Kernel points before and after each edge:
872 vector
<list
<kernel_point
> > edge_begin(NUM_REAL_CHARS
);
873 vector
<list
<kernel_point
> > edge_end(NUM_REAL_CHARS
);
875 /* Using the CHAR instructions in kernel, build the initial
876 table of spans for curr. Also check for final states. */
878 for (list
<kernel_point
>::iterator it
= curr
->kernel
->begin();
879 it
!= curr
->kernel
->end(); it
++)
881 if (it
->i
->i
.tag
== CHAR
)
883 // Add a new kernel_point for each targeted insn:
884 for (ins
*j
= &it
->i
[1]; j
< (ins
*) it
->i
->i
.link
; j
++)
886 // XXX: deallocate together with span table
888 point
.i
= (ins
*) it
->i
->i
.link
;
889 point
.priority
= it
->priority
;
890 point
.map_items
= it
->map_items
; // copy map items
892 edge_begin
[j
->c
.value
].push_back(*it
);
893 edge_end
[j
->c
.value
].push_back(point
);
896 else if (it
->i
->i
.tag
== ACCEPT
)
898 /* In case of multiple accepting NFA states,
899 prefer the highest numbered outcome.
901 XXX: A possible refinement (commented-out).
902 In case of NFA states with identical outcomes
903 pick the one with the highest arc_priority. */
904 if (!curr
->accepts
|| it
->i
->i
.param
> curr
->accept_outcome
905 /* || arc_compare(it->priority, curr->accept_kp->priority) > 0 */)
907 curr
->accept_kp
= &*it
;
908 curr
->accept_outcome
= it
->i
->i
.param
;
910 curr
->accepts
= true;
914 /* If the state was marked as accepting, add a finalizer: */
917 assert(curr
->finalizer
.empty()); // XXX: only process a state once
918 curr
->finalizer
= compute_finalizer(curr
);
921 for (unsigned c
= 0; c
< NUM_REAL_CHARS
; )
923 list
<kernel_point
> eb
= edge_begin
[c
];
924 list
<kernel_point
> ee
= edge_end
[c
];
925 assert (!ee
.empty()); // XXX: ensured by fail_re in stapregex_compile
931 while (++c
< NUM_REAL_CHARS
&& same_ins(edge_end
[c
], ee
)) ;
935 s
.reach_pairs
= new state_kernel
;
936 s
.jump_pairs
= new state_kernel
;
938 for (list
<kernel_point
>::iterator it
= eb
.begin();
939 it
!= eb
.end(); it
++)
940 s
.jump_pairs
->push_back(*it
);
941 for (list
<kernel_point
>::iterator it
= ee
.begin();
942 it
!= ee
.end(); it
++)
943 s
.reach_pairs
->push_back(*it
);
945 curr
->spans
.push_back(s
);
948 /* For each of the spans in curr, determine the reachable
949 points assuming a character in the span. */
950 #ifdef STAPREGEX_DEBUG_TNFA
951 cerr
<< "building transitions for state " << curr
->label
<< ":" << endl
;
953 for (list
<span
>::iterator it
= curr
->spans
.begin();
954 it
!= curr
->spans
.end(); it
++)
956 /* Set up candidate target state: */
957 state_kernel
*u_pairs
= te_closure(this, it
->reach_pairs
, ntags
);
958 state
*target
= new state(this, u_pairs
);
960 /* Generate position-save commands for any map items
961 that do not appear in the edge: */
962 tdfa_action c
= compute_action(it
->jump_pairs
, u_pairs
);
964 /* If there is a state t_prime in states such that some
965 sequence of reordering commands r produces t_prime
966 from target, use t_prime as the target state,
967 appending the reordering commands to c. */
968 state
*t_prime
= find_equivalent(target
, c
);
971 assert (t_prime
!= target
);
976 /* We need to actually add target to the dfa: */
979 worklist
.push(t_prime
);
980 #ifdef STAPREGEX_DEBUG_TNFA
981 cerr
<< " -*- add new state " << t_prime
->label
<< endl
;
985 /* Set the transition: */
989 #ifdef STAPREGEX_DEBUG_TNFA
990 cerr
<< " -> constructed " << curr
<< endl
;
993 #ifdef STAPREGEX_DEBUG_TNFA
1010 // ------------------------------------------------------------------------
1013 span::emit_jump (translator_output
*o
, const dfa
*d
) const
1015 #ifdef STAPREGEX_DEBUG_MATCH
1016 o
->newline () << "_stp_printf(\" --> @%ld GOTO yystate%d\\n\", "
1017 << "YYLENGTH, " << to
->label
<< ");";
1018 o
->newline () << "_stp_print_flush();";
1027 // We record map_items *after* consuming YYCURSOR:
1028 o
->newline () << "YYCURSOR++;";
1029 d
->emit_action(o
, action
);
1030 o
->newline () << "goto yystate" << to
->label
<< ";";
1033 /* Assuming the target DFA state of the span is a final state, emit code to
1034 cleanup tags and (if appropriate) exit with a final answer. */
1036 span::emit_final (translator_output
*o
, const dfa
*d
) const
1038 assert (to
->accepts
); // XXX: must guarantee correct usage of emit_final()
1040 // We record map_items *after* consuming YYCURSOR:
1041 o
->newline () << "YYCURSOR++;";
1042 d
->emit_action(o
, action
);
1044 // XXX: Note that condition to->finalizer.empty() is only
1045 // appropriate for the two-outcome scheme with one outcome being a
1047 if (d
->ntags
== 0 || to
->finalizer
.empty()) // terminate immediately
1049 d
->emit_action(o
, to
->finalizer
);
1052 o
->newline() << d
->outcome_snippets
[to
->accept_outcome
];
1053 o
->newline() << "goto yyfinish;";
1057 // Need to return the outcome associated with the longest match:
1058 o
->newline() << "if ( YYFINAL(0) >= 0 ) {";
1059 o
->newline(1) << d
->outcome_snippets
[d
->success_outcome
];
1060 o
->newline(-1) << "} else {";
1061 o
->newline(1) << d
->outcome_snippets
[d
->fail_outcome
];
1062 o
->newline(-1) << "}";
1063 o
->newline() << "goto yyfinish;";
1068 // Ensure longest-match priority by comparing length + start coord:
1069 map_item new_tag_0
; bool found
= false;
1070 for (tdfa_action::iterator it
= to
->finalizer
.begin();
1071 it
!= to
->finalizer
.end(); it
++)
1072 // TODOXXX: Only works if finalizer only contains reordering commands
1073 // (perhaps make that into an explicitly checked condition?)
1074 if (it
->save_tag
&& it
->from
.first
== 0)
1076 new_tag_0
= it
->from
; // the map_item saved to tag 0
1080 #define NEW_TAG_0 "YYTAG(" << new_tag_0.first << "," << new_tag_0.second << ")"
1081 // if (new_tag_0 == old_tag_0 && new_length > old_length) emit action;
1082 o
->newline() << "if ( YYFINAL(0) < 0 || "
1083 << "(" << NEW_TAG_0
<< " == YYFINAL(0) &&";
1084 o
->newline() << " (YYLENGTH - " << NEW_TAG_0
<< ")"
1085 << " > (YYFINAL(1) - YYFINAL(0)))) {";
1086 o
->newline(1); d
->emit_action(o
, to
->finalizer
);
1088 o
->newline() << "}";
1090 o
->newline () << "goto yystate" << to
->label
<< ";";
1094 string
c_char(rchar c
)
1098 print_escaped(o
, c
);
1104 state::emit (translator_output
*o
, const dfa
*d
) const
1106 o
->newline() << "yystate" << label
<< ": ";
1107 #ifdef STAPREGEX_DEBUG_MATCH
1108 o
->newline () << "_stp_printf(\"@%ld READ '%s' %c\", "
1109 << "YYLENGTH, cur, *YYCURSOR);";
1110 o
->newline () << "_stp_print_flush();";
1112 o
->newline() << "switch (*YYCURSOR) {";
1114 const span
*default_span
= NULL
;
1115 for (list
<span
>::const_iterator it
= spans
.begin();
1116 it
!= spans
.end(); it
++)
1118 // If we see a '\0', go immediately into an accept state:
1121 o
->newline() << "case " << c_char('\0') << ":";
1122 it
->emit_final(o
, d
);
1125 // Emit labels to handle all the other elements of the span:
1126 for (unsigned c
= max((rchar
) '\1', it
->lb
);
1127 c
<= (unsigned) it
->ub
; c
++) {
1130 default_span
= &(*it
);
1131 continue; // XXX: not an ASCII char, needs special handling
1133 o
->newline() << "case " << c_char((rchar
) c
) << ":";
1135 it
->emit_jump(o
, d
);
1137 // TODOXXX 'default' option should handle the largest span
1138 // TODOXXX optimize by accepting before end of string whenever possible
1142 // Handle a non-ASCII (unknown) char:
1143 o
->newline() << "default:";
1144 default_span
->emit_jump(o
, d
);
1146 o
->newline(-1) << "}";
1150 dfa::emit (translator_output
*o
) const
1152 #ifdef STAPREGEX_DEBUG_DFA
1155 o
->newline() << "{";
1161 o
->newline() << "unsigned int i;";
1162 o
->newline() << "for (i = 0; i < STAPREGEX_MAX_TAG; i++)";
1163 o
->newline(1) << "YYFINAL(i) = -1;";
1167 emit_action(o
, initializer
);
1171 emit_action(o
, first
->finalizer
);
1173 if (first
->accepts
&& ntags
== 0) // XXX workaround for empty regex
1175 o
->newline() << outcome_snippets
[first
->accept_outcome
];
1176 o
->newline() << "goto yyfinish;";
1179 for (state
*s
= first
; s
; s
= s
->next
)
1182 o
->newline() << "yyfinish: ;";
1183 o
->newline(-1) << "}";
1188 dfa::emit_action (translator_output
*o
, const tdfa_action
&act
) const
1190 #ifdef STAPREGEX_DEBUG_MATCH
1191 o
->newline () << "_stp_printf(\" --> @%ld, SET_TAG %s\\n\", "
1192 << "YYLENGTH, \"" << act
<< "\");";
1193 o
->newline () << "_stp_print_flush();";
1195 for (tdfa_action::const_iterator it
= act
.begin(); it
!= act
.end(); it
++)
1198 o
->newline() << "YYFINAL(" << it
->from
.first
<< ") = ";
1200 o
->newline() << "YYTAG(" << it
->to
.first
1201 << "," << it
->to
.second
<< ") = ";
1203 o
->line() << "YYLENGTH";
1205 o
->line() << "YYTAG(" << it
->from
.first
1206 << "," << it
->from
.second
<< ")";
1212 dfa::emit_tagsave (translator_output
*o
, std::string
,
1213 std::string
, std::string num_final_tags
) const
1215 // TODOXXX: ignoring other two snippets (tag_states and tag_vals),
1216 // which are handled by the earlier code in the actual matcher.
1217 o
->newline() << num_final_tags
<< " = " << ntags
<< ";";
1220 // ------------------------------------------------------------------------
1223 operator << (std::ostream
&o
, const map_item
& m
)
1225 o
<< "m[" << m
.first
<< "," << m
.second
<< "]";
1230 operator << (std::ostream
&o
, const tdfa_action
& a
)
1232 for (list
<tdfa_insn
>::const_iterator it
= a
.begin();
1233 it
!= a
.end(); it
++)
1235 if (it
!= a
.begin()) o
<< "; ";
1238 o
<< "t[" << it
->from
.first
<< "] <- ";
1240 o
<< it
->to
<< " <- ";
1252 operator << (std::ostream
&o
, const arc_priority
& p
)
1254 o
<< p
.first
<< "/" << (1 << p
.second
);
1259 kernel_point::print (std::ostream
&o
, ins
*base
) const
1262 o
<< "[" << priority
<< "]";
1263 if (!map_items
.empty())
1266 for (list
<map_item
>::const_iterator it
= map_items
.begin();
1267 it
!= map_items
.end(); it
++)
1269 if (it
!= map_items
.begin()) o
<< ",";
1276 state::print (translator_output
*o
) const
1278 o
->line() << "state " << label
;
1280 #ifdef STAPREGEX_DEBUG_TNFA
1281 // For debugging, also show the kernel:
1282 ins
*base
= owner
->orig_nfa
;
1283 o
->line() << " w/kernel {";
1284 for (state_kernel::iterator it
= kernel
->begin();
1285 it
!= kernel
->end(); it
++)
1287 if (it
!= kernel
->begin()) o
->line() << "; ";
1288 it
->print(o
->line(), base
);
1292 // Also print information for constructing reorderings:
1293 set
<map_item
> all_items
;
1294 for (state_kernel::iterator it
= kernel
->begin();
1295 it
!= kernel
->end(); it
++)
1296 for (list
<map_item
>::iterator jt
= it
->map_items
.begin();
1297 jt
!= it
->map_items
.end(); jt
++)
1298 all_items
.insert(*jt
);
1299 if (!all_items
.empty())
1301 o
->newline() << " ";
1302 o
->line() << " with map_items ";
1303 for (set
<map_item
>::iterator it
= all_items
.begin();
1304 it
!= all_items
.end(); it
++)
1305 o
->line() << *it
<< " ";
1308 if (accepts
|| !finalizer
.empty())
1309 o
->newline() << " ";
1313 o
->line() << " accepts " << accept_outcome
;
1314 if (!finalizer
.empty())
1315 o
->line() << " with finalizer {" << finalizer
<< "}";
1317 // TODOXXX: factor this out to span::print()
1319 for (list
<span
>::const_iterator it
= spans
.begin();
1320 it
!= spans
.end(); it
++)
1322 o
->newline() << "'";
1323 if (it
->lb
== it
->ub
)
1325 print_escaped (o
->line(), it
->lb
);
1330 print_escaped (o
->line(), it
->lb
);
1332 print_escaped (o
->line(), it
->ub
);
1336 o
->line() << "' -> " << it
->to
->label
;
1338 o
->line() << "' -> <none>";
1340 if (!it
->action
.empty())
1341 o
->line() << " {" << it
->action
<< "}";
1347 state::print (std::ostream
&o
) const
1349 translator_output
to(o
); print(&to
);
1353 operator << (std::ostream
&o
, const state
*s
)
1360 dfa::print (translator_output
*o
) const
1363 for (state
*s
= first
; s
; s
= s
->next
)
1372 dfa::print (std::ostream
& o
) const
1374 translator_output
to(o
); print(&to
);
1378 operator << (std::ostream
& o
, const dfa
& d
)
1385 operator << (std::ostream
&o
, const dfa
*d
)
1393 /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */