]>
sourceware.org Git - systemtap.git/blob - stapregex-dfa.cxx
2 // Copyright (C) 2012-2013 Red Hat Inc.
4 // This file is part of systemtap, and is free software. You can
5 // redistribute it and/or modify it under the terms of the GNU General
6 // Public License (GPL); either version 2, or (at your option) any
11 // This file incorporates code from the re2c project; please see
12 // the file README.stapregex for details.
24 #include "translator-output.h"
26 #include "stapregex-parse.h"
27 #include "stapregex-tree.h"
28 #include "stapregex-dfa.h"
30 // Uncomment to show result of ins (NFA) compilation:
31 //#define STAPREGEX_DEBUG_INS
32 // Uncomment to display result of DFA compilation in a compact format:
33 //#define STAPREGEX_DEBUG_DFA
34 // Uncomment to have the generated engine do a trace of visited states:
35 //#define STAPREGEX_DEBUG_MATCH
41 regexp
*pad_re
= NULL
;
42 regexp
*fail_re
= NULL
;
45 stapregex_compile (regexp
*re
, const std::string
& match_snippet
,
46 const std::string
& fail_snippet
)
49 // build regexp for ".*"
51 pad_re
= new close_op (pad_re
, true); // -- prefer shorter match
52 pad_re
= new alt_op (pad_re
, new null_op
, true); // -- prefer second match
54 if (fail_re
== NULL
) {
55 // build regexp for ".*$", but allow '\0' and support fail outcome
56 fail_re
= make_dot (true); // -- allow '\0'
57 fail_re
= new close_op (fail_re
, true); // -- prefer shorter match
58 fail_re
= new alt_op (fail_re
, new null_op
, true); // -- prefer second match
59 fail_re
= new cat_op (fail_re
, new anchor_op('$'));
60 fail_re
= new rule_op(fail_re
, 0);
61 // XXX: this approach creates one extra spurious-but-safe state
62 // (safe because the matching procedure stops after encountering '\0')
65 vector
<string
> outcomes(2);
66 outcomes
[0] = fail_snippet
;
67 outcomes
[1] = match_snippet
;
69 int num_tags
= re
->num_tags
;
71 // Pad & wrap re in appropriate rule_ops to control match behaviour:
72 bool anchored
= re
->anchored ();
73 if (!anchored
) re
= new cat_op(pad_re
, re
); // -- left-padding
74 re
= new rule_op(re
, 1);
75 re
= new alt_op(re
, fail_re
);
77 #ifdef STAPREGEX_DEBUG_INS
78 cerr
<< "RESULTING INS FROM REGEX " << re
<< ":" << endl
;
81 ins
*i
= re
->compile();
83 #ifdef STAPREGEX_DEBUG_INS
84 for (const ins
*j
= i
; (j
- i
) < re
->ins_size() + 1; )
86 j
= show_ins(cerr
, j
, i
); cerr
<< endl
;
91 // TODOXXX optimize ins as in re2c
93 dfa
*d
= new dfa(i
, num_tags
, outcomes
);
95 // Carefully deallocate temporary scaffolding:
96 if (!anchored
) delete ((rule_op
*) ((alt_op
*) re
)->a
)->re
; // -- new cat_op
97 delete ((alt_op
*) re
)->a
; // -- new rule_op
98 delete re
; // -- new alt_op
99 // NB: deleting a regular expression DOES NOT deallocate its
100 // children. The original re parameter is presumed to be retained
101 // indefinitely as part of a stapdfa table, or such....
106 // ------------------------------------------------------------------------
108 /* Now follows the heart of the tagged-DFA algorithm. This is a basic
109 implementation of the algorithm described in Ville Laurikari's
110 Masters thesis and summarized in the paper "NFAs with Tagged
111 Transitions, their Conversion to Deterministic Automata and
112 Application to Regular Expressions"
113 (http://laurikari.net/ville/spire2000-tnfa.pdf).
115 TODOXXX: The following does not contain a fully working
116 implementation of the tagging support, but only of the regex
119 HERE BE DRAGONS (and not the friendly kind) */
121 /* Functions to deal with relative transition priorities: */
124 refine_higher(const arc_priority
& a
)
126 return make_pair(2 * a
.first
+ 1, a
.second
+ 1);
130 refine_lower (const arc_priority
& a
)
132 return make_pair(2 * a
.first
, a
.second
+ 1);
136 arc_compare (const arc_priority
& a
, const arc_priority
& b
)
138 unsigned long x
= a
.first
;
139 unsigned long y
= b
.first
;
141 if (a
.second
> b
.second
)
142 x
= x
<< (a
.second
- b
.second
);
143 else if (a
.second
< b
.second
)
144 y
= y
<< (b
.second
- a
.second
);
146 return ( x
== y
? 0 : x
< y
? -1 : 1 );
149 /* Manage the linked list of states in a DFA: */
151 state::state (state_kernel
*kernel
)
152 : label(~0), next(NULL
), kernel(kernel
), accepts(false), accept_outcome(0) {}
155 dfa::add_state (state
*s
)
157 s
->label
= nstates
++;
174 /* Operations to build a simple kernel prior to taking closure: */
177 add_kernel (state_kernel
*kernel
, ins
*i
)
181 point
.priority
= make_pair(0,0);
182 // NB: point->map_items is empty
184 kernel
->push_back(point
);
190 state_kernel
*kernel
= new state_kernel
;
191 add_kernel (kernel
, i
);
195 /* Compute the set of kernel_points that are 'tag-wise unambiguously
196 reachable' from a given initial set of points. Absent tagging, this
197 becomes a bog-standard NFA e_closure construction. */
199 te_closure (state_kernel
*start
, int ntags
, bool is_initial
= false)
201 state_kernel
*closure
= new state_kernel(*start
);
202 queue
<kernel_point
> worklist
;
204 /* To avoid searching through closure incessantly when retrieving
205 information about existing elements, the following caches are
207 vector
<unsigned> max_tags (ntags
, 0);
208 map
<ins
*, list
<kernel_point
> > closure_map
;
210 /* Reset priorities and cache initial elements of closure: */
211 for (state_kernel::iterator it
= closure
->begin();
212 it
!= closure
->end(); it
++)
214 it
->priority
= make_pair(0,0);
217 // Store the element in relevant caches:
219 for (list
<map_item
>::const_iterator jt
= it
->map_items
.begin();
220 jt
!= it
->map_items
.end(); jt
++)
221 max_tags
[jt
->first
] = max(jt
->second
, max_tags
[jt
->first
]);
223 closure_map
[it
->i
].push_back(*it
);
226 while (!worklist
.empty())
228 kernel_point point
= worklist
.front(); worklist
.pop();
230 // Identify e-transitions depending on the opcode.
231 // There are at most two e-transitions emerging from an insn.
232 // If we have two e-transitions, the 'other' has higher priority.
234 ins
*target
= NULL
; int tag
= -1;
235 ins
*other_target
= NULL
; int other_tag
= -1;
237 // TODOXXX line-by-line proceeds below
239 bool do_split
= false;
241 if (point
.i
->i
.tag
== TAG
)
243 target
= &point
.i
[1];
244 tag
= (int) point
.i
->i
.param
;
246 else if (point
.i
->i
.tag
== FORK
&& point
.i
== (ins
*) point
.i
->i
.link
)
248 /* Workaround for a FORK that points to itself: */
249 target
= &point
.i
[1];
251 else if (point
.i
->i
.tag
== FORK
)
254 // Relative priority of two e-transitions depends on param:
255 if (point
.i
->i
.param
)
257 // Prefer jumping to link.
258 target
= &point
.i
[1];
259 other_target
= (ins
*) point
.i
->i
.link
;
263 // Prefer stepping to next instruction.
264 target
= (ins
*) point
.i
->i
.link
;
265 other_target
= &point
.i
[1];
268 else if (point
.i
->i
.tag
== GOTO
)
270 target
= (ins
*) point
.i
->i
.link
;
272 else if (point
.i
->i
.tag
== INIT
&& is_initial
)
274 target
= &point
.i
[1];
279 // Data for the endpoint of the first transition:
282 next
.priority
= do_split
? refine_lower(point
.priority
) : point
.priority
;
283 next
.map_items
= point
.map_items
;
285 // Date for the endpoint of the second transition:
286 kernel_point other_next
;
287 other_next
.i
= other_target
;
288 other_next
.priority
= do_split
? refine_higher(point
.priority
) : point
.priority
;
289 other_next
.map_items
= point
.map_items
;
291 // Do infinite-loop-check:
292 other_next
.parents
= point
.parents
;
293 if (point
.parents
.find(other_next
.i
) != point
.parents
.end())
298 other_next
.parents
.insert(other_next
.i
);
300 next
.parents
= point
.parents
;
301 if (point
.parents
.find(next
.i
) != point
.parents
.end())
305 // ^^^ these are overwritten from other_target / other_tag immediately
308 next
.parents
.insert(next
.i
);
314 // Deal with the current e-transition:
318 /* Delete all existing next.map_items of the form m[tag,x]. */
319 for (list
<map_item
>::iterator it
= next
.map_items
.begin();
320 it
!= next
.map_items
.end(); )
321 if (it
->first
== (unsigned) tag
)
323 list
<map_item
>::iterator next_it
= it
;
325 next
.map_items
.erase (it
);
330 /* Add m[tag,x] to next.map_items, where x is the smallest
331 nonnegative integer such that m[tag,x] does not occur
332 anywhere in closure. Then update the cache. */
333 unsigned x
= max_tags
[tag
];
334 next
.map_items
.push_back(make_pair(tag
, ++x
));
338 already_found
= false;
340 /* Deal with similar transitions that have a different priority. */
341 for (list
<kernel_point
>::iterator it
= closure_map
[next
.i
].begin();
342 it
!= closure_map
[next
.i
].end(); )
344 int result
= arc_compare(it
->priority
, next
.priority
);
347 // obnoxious shuffle to avoid iterator invalidation
348 list
<kernel_point
>::iterator old_it
= it
;
350 closure_map
[next
.i
].erase(old_it
);
352 } else if (result
== 0) {
353 already_found
= true;
358 if (!already_found
) {
359 // Store the element in relevant caches:
361 closure_map
[next
.i
].push_back(next
);
363 for (list
<map_item
>::iterator jt
= next
.map_items
.begin();
364 jt
!= next
.map_items
.end(); jt
++)
365 max_tags
[jt
->first
] = max(jt
->second
, max_tags
[jt
->first
]);
367 // Store the element in closure:
368 closure
->push_back(next
);
373 // Now move to dealing with the second e-transition, if any.
374 target
= other_target
; other_target
= NULL
;
375 tag
= other_tag
; other_tag
= -1;
378 goto another_transition
;
384 /* Find the set of reordering commands (if any) that will get us from
385 state s to some existing state in the dfa (returns the state in
386 question, appends reordering commands to r). Returns NULL is no
387 suitable state is found. */
389 dfa::find_equivalent (state
*s
, tdfa_action
&)
391 state
*answer
= NULL
;
393 for (state_kernel::iterator it
= s
->kernel
->begin();
394 it
!= s
->kernel
->end(); it
++)
397 /* Check kernels of existing states for size equivalence and for
398 unmarked items (similar to re2c's original algorithm): */
399 unsigned n
= s
->kernel
->size();
400 for (state
*t
= first
; t
!= NULL
; t
= t
->next
)
402 if (t
->kernel
->size() == n
)
404 for (state_kernel::iterator it
= t
->kernel
->begin();
405 it
!= t
->kernel
->end(); it
++)
409 // TODOXXX check for existence of reordering tdfa_action r
418 for (state_kernel::iterator it
= s
->kernel
->begin();
419 it
!= s
->kernel
->end(); it
++)
426 dfa::dfa (ins
*i
, int ntags
, vector
<string
>& outcome_snippets
)
427 : orig_nfa(i
), nstates(0), ntags(ntags
), outcome_snippets(outcome_snippets
)
429 /* Initialize empty linked list of states: */
433 state_kernel
*seed_kernel
= make_kernel(start
);
434 state_kernel
*initial_kernel
= te_closure(seed_kernel
, ntags
, true);
436 state
*initial
= add_state(new state(initial_kernel
));
437 queue
<state
*> worklist
; worklist
.push(initial
);
439 while (!worklist
.empty())
441 state
*curr
= worklist
.front(); worklist
.pop();
443 vector
<list
<ins
*> > edges(NUM_REAL_CHARS
);
445 /* Using the CHAR instructions in kernel, build the initial
446 table of spans for curr. Also check for final states. */
448 for (list
<kernel_point
>::iterator it
= curr
->kernel
->begin();
449 it
!= curr
->kernel
->end(); it
++)
451 if (it
->i
->i
.tag
== CHAR
)
453 for (ins
*j
= &it
->i
[1]; j
< (ins
*) it
->i
->i
.link
; j
++)
454 edges
[j
->c
.value
].push_back((ins
*) it
->i
->i
.link
);
456 else if (it
->i
->i
.tag
== ACCEPT
)
458 /* Always prefer the highest numbered outcome: */
459 curr
->accepts
= true;
460 curr
->accept_outcome
= max(it
->i
->i
.param
, curr
->accept_outcome
);
464 for (unsigned c
= 0; c
< NUM_REAL_CHARS
; )
466 list
<ins
*> e
= edges
[c
];
467 assert (!e
.empty()); // XXX: ensured by fail_re in stapregex_compile
473 while (++c
< NUM_REAL_CHARS
&& edges
[c
] == e
) ;
477 s
.reach_pairs
= new state_kernel
;
479 for (list
<ins
*>::iterator it
= e
.begin();
481 add_kernel (s
.reach_pairs
, *it
);
483 curr
->spans
.push_back(s
);
486 /* For each of the spans in curr, determine the reachable
487 points assuming a character in the span. */
488 for (list
<span
>::iterator it
= curr
->spans
.begin();
489 it
!= curr
->spans
.end(); it
++)
491 state_kernel
*reach_pairs
= it
->reach_pairs
;
493 /* Set up candidate target state: */
494 state_kernel
*u_pairs
= te_closure(reach_pairs
, ntags
);
495 state
*target
= new state(u_pairs
);
498 /* Generate position-save commands for any map items
499 that do not appear in curr->kernel: */
501 set
<map_item
> all_items
;
502 for (state_kernel::const_iterator jt
= curr
->kernel
->begin();
503 jt
!= curr
->kernel
->end(); jt
++)
504 for (list
<map_item
>::const_iterator kt
= jt
->map_items
.begin();
505 kt
!= jt
->map_items
.end(); jt
++)
506 all_items
.insert(*kt
);
508 list
<map_item
> store_items
;
509 for (state_kernel::const_iterator jt
= u_pairs
->begin();
510 jt
!= u_pairs
->end(); jt
++)
511 for (list
<map_item
>::const_iterator kt
= jt
->map_items
.begin();
512 kt
!= jt
->map_items
.end(); kt
++)
513 if (all_items
.find(*kt
) == all_items
.end())
514 store_items
.push_back(*kt
);
516 for (list
<map_item
>::iterator jt
= store_items
.begin();
517 jt
!= store_items
.end(); jt
++)
519 // append m[i,n] <- <curr position> to c
522 insn
.save_pos
= true;
526 /* If there is a state t_prime in states such that some
527 sequence of reordering commands r produces t_prime
528 from target, use t_prime as the target state,
529 appending the reordering commands to c. */
530 state
*t_prime
= find_equivalent(target
, c
);
537 /* We need to actually add target to the dfa: */
540 worklist
.push(t_prime
);
542 if (t_prime
->accepts
)
544 // TODOXXX set the finisher of t_prime
548 /* Set the transition: */
567 // ------------------------------------------------------------------------
569 // TODOXXX add emission instructions for tag_ops
572 span::emit_jump (translator_output
*o
, const dfa
*d
) const
574 #ifdef STAPREGEX_DEBUG_MATCH
575 o
->newline () << "printf(\" --> GOTO yystate%d\\n\", " << to
->label
<< ");";
578 // TODOXXX tags feature allows proper longest-match priority
585 o
->newline () << "YYCURSOR++;";
586 o
->newline () << "goto yystate" << to
->label
<< ";";
590 /* Assuming the target DFA of the span is a final state, emit code to
591 (TODOXXX) cleanup tags and exit with a final answer. */
593 span::emit_final (translator_output
*o
, const dfa
*d
) const
595 assert (to
->accepts
); // XXX: must guarantee correct usage of emit_final()
596 o
->newline() << d
->outcome_snippets
[to
->accept_outcome
];
597 o
->newline() << "goto yyfinish;";
600 string
c_char(char c
)
610 state::emit (translator_output
*o
, const dfa
*d
) const
612 o
->newline() << "yystate" << label
<< ": ";
613 #ifdef STAPREGEX_DEBUG_MATCH
614 o
->newline () << "printf(\"READ '%s' %c\", cur, *YYCURSOR);";
616 o
->newline() << "switch (*YYCURSOR) {";
618 for (list
<span
>::const_iterator it
= spans
.begin();
619 it
!= spans
.end(); it
++)
621 // If we see a '\0', go immediately into an accept state:
624 o
->newline() << "case " << c_char('\0') << ":";
625 it
->emit_final(o
, d
); // TODOXXX extra function may be unneeded
628 // Emit labels to handle all the other elements of the span:
629 for (unsigned c
= max('\1', it
->lb
); c
<= (unsigned) it
->ub
; c
++) {
630 o
->newline() << "case " << c_char((char) c
) << ":";
634 // TODOXXX handle a 'default' set of characters for the largest span...
635 // TODOXXX optimize by accepting before end of string whenever possible... (also necessary for proper first-matched-substring selection)
637 o
->newline(-1) << "}";
641 dfa::emit (translator_output
*o
) const
643 #ifdef STAPREGEX_DEBUG_DFA
649 // XXX: workaround for empty regex
652 o
->newline() << outcome_snippets
[first
->accept_outcome
];
653 o
->newline() << "goto yyfinish;";
656 for (state
*s
= first
; s
; s
= s
->next
)
659 o
->newline() << "yyfinish: ;";
660 o
->newline(-1) << "}";
665 dfa::emit_tagsave (translator_output
*, std::string
,
666 std::string
, std::string
) const
668 // TODOXXX implement after testing the preceding algorithms
671 // ------------------------------------------------------------------------
674 operator << (std::ostream
&o
, const tdfa_action
& a
)
676 for (list
<tdfa_insn
>::const_iterator it
= a
.begin();
679 if (it
!= a
.begin()) o
<< "; ";
681 o
<< "m[" << it
->to
.first
<< "," << it
->to
.second
<< "] <- ";
686 o
<< "m[" << it
->from
.first
<< "," << it
->from
.second
<< "]";
693 operator << (std::ostream
&o
, const arc_priority
& p
)
695 o
<< p
.first
<< "/" << (1 << p
.second
);
700 state::print (translator_output
*o
) const
702 o
->line() << "state " << label
;
704 o
->line() << " accepts " << accept_outcome
;
705 if (!finalizer
.empty())
706 o
->line() << " [" << finalizer
<< "]";
709 for (list
<span
>::const_iterator it
= spans
.begin();
710 it
!= spans
.end(); it
++)
713 if (it
->lb
== it
->ub
)
715 print_escaped (o
->line(), it
->lb
);
720 print_escaped (o
->line(), it
->lb
);
722 print_escaped (o
->line(), it
->ub
);
725 o
->line() << "' -> " << it
->to
->label
;
727 if (!it
->action
.empty())
728 o
->line() << " [" << it
->action
<< "]";
734 dfa::print (std::ostream
& o
) const
736 translator_output
to(o
); print(&to
);
740 dfa::print (translator_output
*o
) const
743 for (state
*s
= first
; s
; s
= s
->next
)
752 operator << (std::ostream
& o
, const dfa
& d
)
759 operator << (std::ostream
&o
, const dfa
*d
)
767 /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */
This page took 0.069927 seconds and 5 git commands to generate.