stapregex-dfa.h

   1 // -*- C++ -*-
   2 // Copyright (C) 2012-2013 Red Hat Inc.
   3 //
   4 // This file is part of systemtap, and is free software.  You can
   5 // redistribute it and/or modify it under the terms of the GNU General
   6 // Public License (GPL); either version 2, or (at your option) any
   7 // later version.
   8 //
   9 // ---
  10 //
  11 // This file incorporates code from the re2c project; please see
  12 // the file README.stapregex for details.
  13
  14 #ifndef STAPREGEX_DFA_H
  15 #define STAPREGEX_DFA_H
  16
  17 #include <string>
  18 #include <iostream>
  19 #include <set>
  20 #include <list>
  21 #include <vector>
  22 #include <queue>
  23 #include <utility>
  24
  25 class translator_output; /* from translator-output.h */
  26
  27 namespace stapregex {
  28
  29 struct regexp; /* from stapregex-tree.h */
  30 union ins; /* from stapregex-tree.h */
  31
  32 struct dfa;
  33 struct state;
  34
  35 /* Coordinates of a subexpression map item m[t,s]: */
  36 typedef std::pair<unsigned, unsigned> map_item;
  37
  38 /* A tagged DFA transition can have a number of these instructions
  39    attached, which are able to assign the current position to specific
  40    map items, or to reorder the existing elements of the map: */
  41 struct tdfa_insn {
  42   map_item to, from;
  43   bool save_pos; // -- if true, assign position; if false, copy from's value
  44 };
  45 typedef std::list<tdfa_insn> tdfa_action;
  46
  47 std::ostream& operator << (std::ostream &o, const tdfa_action& a);
  48
  49 /* The arc_priority data type is a cunning way to represent transition
  50    priorities, necessitated by the fact that we have FORK opcodes (two
  51    outgoing e-transitions) which can lead to further FORK opcodes, &c,
  52    requiring a binary-subdivision style of priority assignment:
  53
  54                 -> 3/4 ... and so forth
  55                /
  56         -> 1/2 --> 2/4 ... and so forth
  57        /
  58       /       ---> 1/4 ... and so forth
  59      /       /
  60    0 ----> 0 ----> 0   ... and so forth
  61
  62    Our trick is pretty much just to allocate the possible values of an
  63    unsigned long in binary-search fashion.
  64
  65    XXX: For a 64-bit unsigned long type, this allows a chain of FORK
  66    opcodes around 64 units long (without intervening CHAR match
  67    insns), at which point things start to get funky. Be sure to keep
  68    an eye on whether this turns out to be enough in practice. */
  69 typedef std::pair<unsigned long, unsigned> arc_priority;
  70 arc_priority refine_higher(const arc_priority& a);
  71 arc_priority refine_lower(const arc_priority& a);
  72 int arc_compare(const arc_priority& a, const arc_priority& b);
  73
  74 std::ostream& operator << (std::ostream &o, const arc_priority& p);
  75
  76 /* When constructing tagged DFA sets from ins, we need to keep track
  77    of a set of instructions together with further bookkeeping
  78    information (relative preference/priority, map items affected). */
  79 struct kernel_point {
  80   ins *i;
  81   arc_priority priority; // -- used in tagged e-closure computation
  82   std::list<map_item> map_items;
  83   std::set<ins *> parents; // -- used for infinite-loop-detection
  84 };
  85 typedef std::list<kernel_point> state_kernel;
  86
  87 /* Corresponds to a tagged-DFA transition arc, complete with
  88    subexpression map reordering and such. */
  89 struct span {
  90   char lb, ub; // -- segment [lb, ub]
  91   state *to;
  92   tdfa_action action;
  93   state_kernel *reach_pairs; // -- for the subset-construction
  94                              // -- algorithm
  95
  96   void emit_jump (translator_output *o, const dfa *d) const;
  97   void emit_final (translator_output *o, const dfa *d) const;
  98 };
  99
 100 struct state {
 101   unsigned label; // -- index of state in dfa
 102   state *next;    // -- store dfa states as a linked list
 103   state_kernel *kernel; // -- set of corresponding ins coordinates
 104   /* NB: our usage of the term 'kernel' differs from re2c's slightly
 105      -- there is no real need to distinguish NFA edges inside the
 106      state from outgoing edges, (XXX) as far as I am aware. */
 107
 108   bool accepts;   // -- is this a final state?
 109   unsigned accept_outcome;
 110   tdfa_action finalizer; // -- run after accepting
 111
 112   std::list<span> spans;
 113
 114   state (state_kernel *kernel);
 115
 116   void emit (translator_output *o, const dfa *d) const;
 117
 118   void print (translator_output *o) const;
 119 };
 120
 121 // ------------------------------------------------------------------------
 122
 123 struct dfa {
 124   ins *orig_nfa;
 125   state *first, *last; // -- store dfa states as a linked list
 126   unsigned nstates;
 127
 128   // Infrastructure to deal with tagging:
 129   unsigned ntags;
 130   tdfa_action initializer; // -- run before entering start state
 131   std::vector<std::string> outcome_snippets;
 132
 133   dfa (ins *i, int ntags, std::vector<std::string>& outcome_snippets);
 134   ~dfa ();
 135
 136   void emit (translator_output *o) const;
 137   void emit_tagsave (translator_output *o, std::string tag_states,
 138                      std::string tag_vals, std::string tag_count) const;
 139
 140   void print (translator_output *o) const;
 141   void print (std::ostream& o) const;
 142
 143 private:
 144   state *add_state (state* s);
 145   state *find_equivalent (state *s, tdfa_action &r);
 146 };
 147
 148 std::ostream& operator << (std::ostream &o, const dfa& d);
 149 std::ostream& operator << (std::ostream &o, const dfa* d);
 150
 151 /* Produces a dfa that runs the specified code snippets based on match
 152    or fail outcomes for an unanchored (by default) match of re. */
 153 dfa *stapregex_compile (regexp *re, const std::string& match_snippet, const std::string& fail_snippet);
 154
 155 };
 156
 157 #endif
 158
 159 /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */