stapregex-dfa.h

   1 // -*- C++ -*-
   2 // Copyright (C) 2012-2018 Red Hat Inc.
   3 //
   4 // This file is part of systemtap, and is free software.  You can
   5 // redistribute it and/or modify it under the terms of the GNU General
   6 // Public License (GPL); either version 2, or (at your option) any
   7 // later version.
   8 //
   9 // ---
  10 //
  11 // This file incorporates code from the re2c project; please see
  12 // the file README.stapregex for details.
  13
  14 #ifndef STAPREGEX_DFA_H
  15 #define STAPREGEX_DFA_H
  16
  17 #include <string>
  18 #include <iostream>
  19 #include <set>
  20 #include <list>
  21 #include <vector>
  22 #include <queue>
  23 #include <utility>
  24
  25 #include "stapregex-defines.h"
  26
  27 class translator_output; /* from translator-output.h */
  28
  29 namespace stapregex {
  30
  31 struct regexp; /* from stapregex-tree.h */
  32 union ins; /* from stapregex-tree.h */
  33
  34 struct dfa;
  35 struct state;
  36
  37 /* Coordinates of a subexpression map item m[t,s]: */
  38 typedef std::pair<unsigned, unsigned> map_item;
  39
  40 std::ostream& operator << (std::ostream &o, const map_item& m);
  41
  42 /* A tagged DFA transition can have a number of these instructions
  43    attached, which are able to assign the current position to specific
  44    map items, or to reorder the existing elements of the map: */
  45 struct tdfa_insn {
  46   map_item to, from;
  47   bool save_tag; // -- if true, copy from's value to final tag
  48   bool save_pos; // -- if true, assign cur position; else, copy from's value
  49 };
  50 typedef std::list<tdfa_insn> tdfa_action;
  51
  52 std::ostream& operator << (std::ostream &o, const tdfa_action& a);
  53
  54 /* The arc_priority data type is a cunning way to represent transition
  55    priorities, necessitated by the fact that we have FORK opcodes (two
  56    outgoing e-transitions) which can lead to further FORK opcodes, &c,
  57    requiring a binary-subdivision style of priority assignment:
  58
  59                 -> 3/4 ... and so forth
  60                /
  61         -> 1/2 --> 2/4 ... and so forth
  62        /
  63       /       ---> 1/4 ... and so forth
  64      /       /
  65    0 ----> 0 ----> 0   ... and so forth
  66
  67    Our trick is pretty much just to allocate the possible values of an
  68    unsigned long in binary-search fashion.
  69
  70    XXX: For a 64-bit unsigned long type, this allows a chain of FORK
  71    opcodes around 64 units long (without intervening CHAR match
  72    insns), at which point things start to get funky. Be sure to keep
  73    an eye on whether this turns out to be enough in practice.
  74
  75    TODOXXX: May want to test how this plays out in 32-bit architectures. */
  76 typedef std::pair<unsigned long long, unsigned> arc_priority;
  77 #define MAKE_START_PRIORITY make_pair(0,0)
  78 arc_priority refine_higher(const arc_priority& a);
  79 arc_priority refine_lower(const arc_priority& a);
  80 int arc_compare(const arc_priority& a, const arc_priority& b);
  81
  82 std::ostream& operator << (std::ostream &o, const arc_priority& p);
  83
  84 /* When constructing tagged DFA sets from ins, we need to keep track
  85    of a set of instructions together with further bookkeeping
  86    information (relative preference/priority, map items affected). */
  87 struct kernel_point {
  88   ins *i;
  89   arc_priority priority; // -- used in tagged e-closure computation
  90   std::list<map_item> map_items;
  91   std::set<ins *> parents; // -- used for infinite-loop-detection
  92   void print (std::ostream &o, ins *base) const;
  93 };
  94 typedef std::list<kernel_point> state_kernel; // TODO: does it make sense to have duplicate ins inside a state-kernel?
  95
  96 /* Corresponds to a tagged-DFA transition arc, complete with
  97    subexpression map reordering and such. */
  98 struct span {
  99   rchar lb, ub; // -- segment [lb, ub]
 100   state *to;
 101   tdfa_action action;
 102   state_kernel *jump_pairs; // -- kernel_points that jump to this span
 103   state_kernel *reach_pairs; // -- starting point for te_closure computation
 104
 105   void emit_jump (translator_output *o, const dfa *d) const;
 106   void emit_final (translator_output *o, const dfa *d) const;
 107 };
 108
 109 struct state {
 110   dfa *owner;     // -- dfa state was made for (XXX may not contain state yet)
 111   unsigned label; // -- index of state in dfa
 112   state *next;    // -- store dfa states as a linked list
 113   state_kernel *kernel; // -- set of corresponding ins coordinates
 114   /* NB: our usage of the term 'kernel' differs from re2c's slightly
 115      -- there is no real need to distinguish NFA edges inside the
 116      state from outgoing edges, (XXX) as far as I am aware. */
 117
 118   bool accepts;   // -- is this a final state?
 119   unsigned accept_outcome;
 120   kernel_point *accept_kp;
 121   tdfa_action finalizer; // -- run after accepting
 122
 123   std::list<span> spans;
 124
 125   state (dfa *dfa, state_kernel *kernel);
 126
 127   void emit (translator_output *o, const dfa *d) const;
 128
 129   void print (translator_output *o) const;
 130   void print (std::ostream& o) const;
 131 };
 132
 133 std::ostream& operator << (std::ostream &o, const state* s);
 134
 135 // ------------------------------------------------------------------------
 136
 137 struct dfa {
 138   ins *orig_nfa;
 139   state *first, *last; // -- store dfa states as a linked list
 140   unsigned nstates;
 141
 142   // Infrastructure to deal with tagging:
 143   unsigned nmapitems;
 144   unsigned ntags;
 145   tdfa_action initializer; // -- run before entering start state
 146   std::vector<std::string> outcome_snippets;
 147
 148   // When using tagged DFAs, record indices of the success and failure outcome:
 149   int success_outcome;
 150   int fail_outcome;
 151
 152   dfa (ins *i, int ntags, std::vector<std::string>& outcome_snippets,
 153        int accept_outcome = 1);
 154   ~dfa ();
 155
 156   void emit (translator_output *o) const;
 157
 158   void emit_action (translator_output *o, const tdfa_action &act) const;
 159   void emit_tagsave (translator_output *o, std::string tag_states,
 160                      std::string tag_vals, std::string tag_count) const;
 161
 162   void print (translator_output *o) const;
 163   void print (std::ostream& o) const;
 164
 165 private:
 166   void add_map_item (const map_item &m);
 167   state *add_state (state* s);
 168   state *find_equivalent (state *s, tdfa_action &r);
 169   tdfa_action compute_action (state_kernel *old_k, state_kernel *new_k);
 170   tdfa_action compute_finalizer (state *s);
 171 };
 172
 173 std::ostream& operator << (std::ostream &o, const dfa& d);
 174 std::ostream& operator << (std::ostream &o, const dfa* d);
 175
 176 /* Produces a dfa that runs the specified code snippets based on match
 177    or fail outcomes for an unanchored (by default) match of re. */
 178 dfa *stapregex_compile (regexp *re, const std::string& match_snippet, const std::string& fail_snippet);
 179
 180 };
 181
 182 #endif
 183
 184 /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */