[systemtap.git] / stapregex-dfa.cxx

// -*- C++ -*-
// Copyright (C) 2012-2013 Red Hat Inc.
//
// This file is part of systemtap, and is free software.  You can
// redistribute it and/or modify it under the terms of the GNU General
// Public License (GPL); either version 2, or (at your option) any
// later version.
//
// ---
//
// This file incorporates code from the re2c project; please see
// the file README.stapregex for details.

#include <string>
#include <iostream>
#include <sstream>
#include <set>
#include <list>
#include <map>
#include <vector>
#include <queue>
#include <utility>

#include "translator-output.h"

#include "stapregex-parse.h"
#include "stapregex-tree.h"
#include "stapregex-dfa.h"

// Uncomment to show result of ins (NFA) compilation:
//#define STAPREGEX_DEBUG_INS
// Uncomment to display result of DFA compilation in a compact format:
//#define STAPREGEX_DEBUG_DFA

using namespace std;

namespace stapregex {

regexp *pad_re = NULL;
regexp *fail_re = NULL;

dfa *
stapregex_compile (regexp *re, const std::string& match_snippet,
                   const std::string& fail_snippet)
{
  if (pad_re == NULL) {
    // build regexp for ".*"
    pad_re = make_dot ();
    pad_re = new close_op (pad_re, true); // -- prefer shorter match
    pad_re = new alt_op (pad_re, new null_op, true); // -- prefer second match
  }
  if (fail_re == NULL) {
    // build regexp for ".*$", but allow '\0' and support fail outcome
    fail_re = make_dot (true); // -- allow '\0'
    fail_re = new close_op (fail_re, true); // -- prefer shorter match
    fail_re = new alt_op (fail_re, new null_op, true); // -- prefer second match
    fail_re = new cat_op (fail_re, new anchor_op('$'));
    fail_re = new rule_op(fail_re, 0);
    // XXX: this approach creates one extra spurious-but-safe state
    // (safe because the matching procedure stops after encountering '\0')
  }

  vector<string> outcomes(2);
  outcomes[0] = fail_snippet;
  outcomes[1] = match_snippet;

  int num_tags = re->num_tags;

  // Pad & wrap re in appropriate rule_ops to control match behaviour:
  bool anchored = re->anchored ();
  if (!anchored) re = new cat_op(pad_re, re); // -- left-padding
  re = new rule_op(re, 1);
  re = new alt_op(re, fail_re);

#ifdef STAPREGEX_DEBUG_INS
  cerr << "RESULTING INS FROM REGEX " << re << ":" << endl;
#endif

  ins *i = re->compile();

#ifdef STAPREGEX_DEBUG_INS
  for (const ins *j = i; (j - i) < re->ins_size() + 1; )
    {
      j = show_ins(cerr, j, i); cerr << endl;
    }
  cerr << endl;
#endif
  
  // TODOXXX optimize ins as in re2c

  dfa *d = new dfa(i, num_tags, outcomes);

  // Carefully deallocate temporary scaffolding:
  if (!anchored) delete ((rule_op*) ((alt_op*) re)->a)->re; // -- new cat_op
  delete ((alt_op*) re)->a; // -- new rule_op
  delete re; // -- new alt_op
  // NB: deleting a regular expression DOES NOT deallocate its
  // children. The original re parameter is presumed to be retained
  // indefinitely as part of a stapdfa table, or such....

  return d;
}

// ------------------------------------------------------------------------

/* Now follows the heart of the tagged-DFA algorithm. This is a basic
   implementation of the algorithm described in Ville Laurikari's
   Masters thesis and summarized in the paper "NFAs with Tagged
   Transitions, their Conversion to Deterministic Automata and
   Application to Regular Expressions"
   (http://laurikari.net/ville/spire2000-tnfa.pdf).

   HERE BE DRAGONS (and not the friendly kind) */

/* Functions to deal with relative transition priorities: */

arc_priority
refine_higher(const arc_priority& a)
{
  return make_pair(2 * a.first + 1, a.second + 1);
}

arc_priority
refine_lower (const arc_priority& a)
{
  return make_pair(2 * a.first, a.second + 1);
}

int
arc_compare (const arc_priority& a, const arc_priority& b)
{
  unsigned long x = a.first;
  unsigned long y = b.first;

  if (a.second > b.second)
    x = x << (a.second - b.second);
  else if (a.second < b.second)
    y = y << (b.second - a.second);

  return ( x == y ? 0 : x < y ? -1 : 1 );
}

/* Manage the linked list of states in a DFA: */

state::state (state_kernel *kernel)
  : next(NULL), kernel(kernel), accepts(false), accept_outcome(0) {}

state *
dfa::add_state (state *s)
{
  s->label = nstates++;
  
  if (last == NULL)
    {
      last = s;
      first = last;
    }
  else
    {
      // append to the end
      last->next = s;
      last = last->next;
    }

  return last;
}

/* Operations to build a simple kernel prior to taking closure: */

void
add_kernel (state_kernel *kernel, ins *i)
{
  kernel_point point;
  point.i = i;
  point.priority = make_pair(0,0);
  // NB: point->map_items is empty

  kernel->push_back(point);
}

state_kernel *
make_kernel (ins *i)
{
  state_kernel *kernel = new state_kernel;
  add_kernel (kernel, i);
  return kernel;
}

/* Compute the set of kernel_points that are 'tag-wise unambiguously
   reachable' from a given initial set of points. Absent tagging, this
   becomes a bog-standard NFA e_closure construction. */
state_kernel *
te_closure (state_kernel *start, int ntags, bool is_initial = false)
{
  state_kernel *closure = new state_kernel(*start);
  queue<kernel_point> worklist;

  /* To avoid searching through closure incessantly when retrieving
     information about existing elements, the following caches are
     needed: */
  vector<unsigned> max_tags (ntags, 0);
  map<ins *, list<kernel_point> > closure_map;

  /* Reset priorities and cache initial elements of closure: */
  for (state_kernel::iterator it = closure->begin();
       it != closure->end(); it++)
    {
      it->priority = make_pair(0,0);
      worklist.push(*it);

      // Store the element in relevant caches:

      for (list<map_item>::const_iterator jt = it->map_items.begin();
           jt != it->map_items.end(); jt++)
        max_tags[jt->first] = max(jt->second, max_tags[jt->first]);

      closure_map[it->i].push_back(*it);
    }

  while (!worklist.empty())
    {
      kernel_point point = worklist.front(); worklist.pop();

      // Identify e-transitions depending on the opcode.
      // There are at most two e-transitions emerging from an insn.
      // If we have two e-transitions, the 'other' has higher priority.

      ins *target = NULL; int tag = -1;
      ins *other_target = NULL; int other_tag = -1;

      // TODOXXX line-by-line proceeds below

      bool do_split = false;

      if (point.i->i.tag == TAG)
        {
          target = &point.i[1];
          tag = (int) point.i->i.param;
        }
      else if (point.i->i.tag == FORK && point.i == (ins *) point.i->i.link)
        {
          /* Workaround for a FORK that points to itself: */
          target = &point.i[1];
        }
      else if (point.i->i.tag == FORK)
        {
          do_split = true;
          // Relative priority of two e-transitions depends on param:
          if (point.i->i.param)
            {
              // Prefer jumping to link.
              target = &point.i[1];
              other_target = (ins *) point.i->i.link;
            }
          else
            {
              // Prefer stepping to next instruction.
              target = (ins *) point.i->i.link;
              other_target = &point.i[1];
            }
        }
      else if (point.i->i.tag == GOTO)
        {
          target = (ins *) point.i->i.link;
        }
      else if (point.i->i.tag == INIT && is_initial)
        {
          target = &point.i[1];
        }

      bool already_found;

      // Data for the endpoint of the first transition:
      kernel_point next;
      next.i = target;
      next.priority = do_split ? refine_lower(point.priority) : point.priority;
      next.map_items = point.map_items;

      // Date for the endpoint of the second transition:
      kernel_point other_next;
      other_next.i = other_target;
      other_next.priority = do_split ? refine_higher(point.priority) : point.priority;
      other_next.map_items = point.map_items;

      // Do infinite-loop-check:
      other_next.parents = point.parents;
      if (point.parents.find(other_next.i) != point.parents.end())
        {
          other_target = NULL;
          other_tag = -1;
        }
      other_next.parents.insert(other_next.i);

      next.parents = point.parents;
      if (point.parents.find(next.i) != point.parents.end())
        {
          target = NULL;
          tag = -1;
          goto next_target;
        }
      next.parents.insert(next.i);

    another_transition:
      if (target == NULL)
        continue;

      // Deal with the current e-transition:

      if (tag >= 0)
        {
          /* Delete all existing next.map_items of the form m[tag,x]. */
          for (list<map_item>::iterator it = next.map_items.begin();
               it != next.map_items.end(); it++)
            if (it->first == (unsigned) tag)
              next.map_items.erase (it);

          /* Add m[tag,x] to next.map_items, where x is the smallest
             nonnegative integer such that m[tag,x] does not occur
             anywhere in closure. Then update the cache. */
          unsigned x = max_tags[tag];
          next.map_items.push_back(make_pair(tag, ++x));
          max_tags[tag] = x;
        }

      already_found = false;

      /* Deal with similar transitions that have a different priority. */
      for (list<kernel_point>::iterator it = closure_map[next.i].begin();
           it != closure_map[next.i].end(); )
        {
          int result = arc_compare(it->priority, next.priority);

          if (result > 0) {
            // obnoxious shuffle to avoid iterator invalidation
            list<kernel_point>::iterator old_it = it;
            it++;
            closure_map[next.i].erase(old_it);
            continue;
          } else if (result == 0) {
            already_found = true;
          }
          it++;
        }

      if (!already_found) {
        // Store the element in relevant caches:

        closure_map[next.i].push_back(next);

        for (list<map_item>::iterator jt = next.map_items.begin();
             jt != next.map_items.end(); jt++)
          max_tags[jt->first] = max(jt->second, max_tags[jt->first]);

        // Store the element in closure:
        closure->push_back(next);
        worklist.push(next);
      }

    next_target:
      // Now move to dealing with the second e-transition, if any.
      target = other_target; other_target = NULL;
      tag = other_tag; other_tag = -1;
      next = other_next;

      goto another_transition;
    }

  return closure;
}

/* Find the set of reordering commands (if any) that will get us from
   state s to some existing state in the dfa (returns the state in
   question, appends reordering commands to r). Returns NULL is no
   suitable state is found. */
state *
dfa::find_equivalent (state *s, tdfa_action &r)
{
  state *answer = NULL;

  for (state_kernel::iterator it = s->kernel->begin();
       it != s->kernel->end(); it++)
    mark(it->i);

  /* Check kernels of existing states for size equivalence and for
     unmarked items (similar to re2c's original algorithm): */
  unsigned n = s->kernel->size();
  for (state *t = first; t != NULL; t = t->next)
    {
      if (t->kernel->size() == n)
        {
          for (state_kernel::iterator it = t->kernel->begin();
               it != t->kernel->end(); it++)
              if (!marked(it->i)) 
                goto next_state;

          // TODOXXX check for existence of reordering r
          answer = t;
          goto cleanup;
        }
    next_state:
      ;
    }

 cleanup:
  for (state_kernel::iterator it = s->kernel->begin();
       it != s->kernel->end(); it++)
    unmark(it->i);

  return answer;
}


dfa::dfa (ins *i, int ntags, vector<string>& outcome_snippets)
  : orig_nfa(i), nstates(0), ntags(ntags), outcome_snippets(outcome_snippets)
{
  /* Initialize empty linked list of states: */
  first = last = NULL;

  ins *start = &i[0];
  state_kernel *initial_kernel = te_closure(make_kernel(start), ntags, true);
  state *initial = add_state(new state(initial_kernel));
  queue<state *> worklist; worklist.push(initial);

  while (!worklist.empty())
    {
      state *curr = worklist.front(); worklist.pop();

      vector<list<ins *> > edges(NUM_REAL_CHARS);

      /* Using the CHAR instructions in kernel, build the initial
         table of spans for curr. Also check for final states. */

      for (list<kernel_point>::iterator it = curr->kernel->begin();
           it != curr->kernel->end(); it++)
        {
          if (it->i->i.tag == CHAR)
            {
              for (ins *j = &it->i[1]; j < (ins *) it->i->i.link; j++)
                edges[j->c.value].push_back((ins *) it->i->i.link);
            }
          else if (it->i->i.tag == ACCEPT)
            {
              /* Always prefer the highest numbered outcome: */
              curr->accepts = true;
              curr->accept_outcome = max(it->i->i.param, curr->accept_outcome);
            }
        }

      for (unsigned c = 0; c < NUM_REAL_CHARS; )
        {
          list <ins *> e = edges[c];
          assert (!e.empty()); // XXX: ensured by fail_re in stapregex_compile

          span s;

          s.lb = c;

          while (++c < NUM_REAL_CHARS && edges[c] == e) ;

          s.ub = c - 1;

          s.reach_pairs = new state_kernel;

          for (list<ins *>::iterator it = e.begin();
               it != e.end(); it++)
            add_kernel (s.reach_pairs, *it);

          curr->spans.push_back(s);
        }

      /* For each of the spans in curr, determine the reachable
         points assuming a character in the span. */
      for (list<span>::iterator it = curr->spans.begin();
           it != curr->spans.end(); it++)
        {
          state_kernel *reach_pairs = it->reach_pairs;

          /* Set up candidate target state: */
          state_kernel *u_pairs = te_closure(reach_pairs, ntags);
          state *target = new state(u_pairs);
          tdfa_action c;

          /* Generate position-save commands for any map items
             that do not appear in curr->kernel: */

          set<map_item> all_items;
          for (state_kernel::const_iterator jt = curr->kernel->begin();
               jt != curr->kernel->end(); jt++)
            for (list<map_item>::const_iterator kt = jt->map_items.begin();
                 kt != jt->map_items.end(); jt++)
              all_items.insert(*kt);

          list<map_item> store_items;
          for (state_kernel::const_iterator jt = u_pairs->begin();
               jt != u_pairs->end(); jt++)
            for (list<map_item>::const_iterator kt = jt->map_items.begin();
                kt != jt->map_items.end(); kt++)
              if (all_items.find(*kt) == all_items.end())
                store_items.push_back(*kt);

          for (list<map_item>::iterator jt = store_items.begin();
               jt != store_items.end(); jt++)
            {
              // append m[i,n] <- <curr position> to c
              tdfa_insn insn;
              insn.to = *jt;
              insn.save_pos = true;
              c.push_back(insn);
            }

          /* If there is a state t_prime in states such that some
             sequence of reordering commands r produces t_prime
             from target, use t_prime as the target state,
             appending the reordering commands to c. */
          state *t_prime = find_equivalent(target, c);
          if (t_prime != NULL)
            {
              delete target;
            }
          else
            {
              /* We need to actually add target to the dfa: */
              t_prime = target;
              add_state(t_prime);
              worklist.push(t_prime);

              if (t_prime->accepts)
                {
                  // TODOXXX set the finisher of t_prime
                }
            }

          /* Set the transition: */
          it->to = t_prime;
          it->action = c;
        }
    }
}

dfa::~dfa ()
{
  state * s;
  while ((s = first))
    {
      first = s->next;
      delete s;
    }

  delete orig_nfa;
}

// ------------------------------------------------------------------------

// TODOXXX add emission instructions for tag_ops

void
span::emit_jump (translator_output *o, const dfa *d) const
{
  o->newline () << "printf(\" --> GOTO yystate%d\\n\", " << to->label << ");";

  // TODOXXX tags feature allows proper longest-match priority
  if (to->accepts)
    {
      emit_final(o, d);
    }
  else
    {
      o->newline () << "YYCURSOR++;";
      o->newline () << "goto yystate" << to->label << ";";
    }
}

/* Assuming the target DFA of the span is a final state, emit code to
   (TODOXXX) cleanup tags and exit with a final answer. */
void
span::emit_final (translator_output *o, const dfa *d) const
{
  assert (to->accepts); // XXX: must guarantee correct usage of emit_final()
  o->newline() << d->outcome_snippets[to->accept_outcome];
  o->newline() << "goto yyfinish;";
}

string c_char(char c)
{
  stringstream o;
  o << "'";
  print_escaped(o, c);
  o << "'";
  return o.str();
}

void
state::emit (translator_output *o, const dfa *d) const
{
  o->newline() << "yystate" << label << ": ";
  o->newline () << "printf(\"READ '%s' %c\", cur, *YYCURSOR);";
  o->newline() << "switch (*YYCURSOR) {";
  o->indent(1);
  for (list<span>::const_iterator it = spans.begin();
       it != spans.end(); it++)
    {
      // If we see a '\0', go immediately into an accept state:
      if (it->lb == '\0')
        {
          o->newline() << "case " << c_char('\0') << ":";
          it->emit_final(o, d); // TODOXXX extra function may be unneeded
        }

      // Emit labels to handle all the other elements of the span:
      for (unsigned c = max('\1', it->lb); c <= it->ub; c++) {
        o->newline() << "case " << c_char((char) c) << ":";
      }
      it->emit_jump(o, d);

      // TODOXXX handle a 'default' set of characters for the largest span...
      // TODOXXX optimize by accepting before end of string whenever possible... (also necessary for proper first-matched-substring selection)
    }
  o->newline(-1) << "}";
}

void
dfa::emit (translator_output *o) const
{
#ifdef STAPREGEX_DEBUG_DFA
  print(o);
#else
  o->newline() << "{";
  o->newline(1);
  for (state *s = first; s; s = s->next)
    s->emit(o, this);
  o->newline() << "yyfinish: ;";
  o->newline(-1) << "}";
#endif
}

void
dfa::emit_tagsave (translator_output *o, std::string tag_states,
                   std::string tag_vals, std::string tag_count) const
{
  // TODOXXX implement after testing the preceding algorithms
}

// ------------------------------------------------------------------------

std::ostream&
operator << (std::ostream &o, const tdfa_action& a)
{
  for (list<tdfa_insn>::const_iterator it = a.begin();
       it != a.end(); it++)
    {
      if (it != a.begin()) o << "; ";

      o << "m[" << it->to.first << "," << it->to.second << "] <- ";

      if (it->save_pos)
        o << "p";
      else
        o << "m[" << it->from.first << "," << it->from.second << "]";
    }

  return o;
}

std::ostream&
operator << (std::ostream &o, const arc_priority& p)
{
  o << p.first << "/" << (1 << p.second);
  return o;
}

void
state::print (translator_output *o) const
{
  o->line() << "state " << label;
  if (accepts)
    o->line() << " accepts " << accept_outcome;
  if (!finalizer.empty())
    o->line() << " [" << finalizer << "]";

  o->indent(1);
  for (list<span>::const_iterator it = spans.begin();
       it != spans.end(); it++)
    {
      o->newline() << "'";
      if (it->lb == it->ub)
        {
          print_escaped (o->line(), it->lb);
          o->line() << "  ";
        }
      else
        {
          print_escaped (o->line(), it->lb);
          o->line() << "-";
          print_escaped (o->line(), it->ub);
        }

      o->line() << "' -> " << it->to->label;

      if (!it->action.empty())
        o->line() << " [" << it->action << "]";
    }
  o->newline(-1);
}

void
dfa::print (std::ostream& o) const
{
  translator_output to(o); print(&to);
}

void
dfa::print (translator_output *o) const
{
  o->newline();
  for (state *s = first; s; s = s->next)
    {
      s->print(o);
      o->newline();
    }
  o->newline();
}

std::ostream&
operator << (std::ostream& o, const dfa& d)
{
  d.print(o);
  return o;
}

std::ostream&
operator << (std::ostream &o, const dfa *d)
{
  o << *d;
  return o;
}

};

/* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */
Commit	Line	Data
07777235 SM	1	// -- C++ --
	2	// Copyright (C) 2012-2013 Red Hat Inc.
	3	//
	4	// This file is part of systemtap, and is free software. You can
	5	// redistribute it and/or modify it under the terms of the GNU General
	6	// Public License (GPL); either version 2, or (at your option) any
	7	// later version.
	8	//
	9	// ---
	10	//
	11	// This file incorporates code from the re2c project; please see
	12	// the file README.stapregex for details.
	13
	14	#include <string>
	15	#include <iostream>
54f92ee4	16	#include <sstream>
07777235 SM	17	#include <set>
	18	#include <list>
	19	#include <map>
	20	#include <vector>
	21	#include <queue>
	22	#include <utility>
	23
	24	#include "translator-output.h"
	25
	26	#include "stapregex-parse.h"
	27	#include "stapregex-tree.h"
	28	#include "stapregex-dfa.h"
	29
54f92ee4 SM	30	// Uncomment to show result of ins (NFA) compilation:
	31	//#define STAPREGEX_DEBUG_INS
	32	// Uncomment to display result of DFA compilation in a compact format:
	33	//#define STAPREGEX_DEBUG_DFA
	34
07777235 SM	35	using namespace std;
	36
	37	namespace stapregex {
	38
	39	regexp *pad_re = NULL;
	40	regexp *fail_re = NULL;
	41
	42	dfa *
	43	stapregex_compile (regexp *re, const std::string& match_snippet,
	44	const std::string& fail_snippet)
	45	{
	46	if (pad_re == NULL) {
	47	// build regexp for ".*"
	48	pad_re = make_dot ();
248f3856 SM	49	pad_re = new close_op (pad_re, true); // -- prefer shorter match
248f3856 SM	50	pad_re = new alt_op (pad_re, new null_op, true); // -- prefer second match
07777235 SM	51	}
07777235 SM	52	if (fail_re == NULL) {
9f232afb	53	// build regexp for ".*$", but allow '\0' and support fail outcome
9b7e0f3d	54	fail_re = make_dot (true); // -- allow '\0'
248f3856 SM	55	fail_re = new close_op (fail_re, true); // -- prefer shorter match
248f3856 SM	56	fail_re = new alt_op (fail_re, new null_op, true); // -- prefer second match
9f232afb	57	fail_re = new cat_op (fail_re, new anchor_op('$'));
248f3856	58	fail_re = new rule_op(fail_re, 0);
9f232afb SM	59	// XXX: this approach creates one extra spurious-but-safe state
9f232afb SM	60	// (safe because the matching procedure stops after encountering '\0')
07777235 SM	61	}
	62
	63	vector<string> outcomes(2);
	64	outcomes[0] = fail_snippet;
	65	outcomes[1] = match_snippet;
	66
	67	int num_tags = re->num_tags;
	68
	69	// Pad & wrap re in appropriate rule_ops to control match behaviour:
8d2eb09e SM	70	bool anchored = re->anchored ();
8d2eb09e SM	71	if (!anchored) re = new cat_op(pad_re, re); // -- left-padding
07777235 SM	72	re = new rule_op(re, 1);
	73	re = new alt_op(re, fail_re);
	74
248f3856 SM	75	#ifdef STAPREGEX_DEBUG_INS
	76	cerr << "RESULTING INS FROM REGEX " << re << ":" << endl;
	77	#endif
	78
07777235	79	ins *i = re->compile();
248f3856 SM	80
	81	#ifdef STAPREGEX_DEBUG_INS
	82	for (const ins *j = i; (j - i) < re->ins_size() + 1; )
	83	{
	84	j = show_ins(cerr, j, i); cerr << endl;
	85	}
	86	cerr << endl;
	87	#endif
	88
	89	// TODOXXX optimize ins as in re2c
	90
07777235 SM	91	dfa *d = new dfa(i, num_tags, outcomes);
07777235 SM	92
8d2eb09e SM	93	// Carefully deallocate temporary scaffolding:
8d2eb09e SM	94	if (!anchored) delete ((rule_op) ((alt_op) re)->a)->re; // -- new cat_op
07777235 SM	95	delete ((alt_op*) re)->a; // -- new rule_op
	96	delete re; // -- new alt_op
	97	// NB: deleting a regular expression DOES NOT deallocate its
	98	// children. The original re parameter is presumed to be retained
	99	// indefinitely as part of a stapdfa table, or such....
	100
	101	return d;
	102	}
	103
	104	// ------------------------------------------------------------------------
	105
	106	/* Now follows the heart of the tagged-DFA algorithm. This is a basic
	107	implementation of the algorithm described in Ville Laurikari's
	108	Masters thesis and summarized in the paper "NFAs with Tagged
	109	Transitions, their Conversion to Deterministic Automata and
	110	Application to Regular Expressions"
	111	(http://laurikari.net/ville/spire2000-tnfa.pdf).
	112
	113	HERE BE DRAGONS (and not the friendly kind) */
	114
07777235 SM	115	/* Functions to deal with relative transition priorities: */
	116
	117	arc_priority
	118	refine_higher(const arc_priority& a)
	119	{
	120	return make_pair(2 * a.first + 1, a.second + 1);
	121	}
	122
	123	arc_priority
	124	refine_lower (const arc_priority& a)
	125	{
	126	return make_pair(2 * a.first, a.second + 1);
	127	}
	128
	129	int
	130	arc_compare (const arc_priority& a, const arc_priority& b)
	131	{
	132	unsigned long x = a.first;
	133	unsigned long y = b.first;
	134
	135	if (a.second > b.second)
	136	x = x << (a.second - b.second);
	137	else if (a.second < b.second)
	138	y = y << (b.second - a.second);
	139
	140	return ( x == y ? 0 : x < y ? -1 : 1 );
	141	}
	142
	143	/* Manage the linked list of states in a DFA: */
	144
	145	state::state (state_kernel *kernel)
	146	: next(NULL), kernel(kernel), accepts(false), accept_outcome(0) {}
	147
	148	state *
	149	dfa::add_state (state *s)
	150	{
	151	s->label = nstates++;
	152
	153	if (last == NULL)
	154	{
	155	last = s;
	156	first = last;
	157	}
	158	else
	159	{
	160	// append to the end
	161	last->next = s;
	162	last = last->next;
	163	}
	164
	165	return last;
	166	}
	167
	168	/* Operations to build a simple kernel prior to taking closure: */
	169
9b7e0f3d SM	170	void
9b7e0f3d SM	171	add_kernel (state_kernel kernel, ins i)
07777235 SM	172	{
	173	kernel_point point;
	174	point.i = i;
	175	point.priority = make_pair(0,0);
	176	// NB: point->map_items is empty
	177
	178	kernel->push_back(point);
	179	}
	180
	181	state_kernel *
	182	make_kernel (ins *i)
	183	{
	184	state_kernel *kernel = new state_kernel;
	185	add_kernel (kernel, i);
	186	return kernel;
	187	}
	188
	189	/* Compute the set of kernel_points that are 'tag-wise unambiguously
	190	reachable' from a given initial set of points. Absent tagging, this
	191	becomes a bog-standard NFA e_closure construction. */
	192	state_kernel *
	193	te_closure (state_kernel *start, int ntags, bool is_initial = false)
	194	{
	195	state_kernel closure = new state_kernel(start);
	196	queue<kernel_point> worklist;
	197
	198	/* To avoid searching through closure incessantly when retrieving
	199	information about existing elements, the following caches are
	200	needed: */
	201	vector<unsigned> max_tags (ntags, 0);
	202	map<ins *, list<kernel_point> > closure_map;
	203
	204	/* Reset priorities and cache initial elements of closure: */
	205	for (state_kernel::iterator it = closure->begin();
	206	it != closure->end(); it++)
	207	{
	208	it->priority = make_pair(0,0);
	209	worklist.push(*it);
	210
	211	// Store the element in relevant caches:
	212
	213	for (list<map_item>::const_iterator jt = it->map_items.begin();
	214	jt != it->map_items.end(); jt++)
	215	max_tags[jt->first] = max(jt->second, max_tags[jt->first]);
	216
	217	closure_map[it->i].push_back(*it);
	218	}
	219
	220	while (!worklist.empty())
	221	{
	222	kernel_point point = worklist.front(); worklist.pop();
	223
	224	// Identify e-transitions depending on the opcode.
	225	// There are at most two e-transitions emerging from an insn.
	226	// If we have two e-transitions, the 'other' has higher priority.
	227
	228	ins *target = NULL; int tag = -1;
	229	ins *other_target = NULL; int other_tag = -1;
	230
9b7e0f3d SM	231	// TODOXXX line-by-line proceeds below
9b7e0f3d SM	232
07777235 SM	233	bool do_split = false;
	234
	235	if (point.i->i.tag == TAG)
	236	{
	237	target = &point.i[1];
	238	tag = (int) point.i->i.param;
	239	}
179688ce SM	240	else if (point.i->i.tag == FORK && point.i == (ins *) point.i->i.link)
	241	{
	242	/* Workaround for a FORK that points to itself: */
	243	target = &point.i[1];
	244	}
07777235 SM	245	else if (point.i->i.tag == FORK)
	246	{
	247	do_split = true;
	248	// Relative priority of two e-transitions depends on param:
	249	if (point.i->i.param)
	250	{
	251	// Prefer jumping to link.
	252	target = &point.i[1];
	253	other_target = (ins *) point.i->i.link;
	254	}
	255	else
	256	{
	257	// Prefer stepping to next instruction.
	258	target = (ins *) point.i->i.link;
	259	other_target = &point.i[1];
	260	}
	261	}
	262	else if (point.i->i.tag == GOTO)
	263	{
	264	target = (ins *) point.i->i.link;
	265	}
	266	else if (point.i->i.tag == INIT && is_initial)
	267	{
	268	target = &point.i[1];
	269	}
	270
179688ce SM	271	bool already_found;
179688ce SM	272
07777235 SM	273	// Data for the endpoint of the first transition:
	274	kernel_point next;
	275	next.i = target;
	276	next.priority = do_split ? refine_lower(point.priority) : point.priority;
	277	next.map_items = point.map_items;
	278
9b7e0f3d SM	279	// Date for the endpoint of the second transition:
	280	kernel_point other_next;
	281	other_next.i = other_target;
	282	other_next.priority = do_split ? refine_higher(point.priority) : point.priority;
	283	other_next.map_items = point.map_items;
	284
179688ce SM	285	// Do infinite-loop-check:
	286	other_next.parents = point.parents;
	287	if (point.parents.find(other_next.i) != point.parents.end())
	288	{
	289	other_target = NULL;
	290	other_tag = -1;
	291	}
	292	other_next.parents.insert(other_next.i);
	293
	294	next.parents = point.parents;
	295	if (point.parents.find(next.i) != point.parents.end())
	296	{
	297	target = NULL;
	298	tag = -1;
	299	goto next_target;
	300	}
	301	next.parents.insert(next.i);
	302
07777235	303	another_transition:
9b7e0f3d SM	304	if (target == NULL)
9b7e0f3d SM	305	continue;
07777235 SM	306
	307	// Deal with the current e-transition:
	308
	309	if (tag >= 0)
	310	{
	311	/* Delete all existing next.map_items of the form m[tag,x]. */
	312	for (list<map_item>::iterator it = next.map_items.begin();
	313	it != next.map_items.end(); it++)
	314	if (it->first == (unsigned) tag)
	315	next.map_items.erase (it);
	316
	317	/* Add m[tag,x] to next.map_items, where x is the smallest
	318	nonnegative integer such that m[tag,x] does not occur
	319	anywhere in closure. Then update the cache. */
	320	unsigned x = max_tags[tag];
	321	next.map_items.push_back(make_pair(tag, ++x));
	322	max_tags[tag] = x;
	323	}
	324
179688ce	325	already_found = false;
07777235 SM	326
	327	/* Deal with similar transitions that have a different priority. */
	328	for (list<kernel_point>::iterator it = closure_map[next.i].begin();
248f3856	329	it != closure_map[next.i].end(); )
07777235 SM	330	{
07777235 SM	331	int result = arc_compare(it->priority, next.priority);
179688ce	332
07777235	333	if (result > 0) {
248f3856 SM	334	// obnoxious shuffle to avoid iterator invalidation
	335	list<kernel_point>::iterator old_it = it;
	336	it++;
	337	closure_map[next.i].erase(old_it);
	338	continue;
07777235 SM	339	} else if (result == 0) {
	340	already_found = true;
	341	}
248f3856	342	it++;
07777235 SM	343	}
	344
	345	if (!already_found) {
07777235 SM	346	// Store the element in relevant caches:
07777235 SM	347
248f3856 SM	348	closure_map[next.i].push_back(next);
248f3856 SM	349
07777235 SM	350	for (list<map_item>::iterator jt = next.map_items.begin();
	351	jt != next.map_items.end(); jt++)
	352	max_tags[jt->first] = max(jt->second, max_tags[jt->first]);
	353
248f3856 SM	354	// Store the element in closure:
	355	closure->push_back(next);
	356	worklist.push(next);
07777235 SM	357	}
07777235 SM	358
179688ce	359	next_target:
07777235	360	// Now move to dealing with the second e-transition, if any.
07777235 SM	361	target = other_target; other_target = NULL;
07777235 SM	362	tag = other_tag; other_tag = -1;
9b7e0f3d	363	next = other_next;
07777235 SM	364
	365	goto another_transition;
	366	}
	367
	368	return closure;
	369	}
	370
	371	/* Find the set of reordering commands (if any) that will get us from
	372	state s to some existing state in the dfa (returns the state in
	373	question, appends reordering commands to r). Returns NULL is no
	374	suitable state is found. */
	375	state *
	376	dfa::find_equivalent (state *s, tdfa_action &r)
	377	{
	378	state *answer = NULL;
	379
	380	for (state_kernel::iterator it = s->kernel->begin();
	381	it != s->kernel->end(); it++)
9b7e0f3d	382	mark(it->i);
8d2eb09e	383
07777235 SM	384	/* Check kernels of existing states for size equivalence and for
	385	unmarked items (similar to re2c's original algorithm): */
	386	unsigned n = s->kernel->size();
	387	for (state *t = first; t != NULL; t = t->next)
	388	{
	389	if (t->kernel->size() == n)
	390	{
	391	for (state_kernel::iterator it = t->kernel->begin();
	392	it != t->kernel->end(); it++)
8d2eb09e SM	393	if (!marked(it->i))
8d2eb09e SM	394	goto next_state;
07777235	395
8d2eb09e SM	396	// TODOXXX check for existence of reordering r
	397	answer = t;
	398	goto cleanup;
07777235	399	}
8d2eb09e SM	400	next_state:
8d2eb09e SM	401	;
07777235 SM	402	}
	403
	404	cleanup:
	405	for (state_kernel::iterator it = s->kernel->begin();
	406	it != s->kernel->end(); it++)
	407	unmark(it->i);
	408
	409	return answer;
	410	}
	411
	412
	413	dfa::dfa (ins *i, int ntags, vector<string>& outcome_snippets)
	414	: orig_nfa(i), nstates(0), ntags(ntags), outcome_snippets(outcome_snippets)
	415	{
	416	/* Initialize empty linked list of states: */
	417	first = last = NULL;
	418
	419	ins *start = &i[0];
9f232afb	420	state_kernel *initial_kernel = te_closure(make_kernel(start), ntags, true);
248f3856 SM	421	state *initial = add_state(new state(initial_kernel));
248f3856 SM	422	queue<state *> worklist; worklist.push(initial);
07777235 SM	423
	424	while (!worklist.empty())
	425	{
248f3856	426	state *curr = worklist.front(); worklist.pop();
07777235 SM	427
	428	vector<list<ins *> > edges(NUM_REAL_CHARS);
	429
	430	/* Using the CHAR instructions in kernel, build the initial
	431	table of spans for curr. Also check for final states. */
	432
248f3856 SM	433	for (list<kernel_point>::iterator it = curr->kernel->begin();
248f3856 SM	434	it != curr->kernel->end(); it++)
07777235 SM	435	{
	436	if (it->i->i.tag == CHAR)
	437	{
	438	for (ins j = &it->i[1]; j < (ins ) it->i->i.link; j++)
	439	edges[j->c.value].push_back((ins *) it->i->i.link);
	440	}
	441	else if (it->i->i.tag == ACCEPT)
	442	{
	443	/* Always prefer the highest numbered outcome: */
	444	curr->accepts = true;
	445	curr->accept_outcome = max(it->i->i.param, curr->accept_outcome);
	446	}
	447	}
	448
aaed700d	449	for (unsigned c = 0; c < NUM_REAL_CHARS; )
07777235 SM	450	{
	451	list <ins *> e = edges[c];
	452	assert (!e.empty()); // XXX: ensured by fail_re in stapregex_compile
	453
	454	span s;
	455
	456	s.lb = c;
	457
	458	while (++c < NUM_REAL_CHARS && edges[c] == e) ;
	459
248f3856	460	s.ub = c - 1;
07777235 SM	461
	462	s.reach_pairs = new state_kernel;
	463
	464	for (list<ins *>::iterator it = e.begin();
	465	it != e.end(); it++)
	466	add_kernel (s.reach_pairs, *it);
	467
	468	curr->spans.push_back(s);
	469	}
	470
	471	/* For each of the spans in curr, determine the reachable
	472	points assuming a character in the span. */
	473	for (list<span>::iterator it = curr->spans.begin();
	474	it != curr->spans.end(); it++)
	475	{
	476	state_kernel *reach_pairs = it->reach_pairs;
	477
	478	/* Set up candidate target state: */
	479	state_kernel *u_pairs = te_closure(reach_pairs, ntags);
	480	state *target = new state(u_pairs);
	481	tdfa_action c;
	482
	483	/* Generate position-save commands for any map items
248f3856	484	that do not appear in curr->kernel: */
07777235 SM	485
07777235 SM	486	set<map_item> all_items;
248f3856 SM	487	for (state_kernel::const_iterator jt = curr->kernel->begin();
248f3856 SM	488	jt != curr->kernel->end(); jt++)
07777235 SM	489	for (list<map_item>::const_iterator kt = jt->map_items.begin();
	490	kt != jt->map_items.end(); jt++)
	491	all_items.insert(*kt);
	492
	493	list<map_item> store_items;
	494	for (state_kernel::const_iterator jt = u_pairs->begin();
	495	jt != u_pairs->end(); jt++)
	496	for (list<map_item>::const_iterator kt = jt->map_items.begin();
	497	kt != jt->map_items.end(); kt++)
	498	if (all_items.find(*kt) == all_items.end())
	499	store_items.push_back(*kt);
	500
	501	for (list<map_item>::iterator jt = store_items.begin();
	502	jt != store_items.end(); jt++)
	503	{
	504	// append m[i,n] <- <curr position> to c
	505	tdfa_insn insn;
	506	insn.to = *jt;
	507	insn.save_pos = true;
	508	c.push_back(insn);
	509	}
	510
	511	/* If there is a state t_prime in states such that some
	512	sequence of reordering commands r produces t_prime
	513	from target, use t_prime as the target state,
	514	appending the reordering commands to c. */
	515	state *t_prime = find_equivalent(target, c);
9b7e0f3d	516	if (t_prime != NULL)
248f3856	517	{
07777235 SM	518	delete target;
	519	}
	520	else
	521	{
	522	/* We need to actually add target to the dfa: */
	523	t_prime = target;
	524	add_state(t_prime);
248f3856	525	worklist.push(t_prime);
07777235 SM	526
	527	if (t_prime->accepts)
	528	{
	529	// TODOXXX set the finisher of t_prime
	530	}
	531	}
	532
	533	/* Set the transition: */
	534	it->to = t_prime;
	535	it->action = c;
	536	}
	537	}
	538	}
	539
	540	dfa::~dfa ()
	541	{
	542	state * s;
	543	while ((s = first))
	544	{
	545	first = s->next;
	546	delete s;
	547	}
	548
	549	delete orig_nfa;
	550	}
	551
	552	// ------------------------------------------------------------------------
	553
54f92ee4 SM	554	// TODOXXX add emission instructions for tag_ops
	555
	556	void
	557	span::emit_jump (translator_output o, const dfa d) const
	558	{
	559	o->newline () << "printf(\" --> GOTO yystate%d\\n\", " << to->label << ");";
	560
	561	// TODOXXX tags feature allows proper longest-match priority
	562	if (to->accepts)
	563	{
	564	emit_final(o, d);
	565	}
	566	else
	567	{
	568	o->newline () << "YYCURSOR++;";
	569	o->newline () << "goto yystate" << to->label << ";";
	570	}
	571	}
	572
	573	/* Assuming the target DFA of the span is a final state, emit code to
	574	(TODOXXX) cleanup tags and exit with a final answer. */
	575	void
	576	span::emit_final (translator_output o, const dfa d) const
	577	{
	578	assert (to->accepts); // XXX: must guarantee correct usage of emit_final()
	579	o->newline() << d->outcome_snippets[to->accept_outcome];
	580	o->newline() << "goto yyfinish;";
	581	}
	582
	583	string c_char(char c)
	584	{
	585	stringstream o;
	586	o << "'";
	587	print_escaped(o, c);
	588	o << "'";
	589	return o.str();
	590	}
	591
	592	void
	593	state::emit (translator_output o, const dfa d) const
	594	{
	595	o->newline() << "yystate" << label << ": ";
	596	o->newline () << "printf(\"READ '%s' %c\", cur, *YYCURSOR);";
	597	o->newline() << "switch (*YYCURSOR) {";
	598	o->indent(1);
	599	for (list<span>::const_iterator it = spans.begin();
	600	it != spans.end(); it++)
	601	{
	602	// If we see a '\0', go immediately into an accept state:
	603	if (it->lb == '\0')
	604	{
	605	o->newline() << "case " << c_char('\0') << ":";
	606	it->emit_final(o, d); // TODOXXX extra function may be unneeded
	607	}
	608
	609	// Emit labels to handle all the other elements of the span:
	610	for (unsigned c = max('\1', it->lb); c <= it->ub; c++) {
	611	o->newline() << "case " << c_char((char) c) << ":";
	612	}
	613	it->emit_jump(o, d);
	614
	615	// TODOXXX handle a 'default' set of characters for the largest span...
	616	// TODOXXX optimize by accepting before end of string whenever possible... (also necessary for proper first-matched-substring selection)
	617	}
618	o->newline(-1) << "}";
619	}
620
07777235 SM	621	void
	622	dfa::emit (translator_output *o) const
	623	{
54f92ee4	624	#ifdef STAPREGEX_DEBUG_DFA
07777235	625	print(o);
54f92ee4 SM	626	#else
	627	o->newline() << "{";
	628	o->newline(1);
	629	for (state *s = first; s; s = s->next)
	630	s->emit(o, this);
	631	o->newline() << "yyfinish: ;";
	632	o->newline(-1) << "}";
	633	#endif
07777235 SM	634	}
	635
	636	void
	637	dfa::emit_tagsave (translator_output *o, std::string tag_states,
	638	std::string tag_vals, std::string tag_count) const
	639	{
	640	// TODOXXX implement after testing the preceding algorithms
	641	}
	642
	643	// ------------------------------------------------------------------------
	644
	645	std::ostream&
	646	operator << (std::ostream &o, const tdfa_action& a)
	647	{
	648	for (list<tdfa_insn>::const_iterator it = a.begin();
	649	it != a.end(); it++)
	650	{
	651	if (it != a.begin()) o << "; ";
	652
	653	o << "m[" << it->to.first << "," << it->to.second << "] <- ";
	654
	655	if (it->save_pos)
	656	o << "p";
	657	else
	658	o << "m[" << it->from.first << "," << it->from.second << "]";
	659	}
	660
	661	return o;
	662	}
	663
248f3856 SM	664	std::ostream&
	665	operator << (std::ostream &o, const arc_priority& p)
	666	{
	667	o << p.first << "/" << (1 << p.second);
	668	return o;
	669	}
	670
07777235 SM	671	void
	672	state::print (translator_output *o) const
	673	{
	674	o->line() << "state " << label;
	675	if (accepts)
	676	o->line() << " accepts " << accept_outcome;
	677	if (!finalizer.empty())
	678	o->line() << " [" << finalizer << "]";
	679
	680	o->indent(1);
	681	for (list<span>::const_iterator it = spans.begin();
	682	it != spans.end(); it++)
	683	{
248f3856	684	o->newline() << "'";
07777235	685	if (it->lb == it->ub)
248f3856 SM	686	{
	687	print_escaped (o->line(), it->lb);
	688	o->line() << " ";
	689	}
07777235	690	else
248f3856 SM	691	{
	692	print_escaped (o->line(), it->lb);
	693	o->line() << "-";
	694	print_escaped (o->line(), it->ub);
	695	}
07777235	696
248f3856	697	o->line() << "' -> " << it->to->label;
07777235 SM	698
	699	if (!it->action.empty())
	700	o->line() << " [" << it->action << "]";
	701	}
	702	o->newline(-1);
	703	}
	704
	705	void
	706	dfa::print (std::ostream& o) const
	707	{
	708	translator_output to(o); print(&to);
	709	}
	710
	711	void
	712	dfa::print (translator_output *o) const
	713	{
248f3856	714	o->newline();
07777235 SM	715	for (state *s = first; s; s = s->next)
	716	{
	717	s->print(o);
	718	o->newline();
	719	}
248f3856	720	o->newline();
07777235 SM	721	}
	722
	723	std::ostream&
	724	operator << (std::ostream& o, const dfa& d)
	725	{
	726	d.print(o);
	727	return o;
	728	}
	729
	730	std::ostream&
	731	operator << (std::ostream &o, const dfa *d)
	732	{
	733	o << *d;
	734	return o;
	735	}
	736
	737	};
	738
	739	/* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */