[systemtap.git] / stapregex-parse.cxx

// -*- C++ -*-
// Copyright (C) 2012-2013 Red Hat Inc.
//
// This file is part of systemtap, and is free software.  You can
// redistribute it and/or modify it under the terms of the GNU General
// Public License (GPL); either version 2, or (at your option) any
// later version.
//
// ---
//
// This file incorporates code from the re2c project; please see
// the file README.stapregex for details.

#include "util.h"

#include "stapregex-tree.h"
#include "stapregex-parse.h"

#include <cstdlib>
#include <cstring>
#include <string>

using namespace std;

namespace stapregex {

// TODOXXX compress / eliminate / move to util

// void prtChOrHex(std::ostream& o, unsigned c)
// {
// 	if (eFlag)
// 	{
// 		prtHex(o, c);
// 	}
// 	else if ((c < 256u) && (isprint(c) || isspace(c)))
// 	{
// 		prtCh(o, c);
// 	}
// 	else
// 	{
// 		prtHex(o, c);
// 	}
// }

// void prtHex(std::ostream& o, unsigned c)
// {
// 	int oc = (int)(c);

// 	if (re2c::uFlag)
// 	{
// 		o << "0x"
// 		  << hexCh(oc >> 28)
// 		  << hexCh(oc >> 24)
// 		  << hexCh(oc >> 20)
// 		  << hexCh(oc >> 16)
// 		  << hexCh(oc >> 12)
// 		  << hexCh(oc >>  8)
// 		  << hexCh(oc >>  4)
// 		  << hexCh(oc);
// 	}
// 	else if (re2c::wFlag)
// 	{
// 		o << "0x"
// 		  << hexCh(oc >> 12)
// 		  << hexCh(oc >>  8)
// 		  << hexCh(oc >>  4)
// 		  << hexCh(oc);
// 	}
// 	else
// 	{
// 		o << "0x"
// 		  << hexCh(oc >>  4) 
// 		  << hexCh(oc);
// 	}
// }

/* Return the octal digit character for the low three bits of C. */
char octCh(unsigned c)
{
	return '0' + c % 8;
}

/* Write C to O the way it would be spelled inside a C character or
   string literal: quotes, backslash and the common control characters
   use their mnemonic escapes, other printable values below 256 are
   written verbatim, and everything else becomes a backslash followed
   by three octal digits (so only the low nine bits of C are shown). */
void prtCh(std::ostream& o, unsigned c)
{
	switch (c)
	{
		case '\'':
		o << "\\'";
		break;

		case '"':
		o << "\\\"";
		break;

		case '\n':
		o << "\\n";
		break;

		case '\t':
		o << "\\t";
		break;

		case '\v':
		o << "\\v";
		break;

		case '\b':
		o << "\\b";
		break;

		case '\r':
		o << "\\r";
		break;

		case '\f':
		o << "\\f";
		break;

		case '\a':
		o << "\\a";
		break;

		case '\\':
		o << "\\\\";
		break;

		default:

		// Range-check the unsigned value before calling isprint():
		// its argument must be representable as unsigned char (or
		// be EOF), so a value converted to a negative int must never
		// reach it.
		if ((c < 256u) && isprint((int) c))
		{
			o << (char) c;
		}
		else
		{
			o << '\\' << octCh(c / 64) << octCh(c / 8) << octCh(c);
		}
	}
}

/* Escape a single char C onto O.  The char is converted through
   unsigned char first so that bytes with the high bit set are printed
   as their byte value (e.g. \377) instead of being sign-extended into
   a huge unsigned value. */
void print_escaped(std::ostream& o, char c)
{
  prtCh(o, (unsigned char) c);
}

// ------------------------------------------------------------------------

// Default-construct a cursor that is not attached to any input string;
// both position fields start out as ~0 and no character is buffered.
cursor::cursor()
  : input(NULL),
    do_unescape(false),
    pos(~0),
    last_pos(~0),
    finished(false),
    next_c(0),
    last_c(0)
{
}

// Attach a cursor to *INPUT, positioned at its first character.  The
// cursor is already `finished' when the string is empty (position 0 is
// then at or past the end).  DO_UNESCAPE is stored for get_unescaped().
cursor::cursor(const std::string *input, bool do_unescape)
  : input(input), do_unescape(do_unescape), pos(0), last_pos(0),
    finished(input->empty()), next_c(0), last_c(0)
{
}

// Consume and return the next character.  If nothing is buffered in
// next_c, one character is fetched via get_unescaped(); when the input
// is already exhausted a regex_error is thrown instead.
char
cursor::next ()
{
  if (next_c == 0)
    {
      if (finished)
        throw regex_error(_("unexpected end of regex"), pos);
      get_unescaped();
    }

  // hand out the buffered character and clear the buffer, which is
  // what makes the cursor advance
  const char consumed = next_c;
  last_c = consumed;
  next_c = 0;
  return consumed;
}

// Return the next character without consuming it.  A character is
// fetched into next_c only if none is buffered and input remains, so
// the result is 0 once the input is exhausted.
char
cursor::peek ()
{
  const bool need_fetch = (next_c == 0) && !finished;
  if (need_fetch)
    get_unescaped();

  // next_c is left set, so the cursor does not advance
  last_c = next_c;
  return last_c;
}

// Return true when pos is no greater than (input length - N).  The
// subtraction is done on unsigned values, so N larger than the whole
// input has to be rejected first; otherwise the difference wraps
// around and the comparison is always true.
bool
cursor::has (unsigned n)
{
  const std::string::size_type len = input->length();
  if (n > len)
    return false;
  return ( pos <= len - n );
}

/* Systemtap doesn't unescape string literals for us, presuming to
   pass the backslashes intact to a C compiler; hence we need to do
   our own unescaping here.

   This functionality needs to be handled as part of cursor, in order
   to correctly retain the original positions in the string when doing
   error reporting. */
// Fetch the character starting at pos into next_c, decoding a backslash
// escape when do_unescape is set, then advance pos past everything
// consumed and refresh `finished'.  last_pos records where the
// character started.  Throws regex_error on a truncated or malformed
// escape sequence.
void
cursor::get_unescaped ()
{
  static const char *hex = "0123456789abcdef";
  static const char *oct = "01234567";

  last_pos = pos;
  char c = (*input)[pos];

  // Ordinary character, or unescaping disabled: pass it through as-is.
  if (c != '\\' || !do_unescape)
    {
      next_c = c;
      pos++;
      finished = ( pos >= input->length() );
      return;
    }

  pos++; // step over the backslash

  /* Check for improper string end: */
  if (pos >= input->length())
    throw regex_error(_("unexpected end of regex"), pos);

  /* The logic is based on re2c's Scanner::unescape() method;
     the set of accepted escape codes should correspond to
     lexer::scan() in parse.cxx. */
  c = (*input)[pos];
  switch (c)
    {
    case 'a': c = '\a'; break;
    case 'b': c = '\b'; break;
    case 't': c = '\t'; break;
    case 'n': c = '\n'; break;
    case 'v': c = '\v'; break;
    case 'f': c = '\f'; break;
    case 'r': c = '\r'; break;

    case 'x':
      {
      // The input is at least two characters long here (backslash plus
      // the escape letter), so the subtraction cannot wrap.
      if (pos >= input->length() - 2)
        throw regex_error(_("two hex digits required in escape sequence"), pos);

      // A NUL byte is rejected explicitly, since strchr() would match
      // the terminator of the digit table.  The unsigned char cast
      // keeps tolower() away from negative arguments.
      const char h1 = (*input)[pos+1];
      const char h2 = (*input)[pos+2];
      const char *d1 = h1 ? strchr(hex, tolower((unsigned char) h1)) : NULL;
      const char *d2 = h2 ? strchr(hex, tolower((unsigned char) h2)) : NULL;

      // NOTE(review): the offset is +1 when the second digit is the bad
      // one and +2 when the first is -- this looks inverted; confirm
      // the intended position convention before changing it.
      if (!d1 || !d2)
        throw regex_error(_("two hex digits required in escape sequence"), pos + (d1 ? 1 : 2));

      c = (char)((d1-hex) << 4) + (char)(d2-hex);
      pos += 2; // skip two chars more than usual
      break;
      }
    case '4' ... '7':
      // XXX: perhaps perform error recovery (slurp 3 octal chars)?
      throw regex_error(_("octal escape sequence out of range"), pos);

    case '0' ... '3':
      {
      if (pos >= input->length() - 2)
        throw regex_error(_("three octal digits required in escape sequence"), pos);

      // The first digit is c itself ('0'..'3'); the next two must not
      // be NUL, which strchr() would otherwise accept as a digit.
      const char o1 = (*input)[pos+1];
      const char o2 = (*input)[pos+2];
      const char *d0 = strchr(oct, (*input)[pos]);
      const char *d1 = o1 ? strchr(oct, o1) : NULL;
      const char *d2 = o2 ? strchr(oct, o2) : NULL;

      // NOTE(review): same position offset as the hex case above --
      // possibly inverted; confirm.
      if (!d0 || !d1 || !d2)
        throw regex_error(_("three octal digits required in escape sequence"), pos + (d1 ? 1 : 2));

      c = (char)((d0-oct) << 6) + (char)((d1-oct) << 3) + (char)(d2-oct);
      pos += 2; // skip two chars more than usual
      break;
      }
    default:
      // do nothing; this removes the backslash from c
      ;
    }

  next_c = c;
  pos++;
  finished = ( pos >= input->length() );
}

// ------------------------------------------------------------------------

// Parse the whole input string and return the resulting tree.  With
// DO_TAG set, tag_ops are attached around every subexpression
// (PR15065) and the total count is stored in the result's num_tags.
// Throws regex_error (via parse_error) if input is left over.
regexp *
regex_parser::parse (bool do_tag)
{
  // reset per-parse state
  this->do_tag = do_tag;
  num_tags = 0;
  cur = cursor(&input, do_unescape);

  regexp *tree = parse_expr ();

  // PR15065 glom appropriate tag_ops onto the expr (subexpression 0)
  if (do_tag)
    {
      regexp *opened = new cat_op(new tag_op(num_tags++), tree);
      tree = new cat_op(opened, new tag_op(num_tags++));
    }

  if (! cur.finished)
    {
      const char leftover = cur.peek ();
      if (leftover == ')')
        parse_error (_("unbalanced ')'"), cur.pos);
      else
        // This should not be possible:
        parse_error ("BUG -- regex parse failed to finish for unknown reasons", cur.pos);
    }

  // PR15065 store num_tags in result
  tree->num_tags = num_tags;
  return tree;
}

// Return true when C is one of the regex metacharacters
// . [ { ( ) \ * + ? | ^ $
bool
regex_parser::isspecial (char c)
{
  switch (c)
    {
    case '.': case '[': case '{': case '(': case ')':
    case '\\': case '*': case '+': case '?': case '|':
    case '^': case '$':
      return true;
    default:
      return false;
    }
}

void
regex_parser::expect (char expected)
{
  char c = 0;
  try {
    c = cur.next ();
  } catch (const regex_error &e) {
    parse_error (_F("expected %c, found end of regex", expected));
  }

  if (c != expected)
    parse_error (_F("expected %c, found %c", expected, c));
}

// Report a parse failure: throws a regex_error built from MSG and the
// input position POS.  Never returns normally.
void
regex_parser::parse_error (const string& msg, unsigned pos)
{
  throw regex_error(msg, pos);
}

void
regex_parser::parse_error (const string& msg)
{
  parse_error (msg, cur.last_pos);
}

// ------------------------------------------------------------------------

// expr ::= term ( '|' term )*
// Parse one term, then fold in a further alternative for each '|'.
regexp *
regex_parser::parse_expr ()
{
  regexp *alternation = parse_term ();

  while (cur.peek () == '|')
    {
      cur.next (); // swallow the '|'
      alternation = make_alt (alternation, parse_term ());
    }

  return alternation;
}

// term ::= factor factor*
// Concatenate factors until the end of input, a '|' or a ')' is seen.
regexp *
regex_parser::parse_term ()
{
  regexp *sequence = parse_factor ();

  for (char c = cur.peek (); c != 0 && c != '|' && c != ')'; c = cur.peek ())
    sequence = new cat_op(sequence, parse_factor ());

  return sequence;
}

// Parse one factor: a single atom -- '.', a bracket class, a
// parenthesized group, an anchor, or a run of literal characters --
// followed by any number of postfix closure operators ('*', '+', '?',
// '{m}', '{m,}', '{m,n}').  Returns a null_op without consuming
// anything when the next character is end of input, '|' or ')'.
// Errors are raised through parse_error(), which throws.
regexp *
regex_parser::parse_factor ()
{
  regexp *result;
  // literal prefix that must stay outside a closure (see below)
  regexp *old_result = NULL;

  char c = cur.peek ();
  if (! c || c == '|' || c == ')')
    {
      result = new null_op;
      return result;
    }
  else if (c == '*' || c == '+' || c == '?' || c == '{')
    {
      // a closure operator with nothing in front of it
      parse_error(_F("unexpected '%c'", c));
    }

  if (isspecial (c) && c != '\\')
    cur.next (); // c is guaranteed to be swallowed

  if (c == '.')
    {
      result = make_dot ();
    }
  else if (c == '[')
    {
      result = parse_char_range ();
      expect (']');
    }
  else if (c == '(')
    {
      result = parse_expr ();

      // PR15065 glom appropriate tag_ops onto the expr
      if (do_tag) {
        result = new cat_op(new tag_op(num_tags++), result);
        result = new cat_op(result, new tag_op(num_tags++));
      } else {
        // XXX: workaround for certain error checking test cases which
        // would otherwise produce divergent behaviour
        // (e.g. "^*" vs "(^)*").
        result = new cat_op(result, new null_op);
      }

      expect (')');
    }
  else if (c == '^' || c == '$')
    {
      result = new anchor_op(c);
    }
  else // escaped or ordinary character -- not yet swallowed
    {
      // Collect a run of literal characters.  c is the character under
      // consideration (peeked, not yet consumed); d is used to look one
      // character further ahead.
      string accumulate;
      char d = 0;

      while (c && ( ! isspecial (c) || c == '\\' ))
        {
          if (c == '\\')
            {
              // drop the backslash; the character after it is taken
              // literally, whatever it is
              cur.next ();
              c = cur.peek ();
            }

          cur.next ();
          d = cur.peek ();

          /* if we end in a closure, it should only govern the last character */
          if (d == '*' || d == '+' || d == '?' || d == '{')
            {
              /* save the last character */
              d = c; break;
            }

          accumulate.push_back (c);
          c = d; d = 0;
        }

      result = str_to_re (accumulate);

      /* separately deal with the last character before a closure */
      if (d != 0) {
        old_result = result; /* will add it back outside closure at the end */
        result = str_to_re (string(1,d));
      }
    }

  /* parse closures or other postfix operators */
  c = cur.peek ();
  while (c == '*' || c == '+' || c == '?' || c == '{')
    {
      cur.next ();

      /* closure-type operators applied to $^ are definitely not kosher */
      if (result->type_of() == "anchor_op")
        {
          parse_error(_F("postfix closure '%c' applied to anchoring operator", c));
        }

      if (c == '*')
        {
          result = make_alt (new close_op(result), new null_op);
        }
      else if (c == '+')
        {
          result = new close_op(result);
        }
      else if (c == '?')
        {
          result = make_alt (result, new null_op);
        }
      else if (c == '{')
        {
          // {m}, {m,} or {m,n}; maxsize == -1 stands for "no upper
          // bound" (it is handled like '*' / '+' below)
          int minsize = parse_number ();
          int maxsize = -1;

          // from here on c holds the character after the first number;
          // it is refreshed from cur.peek() at the bottom of the loop
          c = cur.next ();
          if (c == ',')
            {
              c = cur.peek ();
              if (c == '}')
                {
                  cur.next ();
                  maxsize = -1;
                }
              else if (isdigit (c))
                {
                  maxsize = parse_number ();
                  expect ('}');
                }
              else
                parse_error(_("expected '}' or number"), cur.pos);
            }
          else if (c == '}')
            {
              maxsize = minsize;
            }
          else
            parse_error(_("expected ',' or '}'"));

          /* optimize {0,0}, {0,} and {1,} */
          if (!do_tag && minsize == 0 && maxsize == 0)
            {
              // XXX: this optimization is only used when
              // subexpression-extraction is disabled
              delete result;
              result = new null_op;
            }
          else if (minsize == 0 && maxsize == -1)
            {
              // same construction as '*'
              result = make_alt (new close_op(result), new null_op);
            }
          else if (minsize == 1 && maxsize == -1)
            {
              // same construction as '+'
              result = new close_op(result);
            }
          else
            {
              result = new closev_op(result, minsize, maxsize);
            }
        }
      
      c = cur.peek ();
    }

  // re-attach the literal prefix that was kept out of the closure
  if (old_result)
    result = new cat_op(old_result, result);

  return result;
}

// Parse the inside of a bracket expression (the opening '[' has already
// been consumed; the closing ']' is left for the caller).  Elements are
// read one at a time with stapregex_getrange() and unioned together; a
// leading '^' inverts the final set.
regexp *
regex_parser::parse_char_range ()
{
  // check for inversion
  const bool negated = (cur.peek () == '^');
  if (negated)
    cur.next ();

  range *acc = NULL;
  do
    {
      // running off the end of the string means the class was never closed
      if (cur.finished) parse_error(_("unclosed character class")); // TODOXXX doublecheck that this is triggered correctly

      range *piece = stapregex_getrange (cur);
      if (acc == NULL)
        {
          acc = piece;
        }
      else
        {
          range *merged = range_union(acc, piece);
          delete acc;
          if (merged != piece) delete piece;
          acc = merged;
        }

      // the test comes after reading an element, so a ']' in first
      // position is read as an element rather than ending the class
    }
  while (cur.peek () != ']');

  if (negated)
    {
      range *flipped = range_invert(acc);
      delete acc;
      acc = flipped;
    }

  if (acc != NULL)
    return new match_op(acc);

  return new null_op;
}

// Consume a run of decimal digits from the cursor and return its value.
// Raises a parse error when no digit is present, when the text cannot
// be converted, or when the value is MAX_DFA_REPETITIONS or more.
unsigned
regex_parser::parse_number ()
{
  string digits;

  char c = cur.peek ();
  // isdigit() must not be handed a negative value, hence the cast
  while (c && isdigit ((unsigned char) c))
    {
      cur.next ();
      digits.push_back (c);
      c = cur.peek ();
    }

  if (digits == "") parse_error(_("expected number"), cur.pos);

  // strtol() only sets errno on failure, so clear it first; otherwise
  // a stale ERANGE from earlier code would be reported here.
  char *endptr = NULL;
  errno = 0;
  // Keep the full long result: narrowing to int before the range check
  // would let values just above a multiple of 2^32 pass as small ones.
  long val = strtol (digits.c_str (), &endptr, 10);

  if (*endptr != '\0' || errno == ERANGE) // paranoid error checking
    parse_error(_F("could not parse number %s", digits.c_str()), cur.pos);
#define MAX_DFA_REPETITIONS 12345
  if (val >= MAX_DFA_REPETITIONS) // XXX: is there a more sensible max size?
    parse_error(_F("%s is too large", digits.c_str()), cur.pos);

  // digits holds only '0'..'9', so val is non-negative and below the
  // limit checked above
  return (unsigned) val;
}

// ------------------------------------------------------------------------

// Table of named character classes, filled in on the first call to
// named_char_class().
std::map<std::string, range *> named_char_classes;

// Return a freshly allocated copy of the range registered under NAME
// (e.g. "alpha", "digit", or a one-letter alias such as "d").  Throws
// regex_error when NAME is not a known class.
range *
named_char_class (const string& name)
{
  std::map<std::string, range *>& table = named_char_classes;

  // static initialization of table
  if (table.empty())
    {
      // original source for these is http://www.regular-expressions.info/posixbrackets.html
      // also checked against (intended to match) the c stdlib isFOO() chr class functions
      table["alpha"] = new range("A-Za-z");
      table["alnum"] = new range("A-Za-z0-9");
      table["blank"] = new range(" \t");
      table["cntrl"] = new range("\x01-\x1F\x7F"); // XXX: include \x00 in range? -- probably not!

      // each one-letter alias shares the range object of its long name
      range *digits = new range("0-9");
      table["digit"] = digits;
      table["d"] = digits;

      table["xdigit"] = new range("0-9a-fA-F");
      table["graph"] = new range("\x21-\x7E");

      range *lowers = new range("a-z");
      table["lower"] = lowers;
      table["l"] = lowers;

      table["print"] = new range("\x20-\x7E");
      table["punct"] = new range("!\"#$%&'()*+,./:;<=>?@[\\]^_`{|}~-");

      range *spaces = new range(" \t\r\n\v\f");
      table["space"] = spaces;
      table["s"] = spaces;

      range *uppers = new range("A-Z");
      table["upper"] = uppers;
      table["u"] = uppers;
    }

  std::map<std::string, range *>::iterator hit = table.find(name);
  if (hit == table.end())
    throw regex_error (_F("unknown character class '%s'", name.c_str())); // XXX: position unknown

  return new range(*hit->second);
}

// Read one element of a bracket expression from CUR and return it as a
// newly allocated range.  An element is a named class "[:name:]"
// (resolved via named_char_class()), a single character, or a span
// "lb-ub".  Throws regex_error for an unterminated "[:" or for a span
// whose upper bound compares below its lower bound.
range *
stapregex_getrange (cursor& cur)
{
  char c = cur.peek ();

  if (c == '\\')
    {
      // Grab escaped char regardless of what it is.
      cur.next (); c = cur.peek (); cur.next ();
    }
  else if (c == '[')
    {
      // Check for '[:' digraph.
      char old_c = c; cur.next (); c = cur.peek ();

      if (c == ':')
        {
          cur.next (); c = cur.peek (); // skip ':'
          string charclass;

          // accumulate the class name one character at a time until
          // the closing ":]" is seen
          for (;;)
            {
              if (cur.finished)
                throw regex_error (_F("unclosed character class '[:%s'", charclass.c_str()), cur.pos);

              // c is the peeked ':'; the raw input character at cur.pos
              // is the one right after it
              if (cur.has(2) && c == ':' && (*cur.input)[cur.pos] == ']')
                {
                  cur.next (); cur.next (); // skip ':]'
                  return named_char_class(charclass);
                }

              charclass.push_back(c); cur.next(); c = cur.peek();
            }
        }
      else
        {
          // Backtrack; fall through to processing c.
          // (the '[' was consumed above; the character after it has
          // only been peeked, so it remains available)
          c = old_c;
        }
    }
  else
    cur.next ();

  // c is now the (already consumed) lower bound of the element
  char lb = c, ub;

  // A '-' only starts a span when the raw input character after it is
  // not ']'; otherwise the element is the single character lb and the
  // '-' is left unconsumed.
  if (!cur.has(2) || cur.peek () != '-' || (*cur.input)[cur.pos] == ']')
    {
      ub = lb;
    }
  else
    {
      cur.next (); // skip '-'
      ub = cur.peek ();

      // NOTE(review): lb and ub are compared as plain char; if char is
      // signed, bytes above 0x7F compare below ASCII -- confirm intent.
      if (ub < lb)
        throw regex_error (_F("Inverted character range %c-%c", lb, ub), cur.pos);

      cur.next ();
    }

  return new range(lb, ub);
}

};

/* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */
Commit	Line	Data
cd4882d7 SM	1	// -- C++ --
	2	// Copyright (C) 2012-2013 Red Hat Inc.
	3	//
	4	// This file is part of systemtap, and is free software. You can
	5	// redistribute it and/or modify it under the terms of the GNU General
	6	// Public License (GPL); either version 2, or (at your option) any
	7	// later version.
	8	//
	9	// ---
	10	//
	11	// This file incorporates code from the re2c project; please see
	12	// the file README.stapregex for details.
	13
	14	#include "util.h"
	15
	16	#include "stapregex-tree.h"
	17	#include "stapregex-parse.h"
	18
	19	#include <cstdlib>
	20	#include <cstring>
	21	#include <string>
	22
	23	using namespace std;
	24
	25	namespace stapregex {
	26
07777235 SM	27	// TODOXXX compress / eliminate / move to util
	28
	29	// void prtChOrHex(std::ostream& o, unsigned c)
	30	// {
	31	// if (eFlag)
	32	// {
	33	// prtHex(o, c);
	34	// }
	35	// else if ((c < 256u) && (isprint(c) \|\| isspace(c)))
	36	// {
	37	// prtCh(o, c);
	38	// }
	39	// else
	40	// {
	41	// prtHex(o, c);
	42	// }
	43	// }
	44
	45	// void prtHex(std::ostream& o, unsigned c)
	46	// {
	47	// int oc = (int)(c);
	48
	49	// if (re2c::uFlag)
	50	// {
	51	// o << "0x"
	52	// << hexCh(oc >> 28)
	53	// << hexCh(oc >> 24)
	54	// << hexCh(oc >> 20)
	55	// << hexCh(oc >> 16)
	56	// << hexCh(oc >> 12)
	57	// << hexCh(oc >> 8)
	58	// << hexCh(oc >> 4)
	59	// << hexCh(oc);
	60	// }
	61	// else if (re2c::wFlag)
	62	// {
	63	// o << "0x"
	64	// << hexCh(oc >> 12)
	65	// << hexCh(oc >> 8)
	66	// << hexCh(oc >> 4)
	67	// << hexCh(oc);
	68	// }
	69	// else
	70	// {
	71	// o << "0x"
	72	// << hexCh(oc >> 4)
	73	// << hexCh(oc);
	74	// }
	75	// }
	76
	77	char octCh(unsigned c)
	78	{
	79	return '0' + c % 8;
	80	}
	81
	82	void prtCh(std::ostream& o, unsigned c)
	83	{
	84	int oc = (int)(c);
	85
	86	switch (oc)
	87	{
	88	case '\'':
	89	o << "\\'";
	90	break;
91
92	case '"':
93	o << "\\\"";
94	break;
95
96	case '\n':
97	o << "\\n";
98	break;
99
100	case '\t':
101	o << "\\t";
102	break;
103
104	case '\v':
105	o << "\\v";
106	break;
107
108	case '\b':
109	o << "\\b";
110	break;
111
112	case '\r':
113	o << "\\r";
114	break;
115
116	case '\f':
117	o << "\\f";
118	break;
119
120	case '\a':
121	o << "\\a";
122	break;
123
124	case '\\':
125	o << "\\\\";
126	break;
127
128	default:
129
130	if ((oc < 256) && isprint(oc))
131	{
132	o << (char) oc;
133	}
134	else
135	{
136	o << '\\' << octCh(oc / 64) << octCh(oc / 8) << octCh(oc);
137	}
138	}
139	}
140
141	void print_escaped(std::ostream& o, char c)
142	{
143	prtCh(o, c);
144	}
145
146	// ------------------------------------------------------------------------
147
c92d3b42	148	cursor::cursor() : input(NULL), do_unescape(false), pos(~0),
82c6d474	149	last_pos(~0), finished(false), next_c(0), last_c(0) {}
e5fcd199	150
28a27de2	151	cursor::cursor(const std::string *input, bool do_unescape)
82c6d474	152	: input(input), do_unescape(do_unescape), pos(0), last_pos(0), finished(false)
cd4882d7 SM	153	{
	154	next_c = 0; last_c = 0;
	155	finished = ( pos >= input->length() );
	156	}
	157
	158	char
	159	cursor::next ()
	160	{
	161	if (! next_c && finished)
e5fcd199	162	throw regex_error(_("unexpected end of regex"), pos);
cd4882d7	163	if (! next_c)
e5fcd199	164	get_unescaped();
cd4882d7 SM	165
	166	last_c = next_c;
	167	// advance by zeroing next_c
	168	next_c = 0;
	169
	170	return last_c;
	171	}
	172
	173	char
	174	cursor::peek ()
	175	{
	176	if (! next_c && ! finished)
	177	get_unescaped();
	178
	179	// don't advance by zeroing next_c
	180	last_c = next_c;
	181
	182	return next_c;
	183	}
	184
40fd16cf SM	185	bool
	186	cursor::has (unsigned n)
	187	{
e5fcd199	188	return ( pos <= input->length() - n );
40fd16cf SM	189	}
40fd16cf SM	190
cd4882d7 SM	191	/* Systemtap doesn't unescape string literals for us, presuming to
	192	pass the backslashes intact to a C compiler; hence we need to do
	193	our own unescaping here.
	194
	195	This functionality needs to be handled as part of cursor, in order
	196	to correctly retain the original positions in the string when doing
	197	error reporting. */
	198	void
	199	cursor::get_unescaped ()
	200	{
	201	static const char *hex = "0123456789abcdef";
	202	static const char *oct = "01234567";
	203
	204	last_pos = pos;
	205	char c = (*input)[pos];
	206
	207	if (c != '\\' \|\| !do_unescape)
	208	{
	209	next_c = c;
	210	pos++;
248f3856	211	finished = ( pos >= input->length() );
cd4882d7 SM	212	return;
	213	}
	214
40fd16cf SM	215	pos++;
	216
	217	/* Check for improper string end: */
	218	if (pos >= input->length())
	219	throw regex_error(_("unexpected end of regex"), pos);
	220
cd4882d7 SM	221	/* The logic is based on re2c's Scanner::unescape() method;
	222	the set of accepted escape codes should correspond to
	223	lexer::scan() in parse.cxx. */
40fd16cf	224	c = (*input)[pos];
cd4882d7 SM	225	switch (c)
	226	{
	227	case 'a': c = '\a'; break;
	228	case 'b': c = '\b'; break;
	229	case 't': c = '\t'; break;
	230	case 'n': c = '\n'; break;
	231	case 'v': c = '\v'; break;
	232	case 'f': c = '\f'; break;
	233	case 'r': c = '\r'; break;
	234
	235	case 'x':
e5fcd199	236	{
cd4882d7 SM	237	if (pos >= input->length() - 2)
	238	throw regex_error(_("two hex digits required in escape sequence"), pos);
	239
	240	const char d1 = strchr(hex, tolower((input)[pos+1]));
	241	const char d2 = strchr(hex, tolower((input)[pos+2]));
	242
	243	if (!d1 \|\| !d2)
	244	throw regex_error(_("two hex digits required in escape sequence"), pos + (d1 ? 1 : 2));
	245
e5fcd199	246	c = (char)((d1-hex) << 4) + (char)(d2-hex);
cd4882d7 SM	247	pos += 2; // skip two chars more than usual
cd4882d7 SM	248	break;
e5fcd199	249	}
cd4882d7 SM	250	case '4' ... '7':
	251	// XXX: perhaps perform error recovery (slurp 3 octal chars)?
	252	throw regex_error(_("octal escape sequence out of range"), pos);
	253
	254	case '0' ... '3':
e5fcd199	255	{
cd4882d7 SM	256	if (pos >= input->length() - 2)
	257	throw regex_error(_("three octal digits required in escape sequence"), pos);
	258
	259	const char d0 = strchr(oct, (input)[pos]);
	260	const char d1 = strchr(oct, (input)[pos+1]);
	261	const char d2 = strchr(oct, (input)[pos+2]);
	262
	263	if (!d0 \|\| !d1 \|\| !d2)
	264	throw regex_error(_("three octal digits required in escape sequence"), pos + (d1 ? 1 : 2));
	265
	266	c = (char)((d0-oct) << 6) + (char)((d1-oct) << 3) + (char)(d2-oct);
	267	pos += 2; // skip two chars more than usual
	268	break;
e5fcd199	269	}
cd4882d7 SM	270	default:
cd4882d7 SM	271	// do nothing; this removes the backslash from c
e5fcd199	272	;
cd4882d7 SM	273	}
	274
	275	next_c = c;
	276	pos++;
	277	finished = ( pos >= input->length() );
	278	}
	279
	280	// ------------------------------------------------------------------------
	281
	282	regexp *
	283	regex_parser::parse (bool do_tag)
	284	{
112e9e2b	285	cur = cursor(&input, do_unescape);
e5fcd199	286	num_tags = 0; this->do_tag = do_tag;
cd4882d7	287
e5fcd199	288	regexp *result = parse_expr ();
cd4882d7 SM	289
	290	// PR15065 glom appropriate tag_ops onto the expr (subexpression 0)
	291	if (do_tag) {
	292	result = new cat_op(new tag_op(num_tags++), result);
	293	result = new cat_op(result, new tag_op(num_tags++));
	294	}
	295
	296	if (! cur.finished)
	297	{
	298	char c = cur.peek ();
	299	if (c == ')')
e5fcd199	300	parse_error (_("unbalanced ')'"), cur.pos);
cd4882d7 SM	301	else
cd4882d7 SM	302	// This should not be possible:
e5fcd199	303	parse_error ("BUG -- regex parse failed to finish for unknown reasons", cur.pos);
cd4882d7 SM	304	}
	305
	306	// PR15065 store num_tags in result
e5fcd199	307	result->num_tags = num_tags;
cd4882d7 SM	308	return result;
	309	}
	310
	311	bool
	312	regex_parser::isspecial (char c)
	313	{
	314	return ( c == '.' \|\| c == '[' \|\| c == '{' \|\| c == '(' \|\| c == ')'
	315	\|\| c == '\\' \|\| c == '*' \|\| c == '+' \|\| c == '?' \|\| c == '\|'
	316	\|\| c == '^' \|\| c == '$' );
	317	}
	318
	319	void
	320	regex_parser::expect (char expected)
	321	{
	322	char c = 0;
	323	try {
	324	c = cur.next ();
	325	} catch (const regex_error &e) {
	326	parse_error (_F("expected %c, found end of regex", expected));
	327	}
	328
	329	if (c != expected)
	330	parse_error (_F("expected %c, found %c", expected, c));
	331	}
	332
	333	void
	334	regex_parser::parse_error (const string& msg, unsigned pos)
	335	{
	336	throw regex_error(msg, pos);
	337	}
	338
	339	void
	340	regex_parser::parse_error (const string& msg)
	341	{
	342	parse_error (msg, cur.last_pos);
	343	}
	344
	345	// ------------------------------------------------------------------------
	346
	347	regexp *
	348	regex_parser::parse_expr ()
	349	{
	350	regexp *result = parse_term ();
	351
	352	char c = cur.peek ();
	353	while (c && c == '\|')
	354	{
40fd16cf	355	cur.next ();
cd4882d7 SM	356	regexp *alt = parse_term ();
	357	result = make_alt (result, alt);
	358	c = cur.peek ();
	359	}
	360
	361	return result;
	362	}
	363
	364	regexp *
	365	regex_parser::parse_term ()
	366	{
	367	regexp *result = parse_factor ();
	368
	369	char c = cur.peek ();
	370	while (c && c != '\|' && c != ')')
	371	{
	372	regexp *next = parse_factor ();
	373	result = new cat_op(result, next);
	374	c = cur.peek ();
	375	}
	376
	377	return result;
	378	}
	379
	380	regexp *
	381	regex_parser::parse_factor ()
	382	{
	383	regexp *result;
	384	regexp *old_result = NULL;
	385
	386	char c = cur.peek ();
	387	if (! c \|\| c == '\|' \|\| c == ')')
	388	{
	389	result = new null_op;
	390	return result;
	391	}
	392	else if (c == '*' \|\| c == '+' \|\| c == '?' \|\| c == '{')
	393	{
	394	parse_error(_F("unexpected '%c'", c));
	395	}
	396
	397	if (isspecial (c) && c != '\\')
	398	cur.next (); // c is guaranteed to be swallowed
	399
	400	if (c == '.')
	401	{
	402	result = make_dot ();
	403	}
	404	else if (c == '[')
	405	{
	406	result = parse_char_range ();
	407	expect (']');
	408	}
	409	else if (c == '(')
	410	{
	411	result = parse_expr ();
	412
	413	// PR15065 glom appropriate tag_ops onto the expr
	414	if (do_tag) {
	415	result = new cat_op(new tag_op(num_tags++), result);
	416	result = new cat_op(result, new tag_op(num_tags++));
d6833508 SM	417	} else {
	418	// XXX: workaround for certain error checking test cases which
	419	// would otherwise produce divergent behaviour
	420	// (e.g. "^" vs "(^)").
	421	result = new cat_op(result, new null_op);
cd4882d7 SM	422	}
	423
	424	expect (')');
	425	}
	426	else if (c == '^' \|\| c == '$')
	427	{
	428	result = new anchor_op(c);
	429	}
	430	else // escaped or ordinary character -- not yet swallowed
	431	{
	432	string accumulate;
	433	char d = 0;
	434
	435	while (c && ( ! isspecial (c) \|\| c == '\\' ))
	436	{
	437	if (c == '\\')
	438	{
	439	cur.next ();
	440	c = cur.peek ();
	441	}
	442
	443	cur.next ();
	444	d = cur.peek ();
	445
	446	/* if we end in a closure, it should only govern the last character */
	447	if (d == '*' \|\| d == '+' \|\| d == '?' \|\| d == '{')
	448	{
	449	/* save the last character */
	450	d = c; break;
	451	}
	452
	453	accumulate.push_back (c);
	454	c = d; d = 0;
	455	}
	456
	457	result = str_to_re (accumulate);
	458
	459	/* separately deal with the last character before a closure */
	460	if (d != 0) {
	461	old_result = result; /* will add it back outside closure at the end */
e5fcd199	462	result = str_to_re (string(1,d));
cd4882d7 SM	463	}
	464	}
	465
	466	/* parse closures or other postfix operators */
	467	c = cur.peek ();
	468	while (c == '*' \|\| c == '+' \|\| c == '?' \|\| c == '{')
	469	{
	470	cur.next ();
	471
	472	/* closure-type operators applied to $^ are definitely not kosher */
	473	if (result->type_of() == "anchor_op")
	474	{
	475	parse_error(_F("postfix closure '%c' applied to anchoring operator", c));
	476	}
	477
	478	if (c == '*')
	479	{
	480	result = make_alt (new close_op(result), new null_op);
	481	}
	482	else if (c == '+')
	483	{
	484	result = new close_op(result);
	485	}
	486	else if (c == '?')
	487	{
	488	result = make_alt (result, new null_op);
	489	}
	490	else if (c == '{')
	491	{
	492	int minsize = parse_number ();
	493	int maxsize = -1;
	494
	495	c = cur.next ();
	496	if (c == ',')
	497	{
	498	c = cur.peek ();
	499	if (c == '}')
	500	{
	501	cur.next ();
	502	maxsize = -1;
	503	}
	504	else if (isdigit (c))
	505	{
	506	maxsize = parse_number ();
	507	expect ('}');
	508	}
	509	else
	510	parse_error(_("expected '}' or number"), cur.pos);
	511	}
	512	else if (c == '}')
	513	{
	514	maxsize = minsize;
	515	}
	516	else
	517	parse_error(_("expected ',' or '}'"));
	518
	519	/* optimize {0,0}, {0,} and {1,} */
e5fcd199	520	if (!do_tag && minsize == 0 && maxsize == 0)
cd4882d7	521	{
e5fcd199 SM	522	// XXX: this optimization is only used when
e5fcd199 SM	523	// subexpression-extraction is disabled
cd4882d7 SM	524	delete result;
	525	result = new null_op;
	526	}
	527	else if (minsize == 0 && maxsize == -1)
	528	{
	529	result = make_alt (new close_op(result), new null_op);
	530	}
	531	else if (minsize == 1 && maxsize == -1)
	532	{
	533	result = new close_op(result);
	534	}
	535	else
	536	{
	537	result = new closev_op(result, minsize, maxsize);
	538	}
	539	}
	540
	541	c = cur.peek ();
	542	}
	543
	544	if (old_result)
	545	result = new cat_op(old_result, result);
	546
	547	return result;
	548	}
	549
cd4882d7 SM	550	regexp *
	551	regex_parser::parse_char_range ()
	552	{
40fd16cf	553	range *ran = NULL;
cd4882d7	554
40fd16cf	555	// check for inversion
cd4882d7 SM	556	bool inv = false;
	557	char c = cur.peek ();
	558	if (c == '^')
	559	{
	560	inv = true;
	561	cur.next ();
cd4882d7 SM	562	}
cd4882d7 SM	563
cd4882d7 SM	564	for (;;)
cd4882d7 SM	565	{
40fd16cf	566	// break on string end whenever we encounter it
e5fcd199	567	if (cur.finished) parse_error(_("unclosed character class")); // TODOXXX doublecheck that this is triggered correctly
cd4882d7	568
e5fcd199	569	range *add = stapregex_getrange (cur);
28a27de2 SM	570	range *new_ran = ( ran != NULL ? range_union(ran, add) : add );
	571	delete ran; if (new_ran != add) delete add;
	572	ran = new_ran;
cd4882d7	573
40fd16cf	574	// break on ']' (except at the start of the class)
248f3856	575	c = cur.peek ();
40fd16cf	576	if (c == ']')
248f3856	577	break;
cd4882d7 SM	578	}
cd4882d7 SM	579
e5fcd199 SM	580	if (inv)
e5fcd199 SM	581	{
28a27de2 SM	582	range *new_ran = range_invert(ran);
	583	delete ran;
	584	ran = new_ran;
e5fcd199 SM	585	}
e5fcd199 SM	586
40fd16cf SM	587	if (ran == NULL)
	588	return new null_op;
	589
	590	return new match_op(ran);
cd4882d7 SM	591	}
	592
	593	unsigned
	594	regex_parser::parse_number ()
	595	{
	596	string digits;
	597
	598	char c = cur.peek ();
	599	while (c && isdigit (c))
	600	{
	601	cur.next ();
	602	digits.push_back (c);
	603	c = cur.peek ();
	604	}
	605
	606	if (digits == "") parse_error(_("expected number"), cur.pos);
	607
	608	char *endptr = NULL;
	609	int val = strtol (digits.c_str (), &endptr, 10);
	610
	611	if (*endptr != '\0' \|\| errno == ERANGE) // paranoid error checking
	612	parse_error(_F("could not parse number %s", digits.c_str()), cur.pos);
	613	#define MAX_DFA_REPETITIONS 12345
	614	if (val >= MAX_DFA_REPETITIONS) // XXX: is there a more sensible max size?
	615	parse_error(_F("%s is too large", digits.c_str()), cur.pos);
	616
	617	return atoi (digits.c_str ());
	618	}
	619
cd4882d7 SM	620	// ------------------------------------------------------------------------
cd4882d7 SM	621
e5fcd199	622	std::map<std::string, range *> named_char_classes;
40fd16cf SM	623
	624	range *
	625	named_char_class (const string& name)
cd4882d7	626	{
40fd16cf SM	627	// static initialization of table
	628	if (named_char_classes.empty())
	629	{
	630	// original source for these is http://www.regular-expressions.info/posixbrackets.html
	631	// also checked against (intended to match) the c stdlib isFOO() chr class functions
	632	named_char_classes["alpha"] = new range("A-Za-z");
	633	named_char_classes["alnum"] = new range("A-Za-z0-9");
	634	named_char_classes["blank"] = new range(" \t");
	635	named_char_classes["cntrl"] = new range("\x01-\x1F\x7F"); // XXX: include \x00 in range? -- probably not!
	636	named_char_classes["d"] = named_char_classes["digit"] = new range("0-9");
	637	named_char_classes["xdigit"] = new range("0-9a-fA-F");
	638	named_char_classes["graph"] = new range("\x21-\x7E");
	639	named_char_classes["l"] = named_char_classes["lower"] = new range("a-z");
	640	named_char_classes["print"] = new range("\x20-\x7E");
d6833508	641	named_char_classes["punct"] = new range("!\"#$%&'()*+,./:;<=>?@[\\]^_`{\|}~-");
40fd16cf SM	642	named_char_classes["s"] = named_char_classes["space"] = new range(" \t\r\n\v\f");
	643	named_char_classes["u"] = named_char_classes["upper"] = new range("A-Z");
	644	}
cd4882d7	645
40fd16cf	646	if (named_char_classes.find(name) == named_char_classes.end())
cd4882d7	647	{
e5fcd199	648	throw regex_error (_F("unknown character class '%s'", name.c_str())); // XXX: position unknown
40fd16cf	649	}
cd4882d7	650
40fd16cf SM	651	return new range(*named_char_classes[name]);
	652	}
	653
	654	range *
	655	stapregex_getrange (cursor& cur)
	656	{
	657	char c = cur.peek ();
40fd16cf SM	658
	659	if (c == '\\')
	660	{
	661	// Grab escaped char regardless of what it is.
	662	cur.next (); c = cur.peek (); cur.next ();
	663	}
	664	else if (c == '[')
	665	{
	666	// Check for '[:' digraph.
	667	char old_c = c; cur.next (); c = cur.peek ();
	668
	669	if (c == ':')
cd4882d7	670	{
248f3856	671	cur.next (); c = cur.peek (); // skip ':'
40fd16cf	672	string charclass;
cd4882d7	673
40fd16cf SM	674	for (;;)
	675	{
	676	if (cur.finished)
	677	throw regex_error (_F("unclosed character class '[:%s'", charclass.c_str()), cur.pos);
cd4882d7	678
e5fcd199	679	if (cur.has(2) && c == ':' && (*cur.input)[cur.pos] == ']')
40fd16cf SM	680	{
	681	cur.next (); cur.next (); // skip ':]'
	682	return named_char_class(charclass);
	683	}
	684
	685	charclass.push_back(c); cur.next(); c = cur.peek();
	686	}
cd4882d7	687	}
40fd16cf SM	688	else
	689	{
	690	// Backtrack; fall through to processing c.
	691	c = old_c;
	692	}
	693	}
	694	else
	695	cur.next ();
	696
	697	char lb = c, ub;
	698
d6833508	699	if (!cur.has(2) \|\| cur.peek () != '-' \|\| (*cur.input)[cur.pos] == ']')
40fd16cf SM	700	{
40fd16cf SM	701	ub = lb;
cd4882d7	702	}
40fd16cf SM	703	else
	704	{
	705	cur.next (); // skip '-'
	706	ub = cur.peek ();
	707
	708	if (ub < lb)
	709	throw regex_error (_F("Inverted character range %c-%c", lb, ub), cur.pos);
	710
	711	cur.next ();
	712	}
	713
	714	return new range(lb, ub);
cd4882d7 SM	715	}
cd4882d7 SM	716
40fd16cf SM	717	};
40fd16cf SM	718
cd4882d7	719	/* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */