stapregex-parse.cxx

   1 // -*- C++ -*-
   2 // Copyright (C) 2012-2018 Red Hat Inc.
   3 //
   4 // This file is part of systemtap, and is free software.  You can
   5 // redistribute it and/or modify it under the terms of the GNU General
   6 // Public License (GPL); either version 2, or (at your option) any
   7 // later version.
   8 //
   9 // ---
  10 //
  11 // This file incorporates code from the re2c project; please see
  12 // the file README.stapregex for details.
  13
  14 #include "util.h"
  15
  16 #include "stapregex-tree.h"
  17 #include "stapregex-parse.h"
  18
  19 #include <cstdlib>
  20 #include <cstring>
  21 #include <string>
  22
  23 using namespace std;
  24
  25 namespace stapregex {
  26
  27 void print_escaped(std::ostream& o, rchar c)
  28 {
  29   o << escaped_character((unsigned)c);
  30 }
  31
  32 // ------------------------------------------------------------------------
  33
  34 cursor::cursor() : input(NULL), do_unescape(false), pos(~0),
  35                    last_pos(~0), finished(false), next_c(0), last_c(0) {}
  36
  37 cursor::cursor(const std::string *input, bool do_unescape)
  38   : input(input), do_unescape(do_unescape), pos(0), last_pos(0), finished(false)
  39 {
  40   next_c = 0; last_c = 0;
  41   finished = ( pos >= input->length() );
  42 }
  43
  44 rchar
  45 cursor::next ()
  46 {
  47   if (! next_c && finished)
  48     throw regex_error(_("unexpected end of regex"), pos);
  49   if (! next_c)
  50     get_unescaped();
  51
  52   last_c = next_c;
  53   // advance by zeroing next_c
  54   next_c = 0;
  55
  56   return last_c;
  57 }
  58
  59 rchar
  60 cursor::peek ()
  61 {
  62   if (! next_c && ! finished)
  63       get_unescaped();
  64
  65   // don't advance by zeroing next_c
  66   last_c = next_c;
  67
  68   return next_c;
  69 }
  70
  71 bool
  72 cursor::has (unsigned n)
  73 {
  74   return ( pos <= input->length() - n );
  75 }
  76
  77 /* Systemtap doesn't unescape string literals for us, presuming to
  78    pass the backslashes intact to a C compiler; hence we need to do
  79    our own unescaping here.
  80
  81    This functionality needs to be handled as part of cursor, in order
  82    to correctly retain the original positions in the string when doing
  83    error reporting. */
  84 void
  85 cursor::get_unescaped ()
  86 {
  87   static const char *hex = "0123456789abcdef";
  88   static const char *oct = "01234567";
  89
  90   last_pos = pos;
  91   char c = (*input)[pos];
  92
  93   if (c != '\\' || !do_unescape)
  94     {
  95       next_c = c;
  96       pos++;
  97       finished = ( pos >= input->length() );
  98       return;
  99     }
 100
 101   pos++;
 102
 103   /* Check for improper string end: */
 104   if (pos >= input->length())
 105     throw regex_error(_("unexpected end of regex"), pos);
 106
 107   /* The logic is based on re2c's Scanner::unescape() method;
 108      the set of accepted escape codes should correspond to
 109      lexer::scan() in parse.cxx. */
 110   c = (*input)[pos];
 111   switch (c)
 112     {
 113     case 'a': c = '\a'; break;
 114     case 'b': c = '\b'; break;
 115     case 't': c = '\t'; break;
 116     case 'n': c = '\n'; break;
 117     case 'v': c = '\v'; break;
 118     case 'f': c = '\f'; break;
 119     case 'r': c = '\r'; break;
 120
 121     case 'x':
 122       {
 123       if (pos >= input->length() - 2)
 124         throw regex_error(_("two hex digits required in escape sequence"), pos);
 125
 126       const char *d1 = strchr(hex, tolower((*input)[pos+1]));
 127       const char *d2 = strchr(hex, tolower((*input)[pos+2]));
 128
 129       if (!d1 || !d2)
 130         throw regex_error(_("two hex digits required in escape sequence"), pos + (d1 ? 1 : 2));
 131
 132       c = (char)((d1-hex) << 4) + (char)(d2-hex);
 133       pos += 2; // skip two chars more than usual
 134       break;
 135       }
 136     case '4' ... '7':
 137       // XXX: perhaps perform error recovery (slurp 3 octal chars)?
 138       throw regex_error(_("octal escape sequence out of range"), pos);
 139
 140     case '0' ... '3':
 141       {
 142       if (pos >= input->length() - 2)
 143         throw regex_error(_("three octal digits required in escape sequence"), pos);
 144
 145       const char *d0 = strchr(oct, (*input)[pos]);
 146       const char *d1 = strchr(oct, (*input)[pos+1]);
 147       const char *d2 = strchr(oct, (*input)[pos+2]);
 148
 149       if (!d0 || !d1 || !d2)
 150         throw regex_error(_("three octal digits required in escape sequence"), pos + (d1 ? 1 : 2));
 151
 152       c = (char)((d0-oct) << 6) + (char)((d1-oct) << 3) + (char)(d2-oct);
 153       pos += 2; // skip two chars more than usual
 154       break;
 155       }
 156     default:
 157       // do nothing; this removes the backslash from c
 158       ;
 159     }
 160
 161   next_c = c;
 162   pos++;
 163   finished = ( pos >= input->length() );
 164 }
 165
 166 // ------------------------------------------------------------------------
 167
 168 regexp *
 169 regex_parser::parse (bool do_tag)
 170 {
 171   cur = cursor(&input, do_unescape);
 172   this->do_tag = do_tag;
 173   num_subexpressions = do_tag ? 1 : 0; // group 0 is guaranteed when using tag
 174
 175   regexp *result = parse_expr ();
 176
 177   // PR15065 glom appropriate tag_ops onto the expr (subexpression 0)
 178   if (do_tag) {
 179     result = new cat_op(new tag_op(TAG_START(0)), result);
 180     result = new cat_op(result, new tag_op(TAG_END(0)));
 181   }
 182
 183   if (! cur.finished)
 184     {
 185       rchar c = cur.peek ();
 186       if (c == ')')
 187         parse_error (_("unbalanced ')'"), cur.pos);
 188       else
 189         // This should not be possible:
 190         parse_error ("BUG -- regex parse failed to finish for unknown reasons", cur.pos);
 191     }
 192
 193   // PR15065 store num_tags in result
 194   result->num_tags = 2 * num_subexpressions;
 195   return result;
 196 }
 197
 198 bool
 199 regex_parser::isspecial (rchar c)
 200 {
 201   return ( c == '.' || c == '[' || c == '{' || c == '(' || c == ')'
 202            || c == '\\' || c == '*' || c == '+' || c == '?' || c == '|'
 203            || c == '^' || c == '$' );
 204 }
 205
 206 void
 207 regex_parser::expect (rchar expected)
 208 {
 209   rchar c = 0;
 210   try {
 211     c = cur.next ();
 212   } catch (const regex_error &e) {
 213     parse_error (_F("expected %c, found end of regex", expected));
 214   }
 215
 216   if (c != expected)
 217     parse_error (_F("expected %c, found %c", expected, c));
 218 }
 219
 220 void
 221 regex_parser::parse_error (const string& msg, unsigned pos)
 222 {
 223   throw regex_error(msg, pos);
 224 }
 225
 226 void
 227 regex_parser::parse_error (const string& msg)
 228 {
 229   parse_error (msg, cur.last_pos);
 230 }
 231
 232 // ------------------------------------------------------------------------
 233
 234 regexp *
 235 regex_parser::parse_expr ()
 236 {
 237   regexp *result = parse_term ();
 238
 239   rchar c = cur.peek ();
 240   while (c && c == '|')
 241     {
 242       cur.next ();
 243       regexp *alt = parse_term ();
 244       result = make_alt (result, alt);
 245       c = cur.peek ();
 246     }
 247
 248   return result;
 249 }
 250
 251 regexp *
 252 regex_parser::parse_term ()
 253 {
 254   regexp *result = parse_factor ();
 255
 256   rchar c = cur.peek ();
 257   while (c && c != '|' && c != ')')
 258     {
 259       regexp *next = parse_factor ();
 260       result = new cat_op(result, next);
 261       c = cur.peek ();
 262     }
 263
 264   return result;
 265 }
 266
 267 regexp *
 268 regex_parser::parse_factor ()
 269 {
 270   regexp *result;
 271   regexp *old_result = NULL;
 272
 273   rchar c = cur.peek ();
 274   if (! c || c == '|' || c == ')')
 275     {
 276       result = new null_op;
 277       return result;
 278     }
 279   else if (c == '*' || c == '+' || c == '?' || c == '{')
 280     {
 281       parse_error(_F("unexpected '%c'", c));
 282     }
 283
 284   if (isspecial (c) && c != '\\')
 285     cur.next (); // c is guaranteed to be swallowed
 286
 287   if (c == '.')
 288     {
 289       result = make_dot ();
 290     }
 291   else if (c == '[')
 292     {
 293       result = parse_char_range ();
 294       expect (']');
 295     }
 296   else if (c == '(')
 297     {
 298       // Allow non-capturing group (?:...)
 299       bool do_tag_now = do_tag;
 300       rchar c2 = cur.peek();
 301       if (c2 == '?')
 302         {
 303           cur.next();
 304           rchar c3 = cur.peek();
 305           if (c3 != ':')
 306             parse_error(_F("unexpected '(?%c'", c3));
 307           do_tag_now = false;
 308         }
 309
 310       // To number tags correctly, reserve a subexpression number here:
 311       unsigned curr_subexpression = 0;
 312       if (do_tag_now)
 313         curr_subexpression = num_subexpressions++;
 314
 315       result = parse_expr ();
 316
 317       // PR15065 glom appropriate tag_ops onto the expr
 318       if (do_tag_now) {
 319         result = new cat_op(new tag_op(TAG_START(curr_subexpression)), result);
 320         result = new cat_op(result, new tag_op(TAG_END(curr_subexpression)));
 321       } else {
 322         // XXX: workaround for certain error checking test cases which
 323         // would otherwise produce divergent behaviour without tag_ops
 324         // (e.g. "^*" vs "(^)*").
 325         result = new cat_op(result, new null_op);
 326       }
 327
 328       expect (')');
 329     }
 330   else if (c == '^' || c == '$')
 331     {
 332       result = new anchor_op(c);
 333     }
 334   else // escaped or ordinary character -- not yet swallowed
 335     {
 336       string accumulate;
 337       rchar d = 0;
 338
 339       while (c && ( ! isspecial (c) || c == '\\' ))
 340         {
 341           if (c == '\\')
 342             {
 343               cur.next ();
 344               c = cur.peek ();
 345             }
 346
 347           cur.next ();
 348           d = cur.peek ();
 349
 350           /* if we end in a closure, it should only govern the last character */
 351           if (d == '*' || d == '+' || d == '?' || d == '{')
 352             {
 353               /* save the last character */
 354               d = c; break;
 355             }
 356
 357           accumulate.push_back (c);
 358           c = d; d = 0;
 359         }
 360
 361       result = str_to_re (accumulate);
 362
 363       /* separately deal with the last character before a closure */
 364       if (d != 0) {
 365         old_result = result; /* will add it back outside closure at the end */
 366         result = str_to_re (string(1,d));
 367       }
 368     }
 369
 370   /* parse closures or other postfix operators */
 371   c = cur.peek ();
 372   while (c == '*' || c == '+' || c == '?' || c == '{')
 373     {
 374       cur.next ();
 375
 376       /* closure-type operators applied to $^ are definitely not kosher */
 377       if (result->type_of() == "anchor_op")
 378         {
 379           parse_error(_F("postfix closure '%c' applied to anchoring operator", c));
 380         }
 381
 382       if (c == '*')
 383         {
 384           result = make_alt (new close_op(result), new null_op);
 385         }
 386       else if (c == '+')
 387         {
 388           result = new close_op(result);
 389         }
 390       else if (c == '?')
 391         {
 392           result = make_alt (result, new null_op);
 393         }
 394       else if (c == '{')
 395         {
 396           int minsize = parse_number ();
 397           int maxsize = -1;
 398
 399           c = cur.next ();
 400           if (c == ',')
 401             {
 402               c = cur.peek ();
 403               if (c == '}')
 404                 {
 405                   cur.next ();
 406                   maxsize = -1;
 407                 }
 408               else if (isdigit (c))
 409                 {
 410                   maxsize = parse_number ();
 411                   expect ('}');
 412                 }
 413               else
 414                 parse_error(_("expected '}' or number"), cur.pos);
 415             }
 416           else if (c == '}')
 417             {
 418               maxsize = minsize;
 419             }
 420           else
 421             parse_error(_("expected ',' or '}'"));
 422
 423           /* optimize {0,0}, {0,} and {1,} */
 424           if (!do_tag && minsize == 0 && maxsize == 0)
 425             {
 426               // XXX: this optimization is only used when
 427               // subexpression-extraction is disabled
 428               delete result;
 429               result = new null_op;
 430             }
 431           else if (minsize == 0 && maxsize == -1)
 432             {
 433               result = make_alt (new close_op(result), new null_op);
 434             }
 435           else if (minsize == 1 && maxsize == -1)
 436             {
 437               result = new close_op(result);
 438             }
 439           else
 440             {
 441               result = new closev_op(result, minsize, maxsize);
 442             }
 443         }
 444
 445       c = cur.peek ();
 446     }
 447
 448   if (old_result)
 449     result = new cat_op(old_result, result);
 450
 451   return result;
 452 }
 453
 454 regexp *
 455 regex_parser::parse_char_range ()
 456 {
 457   range *ran = NULL;
 458
 459   // check for inversion
 460   bool inv = false;
 461   rchar c = cur.peek ();
 462   if (c == '^')
 463     {
 464       inv = true;
 465       cur.next ();
 466     }
 467
 468   for (;;)
 469     {
 470       // break on string end whenever we encounter it
 471       if (cur.finished) parse_error(_("unclosed character class"));
 472
 473       range *add = stapregex_getrange (cur);
 474       range *new_ran = ( ran != NULL ? range_union(ran, add) : add );
 475       delete ran; if (new_ran != add) delete add;
 476       ran = new_ran;
 477
 478       // break on ']' (except at the start of the class)
 479       c = cur.peek ();
 480       if (c == ']')
 481         break;
 482     }
 483
 484   if (inv)
 485     {
 486       range *new_ran = range_invert(ran);
 487       delete ran;
 488       ran = new_ran;
 489     }
 490
 491   if (ran == NULL)
 492     return new null_op;
 493
 494   return new match_op(ran);
 495 }
 496
 497 unsigned
 498 regex_parser::parse_number ()
 499 {
 500   string digits;
 501
 502   rchar c = cur.peek ();
 503   while (c && isdigit (c))
 504     {
 505       cur.next ();
 506       digits.push_back (c);
 507       c = cur.peek ();
 508     }
 509
 510   if (digits == "") parse_error(_("expected number"), cur.pos);
 511
 512   char *endptr = NULL;
 513   int val = strtol (digits.c_str (), &endptr, 10);
 514
 515   if (*endptr != '\0' || errno == ERANGE) // paranoid error checking
 516     parse_error(_F("could not parse number %s", digits.c_str()), cur.pos);
 517 #define MAX_DFA_REPETITIONS 12345
 518   if (val >= MAX_DFA_REPETITIONS) // XXX: is there a more sensible max size?
 519     parse_error(_F("%s is too large", digits.c_str()), cur.pos);
 520
 521   return atoi (digits.c_str ());
 522 }
 523
 524 // ------------------------------------------------------------------------
 525
 526 std::map<std::string, range *> named_char_classes;
 527
 528 range *
 529 named_char_class (const string& name)
 530 {
 531   // static initialization of table
 532   if (named_char_classes.empty())
 533     {
 534       // original source for these is http://www.regular-expressions.info/posixbrackets.html
 535       // also checked against (intended to match) the c stdlib isFOO() chr class functions
 536       named_char_classes["alpha"] = new range("A-Za-z");
 537       named_char_classes["alnum"] = new range("A-Za-z0-9");
 538       named_char_classes["blank"] = new range(" \t");
 539       named_char_classes["cntrl"] = new range("\x01-\x1F\x7F"); // XXX: include \x00 in range? -- probably not!
 540       named_char_classes["d"] = named_char_classes["digit"] = new range("0-9");
 541       named_char_classes["xdigit"] = new range("0-9a-fA-F");
 542       named_char_classes["graph"] = new range("\x21-\x7E");
 543       named_char_classes["l"] = named_char_classes["lower"] = new range("a-z");
 544       named_char_classes["print"] = new range("\x20-\x7E");
 545       named_char_classes["punct"] = new range("!\"#$%&'()*+,./:;<=>?@[\\]^_`{|}~-");
 546       named_char_classes["s"] = named_char_classes["space"] = new range(" \t\r\n\v\f");
 547       named_char_classes["u"] = named_char_classes["upper"] = new range("A-Z");
 548     }
 549
 550   if (named_char_classes.find(name) == named_char_classes.end())
 551     {
 552       throw regex_error (_F("unknown character class '%s'", name.c_str())); // XXX: position unknown
 553     }
 554
 555   return new range(*named_char_classes[name]);
 556 }
 557
 558 range *
 559 stapregex_getrange (cursor& cur)
 560 {
 561   rchar c = cur.peek ();
 562
 563   if (c == '\\')
 564     {
 565       // Grab escaped char regardless of what it is.
 566       cur.next (); c = cur.peek (); cur.next ();
 567     }
 568   else if (c == '[')
 569     {
 570       // Check for '[:' digraph.
 571       rchar old_c = c; cur.next (); c = cur.peek ();
 572
 573       if (c == ':')
 574         {
 575           cur.next (); c = cur.peek (); // skip ':'
 576           string charclass;
 577
 578           for (;;)
 579             {
 580               if (cur.finished)
 581                 throw regex_error (_F("unclosed character class '[:%s'", charclass.c_str()), cur.pos);
 582
 583               if (cur.has(2) && c == ':' && (*cur.input)[cur.pos] == ']')
 584                 {
 585                   cur.next (); cur.next (); // skip ':]'
 586                   return named_char_class(charclass);
 587                 }
 588
 589               charclass.push_back(c); cur.next(); c = cur.peek();
 590             }
 591         }
 592       else
 593         {
 594           // Backtrack; fall through to processing c.
 595           c = old_c;
 596         }
 597     }
 598   else
 599     cur.next ();
 600
 601   rchar lb = c, ub;
 602
 603   if (!cur.has(2) || cur.peek () != '-' || (*cur.input)[cur.pos] == ']')
 604     {
 605       ub = lb;
 606     }
 607   else
 608     {
 609       cur.next (); // skip '-'
 610       ub = cur.peek ();
 611
 612       if (ub < lb)
 613         throw regex_error (_F("Inverted character range %c-%c", lb, ub), cur.pos);
 614
 615       cur.next ();
 616     }
 617
 618   return new range(lb, ub);
 619 }
 620
 621 };
 622
 623 /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */