stapregex-parse.cxx

   1 // -*- C++ -*-
   2 // Copyright (C) 2012-2018 Red Hat Inc.
   3 //
   4 // This file is part of systemtap, and is free software.  You can
   5 // redistribute it and/or modify it under the terms of the GNU General
   6 // Public License (GPL); either version 2, or (at your option) any
   7 // later version.
   8 //
   9 // ---
  10 //
  11 // This file incorporates code from the re2c project; please see
  12 // the file README.stapregex for details.
  13
  14 #include "util.h"
  15
  16 #include "stapregex-tree.h"
  17 #include "stapregex-parse.h"
  18
  19 #include <cstdlib>
  20 #include <cstring>
  21 #include <string>
  22
  23 using namespace std;
  24
  25 namespace stapregex {
  26
  27 void print_escaped(std::ostream& o, rchar c)
  28 {
  29   o << escaped_character((unsigned)c);
  30 }
  31
  32 // ------------------------------------------------------------------------
  33
  34 cursor::cursor() : input(NULL), do_unescape(false), pos(~0),
  35                    last_pos(~0), finished(false), next_c(0), last_c(0) {}
  36
  37 cursor::cursor(const std::string *input, bool do_unescape)
  38   : input(input), do_unescape(do_unescape), pos(0), last_pos(0), finished(false)
  39 {
  40   next_c = 0; last_c = 0;
  41   finished = ( pos >= input->length() );
  42 }
  43
  44 rchar
  45 cursor::next ()
  46 {
  47   if (! next_c && finished)
  48     throw regex_error(_("unexpected end of regex"), pos);
  49   if (! next_c)
  50     get_unescaped();
  51
  52   last_c = next_c;
  53   // advance by zeroing next_c
  54   next_c = 0;
  55
  56   return last_c;
  57 }
  58
  59 rchar
  60 cursor::peek ()
  61 {
  62   if (! next_c && ! finished)
  63       get_unescaped();
  64
  65   // don't advance by zeroing next_c
  66   last_c = next_c;
  67
  68   return next_c;
  69 }
  70
  71 bool
  72 cursor::has (unsigned n)
  73 {
  74   return ( pos <= input->length() - n );
  75 }
  76
  77 /* Systemtap doesn't unescape string literals for us, presuming to
  78    pass the backslashes intact to a C compiler; hence we need to do
  79    our own unescaping here.
  80
  81    This functionality needs to be handled as part of cursor, in order
  82    to correctly retain the original positions in the string when doing
  83    error reporting. */
  84 void
  85 cursor::get_unescaped ()
  86 {
  87   static const char *hex = "0123456789abcdef";
  88   static const char *oct = "01234567";
  89
  90   last_pos = pos;
  91   char c = (*input)[pos];
  92
  93   if (c != '\\' || !do_unescape)
  94     {
  95       next_c = c;
  96       pos++;
  97       finished = ( pos >= input->length() );
  98       return;
  99     }
 100
 101   pos++;
 102
 103   /* Check for improper string end: */
 104   if (pos >= input->length())
 105     throw regex_error(_("unexpected end of regex"), pos);
 106
 107   /* The logic is based on re2c's Scanner::unescape() method;
 108      the set of accepted escape codes should correspond to
 109      lexer::scan() in parse.cxx. */
 110   c = (*input)[pos];
 111   switch (c)
 112     {
 113     case 'a': c = '\a'; break;
 114     case 'b': c = '\b'; break;
 115     case 't': c = '\t'; break;
 116     case 'n': c = '\n'; break;
 117     case 'v': c = '\v'; break;
 118     case 'f': c = '\f'; break;
 119     case 'r': c = '\r'; break;
 120
 121     case 'x':
 122       {
 123       if (pos >= input->length() - 2)
 124         throw regex_error(_("two hex digits required in escape sequence"), pos);
 125
 126       const char *d1 = strchr(hex, tolower((*input)[pos+1]));
 127       const char *d2 = strchr(hex, tolower((*input)[pos+2]));
 128
 129       if (!d1 || !d2)
 130         throw regex_error(_("two hex digits required in escape sequence"), pos + (d1 ? 1 : 2));
 131
 132       c = (char)((d1-hex) << 4) + (char)(d2-hex);
 133       pos += 2; // skip two chars more than usual
 134       break;
 135       }
 136     case '4' ... '7':
 137       // XXX: perhaps perform error recovery (slurp 3 octal chars)?
 138       throw regex_error(_("octal escape sequence out of range"), pos);
 139
 140     case '0' ... '3':
 141       {
 142       if (pos >= input->length() - 2)
 143         throw regex_error(_("three octal digits required in escape sequence"), pos);
 144
 145       const char *d0 = strchr(oct, (*input)[pos]);
 146       const char *d1 = strchr(oct, (*input)[pos+1]);
 147       const char *d2 = strchr(oct, (*input)[pos+2]);
 148
 149       if (!d0 || !d1 || !d2)
 150         throw regex_error(_("three octal digits required in escape sequence"), pos + (d1 ? 1 : 2));
 151
 152       c = (char)((d0-oct) << 6) + (char)((d1-oct) << 3) + (char)(d2-oct);
 153       pos += 2; // skip two chars more than usual
 154       break;
 155       }
 156     default:
 157       // do nothing; this removes the backslash from c
 158       ;
 159     }
 160
 161   next_c = c;
 162   pos++;
 163   finished = ( pos >= input->length() );
 164 }
 165
 166 // ------------------------------------------------------------------------
 167
 168 regexp *
 169 regex_parser::parse (bool do_tag)
 170 {
 171   cur = cursor(&input, do_unescape);
 172   this->do_tag = do_tag;
 173   num_subexpressions = do_tag ? 1 : 0; // group 0 is guaranteed when using tag
 174
 175   regexp *result = parse_expr ();
 176
 177   // PR15065 glom appropriate tag_ops onto the expr (subexpression 0)
 178   if (do_tag) {
 179     result = new cat_op(new tag_op(TAG_START(0)), result);
 180     result = new cat_op(result, new tag_op(TAG_END(0)));
 181   }
 182
 183   if (! cur.finished)
 184     {
 185       rchar c = cur.peek ();
 186       if (c == ')')
 187         parse_error (_("unbalanced ')'"), cur.pos);
 188       else
 189         // This should not be possible:
 190         parse_error ("BUG -- regex parse failed to finish for unknown reasons", cur.pos);
 191     }
 192
 193   // PR15065 store num_tags in result
 194   result->num_tags = 2 * num_subexpressions;
 195   return result;
 196 }
 197
 198 bool
 199 regex_parser::isspecial (rchar c)
 200 {
 201   return ( c == '.' || c == '[' || c == '{' || c == '(' || c == ')'
 202            || c == '\\' || c == '*' || c == '+' || c == '?' || c == '|'
 203            || c == '^' || c == '$' );
 204 }
 205
 206 void
 207 regex_parser::expect (rchar expected)
 208 {
 209   rchar c = 0;
 210   try {
 211     c = cur.next ();
 212   } catch (const regex_error &e) {
 213     parse_error (_F("expected %c, found end of regex", expected));
 214   }
 215
 216   if (c != expected)
 217     parse_error (_F("expected %c, found %c", expected, c));
 218 }
 219
 220 void
 221 regex_parser::parse_error (const string& msg, unsigned pos)
 222 {
 223   throw regex_error(msg, pos);
 224 }
 225
 226 void
 227 regex_parser::parse_error (const string& msg)
 228 {
 229   parse_error (msg, cur.last_pos);
 230 }
 231
 232 // ------------------------------------------------------------------------
 233
 234 regexp *
 235 regex_parser::parse_expr ()
 236 {
 237   regexp *result = parse_term ();
 238
 239   rchar c = cur.peek ();
 240   while (c && c == '|')
 241     {
 242       cur.next ();
 243       regexp *alt = parse_term ();
 244       result = make_alt (result, alt);
 245       c = cur.peek ();
 246     }
 247
 248   return result;
 249 }
 250
 251 regexp *
 252 regex_parser::parse_term ()
 253 {
 254   regexp *result = parse_factor ();
 255
 256   rchar c = cur.peek ();
 257   while (c && c != '|' && c != ')')
 258     {
 259       regexp *next = parse_factor ();
 260       result = new cat_op(result, next);
 261       c = cur.peek ();
 262     }
 263
 264   return result;
 265 }
 266
 267 regexp *
 268 regex_parser::parse_factor ()
 269 {
 270   regexp *result;
 271   regexp *old_result = NULL;
 272
 273   rchar c = cur.peek ();
 274   if (! c || c == '|' || c == ')')
 275     {
 276       result = new null_op;
 277       return result;
 278     }
 279   else if (c == '*' || c == '+' || c == '?' || c == '{')
 280     {
 281       parse_error(_F("unexpected '%c'", c));
 282     }
 283
 284   if (isspecial (c) && c != '\\')
 285     cur.next (); // c is guaranteed to be swallowed
 286
 287   if (c == '.')
 288     {
 289       result = make_dot ();
 290     }
 291   else if (c == '[')
 292     {
 293       result = parse_char_range ();
 294       expect (']');
 295     }
 296   else if (c == '(')
 297     {
 298       // To number tags correctly, reserve a subexpression number here:
 299       unsigned curr_subexpression = 0;
 300       if (do_tag)
 301         curr_subexpression = num_subexpressions++;
 302
 303       result = parse_expr ();
 304
 305       // PR15065 glom appropriate tag_ops onto the expr
 306       if (do_tag) {
 307         result = new cat_op(new tag_op(TAG_START(curr_subexpression)), result);
 308         result = new cat_op(result, new tag_op(TAG_END(curr_subexpression)));
 309       } else {
 310         // XXX: workaround for certain error checking test cases which
 311         // would otherwise produce divergent behaviour without tag_ops
 312         // (e.g. "^*" vs "(^)*").
 313         result = new cat_op(result, new null_op);
 314       }
 315
 316       expect (')');
 317     }
 318   else if (c == '^' || c == '$')
 319     {
 320       result = new anchor_op(c);
 321     }
 322   else // escaped or ordinary character -- not yet swallowed
 323     {
 324       string accumulate;
 325       rchar d = 0;
 326
 327       while (c && ( ! isspecial (c) || c == '\\' ))
 328         {
 329           if (c == '\\')
 330             {
 331               cur.next ();
 332               c = cur.peek ();
 333             }
 334
 335           cur.next ();
 336           d = cur.peek ();
 337
 338           /* if we end in a closure, it should only govern the last character */
 339           if (d == '*' || d == '+' || d == '?' || d == '{')
 340             {
 341               /* save the last character */
 342               d = c; break;
 343             }
 344
 345           accumulate.push_back (c);
 346           c = d; d = 0;
 347         }
 348
 349       result = str_to_re (accumulate);
 350
 351       /* separately deal with the last character before a closure */
 352       if (d != 0) {
 353         old_result = result; /* will add it back outside closure at the end */
 354         result = str_to_re (string(1,d));
 355       }
 356     }
 357
 358   /* parse closures or other postfix operators */
 359   c = cur.peek ();
 360   while (c == '*' || c == '+' || c == '?' || c == '{')
 361     {
 362       cur.next ();
 363
 364       /* closure-type operators applied to $^ are definitely not kosher */
 365       if (result->type_of() == "anchor_op")
 366         {
 367           parse_error(_F("postfix closure '%c' applied to anchoring operator", c));
 368         }
 369
 370       if (c == '*')
 371         {
 372           result = make_alt (new close_op(result), new null_op);
 373         }
 374       else if (c == '+')
 375         {
 376           result = new close_op(result);
 377         }
 378       else if (c == '?')
 379         {
 380           result = make_alt (result, new null_op);
 381         }
 382       else if (c == '{')
 383         {
 384           int minsize = parse_number ();
 385           int maxsize = -1;
 386
 387           c = cur.next ();
 388           if (c == ',')
 389             {
 390               c = cur.peek ();
 391               if (c == '}')
 392                 {
 393                   cur.next ();
 394                   maxsize = -1;
 395                 }
 396               else if (isdigit (c))
 397                 {
 398                   maxsize = parse_number ();
 399                   expect ('}');
 400                 }
 401               else
 402                 parse_error(_("expected '}' or number"), cur.pos);
 403             }
 404           else if (c == '}')
 405             {
 406               maxsize = minsize;
 407             }
 408           else
 409             parse_error(_("expected ',' or '}'"));
 410
 411           /* optimize {0,0}, {0,} and {1,} */
 412           if (!do_tag && minsize == 0 && maxsize == 0)
 413             {
 414               // XXX: this optimization is only used when
 415               // subexpression-extraction is disabled
 416               delete result;
 417               result = new null_op;
 418             }
 419           else if (minsize == 0 && maxsize == -1)
 420             {
 421               result = make_alt (new close_op(result), new null_op);
 422             }
 423           else if (minsize == 1 && maxsize == -1)
 424             {
 425               result = new close_op(result);
 426             }
 427           else
 428             {
 429               result = new closev_op(result, minsize, maxsize);
 430             }
 431         }
 432
 433       c = cur.peek ();
 434     }
 435
 436   if (old_result)
 437     result = new cat_op(old_result, result);
 438
 439   return result;
 440 }
 441
 442 regexp *
 443 regex_parser::parse_char_range ()
 444 {
 445   range *ran = NULL;
 446
 447   // check for inversion
 448   bool inv = false;
 449   rchar c = cur.peek ();
 450   if (c == '^')
 451     {
 452       inv = true;
 453       cur.next ();
 454     }
 455
 456   for (;;)
 457     {
 458       // break on string end whenever we encounter it
 459       if (cur.finished) parse_error(_("unclosed character class"));
 460
 461       range *add = stapregex_getrange (cur);
 462       range *new_ran = ( ran != NULL ? range_union(ran, add) : add );
 463       delete ran; if (new_ran != add) delete add;
 464       ran = new_ran;
 465
 466       // break on ']' (except at the start of the class)
 467       c = cur.peek ();
 468       if (c == ']')
 469         break;
 470     }
 471
 472   if (inv)
 473     {
 474       range *new_ran = range_invert(ran);
 475       delete ran;
 476       ran = new_ran;
 477     }
 478
 479   if (ran == NULL)
 480     return new null_op;
 481
 482   return new match_op(ran);
 483 }
 484
 485 unsigned
 486 regex_parser::parse_number ()
 487 {
 488   string digits;
 489
 490   rchar c = cur.peek ();
 491   while (c && isdigit (c))
 492     {
 493       cur.next ();
 494       digits.push_back (c);
 495       c = cur.peek ();
 496     }
 497
 498   if (digits == "") parse_error(_("expected number"), cur.pos);
 499
 500   char *endptr = NULL;
 501   int val = strtol (digits.c_str (), &endptr, 10);
 502
 503   if (*endptr != '\0' || errno == ERANGE) // paranoid error checking
 504     parse_error(_F("could not parse number %s", digits.c_str()), cur.pos);
 505 #define MAX_DFA_REPETITIONS 12345
 506   if (val >= MAX_DFA_REPETITIONS) // XXX: is there a more sensible max size?
 507     parse_error(_F("%s is too large", digits.c_str()), cur.pos);
 508
 509   return atoi (digits.c_str ());
 510 }
 511
 512 // ------------------------------------------------------------------------
 513
 514 std::map<std::string, range *> named_char_classes;
 515
 516 range *
 517 named_char_class (const string& name)
 518 {
 519   // static initialization of table
 520   if (named_char_classes.empty())
 521     {
 522       // original source for these is http://www.regular-expressions.info/posixbrackets.html
 523       // also checked against (intended to match) the c stdlib isFOO() chr class functions
 524       named_char_classes["alpha"] = new range("A-Za-z");
 525       named_char_classes["alnum"] = new range("A-Za-z0-9");
 526       named_char_classes["blank"] = new range(" \t");
 527       named_char_classes["cntrl"] = new range("\x01-\x1F\x7F"); // XXX: include \x00 in range? -- probably not!
 528       named_char_classes["d"] = named_char_classes["digit"] = new range("0-9");
 529       named_char_classes["xdigit"] = new range("0-9a-fA-F");
 530       named_char_classes["graph"] = new range("\x21-\x7E");
 531       named_char_classes["l"] = named_char_classes["lower"] = new range("a-z");
 532       named_char_classes["print"] = new range("\x20-\x7E");
 533       named_char_classes["punct"] = new range("!\"#$%&'()*+,./:;<=>?@[\\]^_`{|}~-");
 534       named_char_classes["s"] = named_char_classes["space"] = new range(" \t\r\n\v\f");
 535       named_char_classes["u"] = named_char_classes["upper"] = new range("A-Z");
 536     }
 537
 538   if (named_char_classes.find(name) == named_char_classes.end())
 539     {
 540       throw regex_error (_F("unknown character class '%s'", name.c_str())); // XXX: position unknown
 541     }
 542
 543   return new range(*named_char_classes[name]);
 544 }
 545
 546 range *
 547 stapregex_getrange (cursor& cur)
 548 {
 549   rchar c = cur.peek ();
 550
 551   if (c == '\\')
 552     {
 553       // Grab escaped char regardless of what it is.
 554       cur.next (); c = cur.peek (); cur.next ();
 555     }
 556   else if (c == '[')
 557     {
 558       // Check for '[:' digraph.
 559       rchar old_c = c; cur.next (); c = cur.peek ();
 560
 561       if (c == ':')
 562         {
 563           cur.next (); c = cur.peek (); // skip ':'
 564           string charclass;
 565
 566           for (;;)
 567             {
 568               if (cur.finished)
 569                 throw regex_error (_F("unclosed character class '[:%s'", charclass.c_str()), cur.pos);
 570
 571               if (cur.has(2) && c == ':' && (*cur.input)[cur.pos] == ']')
 572                 {
 573                   cur.next (); cur.next (); // skip ':]'
 574                   return named_char_class(charclass);
 575                 }
 576
 577               charclass.push_back(c); cur.next(); c = cur.peek();
 578             }
 579         }
 580       else
 581         {
 582           // Backtrack; fall through to processing c.
 583           c = old_c;
 584         }
 585     }
 586   else
 587     cur.next ();
 588
 589   rchar lb = c, ub;
 590
 591   if (!cur.has(2) || cur.peek () != '-' || (*cur.input)[cur.pos] == ']')
 592     {
 593       ub = lb;
 594     }
 595   else
 596     {
 597       cur.next (); // skip '-'
 598       ub = cur.peek ();
 599
 600       if (ub < lb)
 601         throw regex_error (_F("Inverted character range %c-%c", lb, ub), cur.pos);
 602
 603       cur.next ();
 604     }
 605
 606   return new range(lb, ub);
 607 }
 608
 609 };
 610
 611 /* vim: set sw=2 ts=8 cino=>4,n-2,{2,^-2,t0,(0,u0,w1,M1 : */