Optimize string usage in the lexer

author Josh Stone <jistone@redhat.com>
Thu, 9 Jul 2009 19:43:00 +0000 (12:43 -0700)

committer Josh Stone <jistone@redhat.com>
Thu, 9 Jul 2009 23:17:14 +0000 (16:17 -0700)

diff --git a/parse.cxx b/parse.cxx

index a26d594cf98066e7b96ea03178f2b1f24b6daf33..028ac71a1b51e4cc1fa4f0af88f9a241d415c800 100644 (file)
--- a/parse.cxx
+++ b/parse.cxx
@@ -583,16 +583,38 @@ parser::peek_kw (std::string const & kw)
  
  
  
-lexer::lexer (istream& i, const string& in, systemtap_session& s):
-  input (i), input_name (in), input_contents (""),
-  input_pointer (0), cursor_suspend_count(0),
-  cursor_line (1), cursor_column (1), session(s),
-  current_file (0)
-{
-  char c;
-  while(input.get(c))
-    input_contents.push_back(c);
-}
+lexer::lexer (istream& input, const string& in, systemtap_session& s):
+  input_name (in), input_contents (""), input_pointer (0),
+  cursor_suspend_count(0), cursor_line (1), cursor_column (1),
+  session(s), current_file (0)
+{
+  getline(input, input_contents, '\0');
+  input_pointer = input_contents.data();
+  input_end = input_contents.data() + input_contents.size();
+
+  if (keywords.empty())
+    {
+      keywords.insert("probe");
+      keywords.insert("global");
+      keywords.insert("function");
+      keywords.insert("if");
+      keywords.insert("else");
+      keywords.insert("for");
+      keywords.insert("foreach");
+      keywords.insert("in");
+      keywords.insert("limit");
+      keywords.insert("return");
+      keywords.insert("delete");
+      keywords.insert("while");
+      keywords.insert("break");
+      keywords.insert("continue");
+      keywords.insert("next");
+      keywords.insert("string");
+      keywords.insert("long");
+    }
+}
+
+set<string> lexer::keywords;
  
  std::string
  lexer::get_input_contents ()
@@ -609,21 +631,20 @@ lexer::set_current_file (stapfile* f)
  int
  lexer::input_peek (unsigned n)
  {
-  if (input_contents.size() > (input_pointer + n))
-    return (int)(unsigned char)input_contents[input_pointer+n];
-  else
-    return -1;
+  if (input_pointer + n >= input_end)
+    return -1; // EOF
+  return (unsigned char)*(input_pointer + n);
  }
  
  
  int
  lexer::input_get ()
  {
-  int c = input_peek (0);
-  input_pointer ++;
-
+  int c = input_peek();
    if (c < 0) return c; // EOF
  
+  ++input_pointer;
+
    if (cursor_suspend_count)
      // Track effect of input_put: preserve previous cursor/line_column
      // until all of its characters are consumed.
@@ -648,9 +669,12 @@ lexer::input_get ()
  void
  lexer::input_put (const string& chars)
  {
-  // clog << "[put:" << chars << " @" << input_pointer << "]";
-  input_contents.insert (input_contents.begin() + input_pointer, chars.begin(), chars.end());
+  size_t pos = input_pointer - input_contents.data();
+  // clog << "[put:" << chars << " @" << pos << "]";
+  input_contents.insert (pos, chars);
    cursor_suspend_count += chars.size();
+  input_pointer = input_contents.data() + pos;
+  input_end = input_contents.data() + input_contents.size();
  }
  
  
@@ -676,7 +700,6 @@ lexer::scan (bool wildcard)
      }
  
    int c = input_get();
-  int c2 = input_peek ();
    // clog << "{" << (char)c << (char)c2 << "}";
    if (c < 0)
      {
@@ -687,6 +710,8 @@ lexer::scan (bool wildcard)
    if (isspace (c))
      goto skip;
  
+  int c2 = input_peek ();
+
    // Paste command line arguments as character streams into
    // the beginning of a token.  $1..$999 go through as raw
    // characters; @1..@999 are quoted/escaped as strings.
@@ -740,23 +765,7 @@ lexer::scan (bool wildcard)
            c2 = input_peek ();
          }
  
-      if (n->content    == "probe"
-          || n->content == "global"
-          || n->content == "function"
-          || n->content == "if"
-          || n->content == "else"
-          || n->content == "for"
-          || n->content == "foreach"
-          || n->content == "in"
-          || n->content == "limit"
-          || n->content == "return"
-          || n->content == "delete"
-          || n->content == "while"
-          || n->content == "break"
-          || n->content == "continue"
-          || n->content == "next"
-          || n->content == "string"
-          || n->content == "long")
+      if (keywords.count(n->content))
          n->type = tok_keyword;
  
        return n;
@@ -767,23 +776,15 @@ lexer::scan (bool wildcard)
        n->type = tok_number;
        n->content = (char) c;
  
-      while (1)
+      while (isalnum (c2))
         {
-         int c2 = input_peek ();
-         if (c2 < 0)
-           break;
-
            // NB: isalnum is very permissive.  We rely on strtol, called in
            // parser::parse_literal below, to confirm that the number string
            // is correctly formatted and in range.
  
-         if (isalnum (c2))
-           {
-             n->content.push_back (c2);
-             input_get ();
-           }
-         else
-           break;
+          input_get ();
+          n->content.push_back (c2);
+          c2 = input_peek ();
         }
        return n;
      }
@@ -835,25 +836,21 @@ lexer::scan (bool wildcard)
  
    else if (ispunct (c))
      {
-      int c2 = input_peek ();
        int c3 = input_peek (1);
-      string s1 = string("") + (char) c;
-      string s2 = (c2 > 0 ? s1 + (char) c2 : s1);
-      string s3 = (c3 > 0 ? s2 + (char) c3 : s2);
  
        // NB: if we were to recognize negative numeric literals here,
        // we'd introduce another grammar ambiguity:
        // 1-1 would be parsed as tok_number(1) and tok_number(-1)
        // instead of tok_number(1) tok_operator('-') tok_number(1)
  
-      if (s1 == "#") // shell comment
+      if (c == '#') // shell comment
          {
            unsigned this_line = cursor_line;
            do { c = input_get (); }
            while (c >= 0 && cursor_line == this_line);
            goto skip;
          }
-      else if (s2 == "//") // C++ comment
+      else if ((c == '/' && c2 == '/')) // C++ comment
          {
            unsigned this_line = cursor_line;
            do { c = input_get (); }
@@ -862,15 +859,15 @@ lexer::scan (bool wildcard)
          }
        else if (c == '/' && c2 == '*') // C comment
         {
+          (void) input_get (); // swallow '*' already in c2
+          c = input_get ();
            c2 = input_get ();
-          unsigned chars = 0;
            while (c2 >= 0)
              {
-              chars ++; // track this to prevent "/*/" from being accepted
+              if (c == '*' && c2 == '/')
+                break;
                c = c2;
                c2 = input_get ();
-              if (chars > 1 && c == '*' && c2 == '/')
-                break;
              }
            goto skip;
         }
@@ -878,73 +875,63 @@ lexer::scan (bool wildcard)
          {
            n->type = tok_embedded;
            (void) input_get (); // swallow '{' already in c2
-          while (true)
+          c = input_get ();
+          c2 = input_get ();
+          while (c2 >= 0)
              {
-              c = input_get ();
-              if (c < 0) // EOF
-                {
-                  n->type = tok_junk;
-                  break;
-                }
-              if (c == '%')
-                {
-                  c2 = input_peek ();
-                  if (c2 == '}')
-                    {
-                      (void) input_get (); // swallow '}' too
-                      break;
-                    }
-                }
+              if (c == '%' && c2 == '}')
+                return n;
                n->content += c;
+              c = c2;
+              c2 = input_get ();
              }
+          n->type = tok_junk;
            return n;
          }
  
        // We're committed to recognizing at least the first character
        // as an operator.
        n->type = tok_operator;
+      n->content = c;
  
        // match all valid operators, in decreasing size order
-      if (s3 == "<<<" ||
-          s3 == "<<=" ||
-          s3 == ">>=")
+      if ((c == '<' && c2 == '<' && c3 == '<') ||
+          (c == '<' && c2 == '<' && c3 == '=') ||
+          (c == '>' && c2 == '>' && c3 == '='))
          {
-          n->content = s3;
+          n->content += c2;
+          n->content += c3;
            input_get (); input_get (); // swallow other two characters
          }
-      else if (s2 == "==" ||
-               s2 == "!=" ||
-               s2 == "<=" ||
-               s2 == ">=" ||
-               s2 == "+=" ||
-               s2 == "-=" ||
-               s2 == "*=" ||
-               s2 == "/=" ||
-               s2 == "%=" ||
-               s2 == "&=" ||
-               s2 == "^=" ||
-               s2 == "|=" ||
-               s2 == ".=" ||
-               s2 == "&&" ||
-               s2 == "||" ||
-               s2 == "++" ||
-               s2 == "--" ||
-               s2 == "->" ||
-               s2 == "<<" ||
-               s2 == ">>" ||
+      else if ((c == '=' && c2 == '=') ||
+               (c == '!' && c2 == '=') ||
+               (c == '<' && c2 == '=') ||
+               (c == '>' && c2 == '=') ||
+               (c == '+' && c2 == '=') ||
+               (c == '-' && c2 == '=') ||
+               (c == '*' && c2 == '=') ||
+               (c == '/' && c2 == '=') ||
+               (c == '%' && c2 == '=') ||
+               (c == '&' && c2 == '=') ||
+               (c == '^' && c2 == '=') ||
+               (c == '|' && c2 == '=') ||
+               (c == '.' && c2 == '=') ||
+               (c == '&' && c2 == '&') ||
+               (c == '|' && c2 == '|') ||
+               (c == '+' && c2 == '+') ||
+               (c == '-' && c2 == '-') ||
+               (c == '-' && c2 == '>') ||
+               (c == '<' && c2 == '<') ||
+               (c == '>' && c2 == '>') ||
                 // preprocessor tokens
-               s2 == "%(" ||
-               s2 == "%?" ||
-               s2 == "%:" ||
-               s2 == "%)")
+               (c == '%' && c2 == '(') ||
+               (c == '%' && c2 == '?') ||
+               (c == '%' && c2 == ':') ||
+               (c == '%' && c2 == ')'))
          {
-          n->content = s2;
+          n->content += c2;
            input_get (); // swallow other character
          }
-      else
-        {
-          n->content = s1;
-        }
  
        return n;
      }
diff --git a/parse.h b/parse.h

index 59046bf3039d769c30cf9b5b66c0c6caa2e69120..4cc4f7b21308e4579e55e547ae8ab9dbf75174ad 100644 (file)
--- a/parse.h
+++ b/parse.h
@@ -15,6 +15,7 @@
  #include <fstream>
  #include <iostream>
  #include <vector>
+#include <set>
  #include <stdexcept>
  #include <stdint.h>
  
@@ -78,19 +79,19 @@ public:
    void set_current_file (stapfile* f);
  
  private:
-  int input_get ();
-  void input_put (int);
+  inline int input_get ();
+  inline int input_peek (unsigned n=0);
    void input_put (const std::string&);
-  int input_peek (unsigned n=0);
-  std::istream& input;
    std::string input_name;
    std::string input_contents;
-  int input_pointer; // index into input_contents
+  const char *input_pointer; // index into input_contents
+  const char *input_end;
    unsigned cursor_suspend_count;
    unsigned cursor_line;
    unsigned cursor_column;
    systemtap_session& session;
    stapfile* current_file;
+  static std::set<std::string> keywords;
  };
  
  struct probe;
author	Josh Stone <jistone@redhat.com>
	Thu, 9 Jul 2009 19:43:00 +0000 (12:43 -0700)
committer	Josh Stone <jistone@redhat.com>
	Thu, 9 Jul 2009 23:17:14 +0000 (16:17 -0700)
parse.cxx		patch | blob | blame | history
parse.h		patch | blob | blame | history