From 534aad8b89d52c2a83eac2087d8d2dcfa0eba92a Mon Sep 17 00:00:00 2001 From: Serguei Makarov Date: Thu, 26 Jul 2012 11:02:42 -0400 Subject: [PATCH] PR11207: initial implementation of macros (definitions visible in same file). This patch augments systemtap's preprocessor with a simple macro facility, as follows: @define add(a,b) %( ((@a)+(@b)) %) x = @add(2,2) Further details are available in the comments on the preprocessor code, or in the proposal posted earlier on the mailing list. --- parse.cxx | 404 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- parse.h | 1 + 2 files changed, 392 insertions(+), 13 deletions(-) diff --git a/parse.cxx b/parse.cxx index 007e7fd10..0d05077e2 100644 --- a/parse.cxx +++ b/parse.cxx @@ -38,6 +38,8 @@ class lexer { public: bool ate_whitespace; // the most recent token followed whitespace + bool saw_tokens; // the lexer found tokens (before preprocessing occurred) + token* scan (); lexer (istream&, const string&, systemtap_session&); void set_current_file (stapfile* f); @@ -80,6 +82,20 @@ private: PP_SKIP_ELSE, } pp_state_t; + struct pp1_activation; + + struct pp_macrodecl { + const token* tok; + vector param_names; + vector body; + + pp1_activation* parent_act; + + pp_macrodecl (const token tok, pp1_activation* parent_act = NULL) + : tok(new token(tok)), parent_act(parent_act) { } + ~pp_macrodecl (); + }; + systemtap_session& session; string input_name; istream* free_input; @@ -87,7 +103,29 @@ private: bool privileged; parse_context context; - // preprocessing subordinate + // preprocessing subordinate, first pass (macros) + struct pp1_activation { + const token* tok; + unsigned cursor; // position within macro body + map params; + bool is_closure; // are params shared with an earlier activation record?
+ + pp_macrodecl* curr_macro; + + pp1_activation (const token tok, pp_macrodecl* curr_macro) + : tok(new token(tok)), cursor(0), is_closure(false), + curr_macro(curr_macro) { } + ~pp1_activation (); + }; + + map pp1_namespace; + vector pp1_state; + const token* next_pp1 (); + const token* scan_pp1 (); + const token* slurp_pp1_param (vector& param); + const token* slurp_pp1_body (vector& body); + + // preprocessing subordinate, final pass (conditionals) vector > pp_state; const token* scan_pp (); const token* skip_pp (); @@ -294,6 +332,13 @@ parser::print_error (const parse_error &pe) cerr << _("\tsaw: ") << input_name << " EOF" << endl; } + // print chained macro invocations + while (tok && tok->chain) { + tok = tok->chain; + cerr << _("\tin expansion of macro: ") << *tok << endl; + session.print_error_source (cerr, align_parse_error, tok); + } + num_errors ++; } @@ -326,7 +371,330 @@ bool eval_comparison (const OPERAND& lhs, const token* op, const OPERAND& rhs) } -// Here, we perform on-the-fly preprocessing. +// Here, we perform on-the-fly preprocessing in two passes. + +// First pass - macro declaration and expansion. +// +// The basic form of a declaration is @define SIGNATURE %( BODY %) +// where SIGNATURE is of the form macro_name (a, b, c, ...) +// and BODY can obtain the parameter contents as @a, @b, @c, .... +// Note that parameterless macros can also be declared. +// +// Macro definitions may not be nested. +// A macro is available textually after it has been defined. +// +// The basic form of a macro invocation +// for a parameterless macro is @macro_name, +// for a macro with parameters is @macro_name(param_1, param_2, ...). +// +// TODOXXX NB: this means that a parameterless macro @foo called as +// @foo(a, b, c) leaves its 'parameters' alone, rather than consuming +// them to result in a "too many parameters error". +// +// Invocations of unknown macros are left unexpanded, to allow +// the continued use of constructs such as @cast, @var, etc. 
+
+parser::pp_macrodecl::~pp_macrodecl ()
+{
+  delete tok;
+  for (vector<const token*>::iterator it = body.begin();
+       it != body.end(); it++)
+    delete *it; // the declaration owns the tokens slurped into its body
+}
+
+parser::pp1_activation::~pp1_activation ()
+{
+  delete tok;
+  if (is_closure) return; // params are shared with an earlier activation record
+  for (map<string, pp_macrodecl*>::iterator it = params.begin();
+       it != params.end(); it++)
+    delete it->second;
+}
+
+// Grab a token from the current input source (main file or macro body):
+const token*
+parser::next_pp1 ()
+{
+  if (pp1_state.empty())
+    return input.scan ();
+
+  // otherwise, we're inside a macro
+  pp1_activation* act = pp1_state.back();
+  unsigned& cursor = act->cursor;
+  if (cursor < act->curr_macro->body.size())
+    {
+      token* t = new token(*act->curr_macro->body[cursor]);
+      t->chain = act->tok; // mark chained token (borrowed: act->tok is deleted with the activation record)
+      cursor++;
+      return t;
+    }
+  else
+    return 0; // reached end of macro body
+}
+// Return the next token after macro definition/expansion, or 0 at actual EOF:
+const token*
+parser::scan_pp1 ()
+{
+  while (true)
+    {
+      const token* t = next_pp1 ();
+      if (t == 0) // EOF or end of macro body
+        {
+          if (pp1_state.empty()) // actual EOF
+            return 0;
+
+          // Exit macro and loop around to look for the next token.
+          pp1_activation* act = pp1_state.back();
+          pp1_state.pop_back(); delete act;
+          continue;
+        }
+
+      // macro definition
+      if (t->type == tok_operator && t->content == "@define")
+        {
+          if (!pp1_state.empty())
+            throw parse_error (_("'@define' forbidden inside macro body"), t);
+          delete t;
+
+          // handle macro definition
+          // (1) consume macro signature
+          t = input.scan();
+          if (! (t && t->type == tok_identifier))
+            throw parse_error (_("expected identifier"), t);
+          string name = t->content;
+
+          // check for redefinition of existing macro
+          if (pp1_namespace.find(name) != pp1_namespace.end())
+            // TODOXXX use a slightly different chaining hack to also point to
+            // pp1_namespace[name]->tok, the site of the original definition?
+            throw parse_error (_F("attempt to redefine macro '@%s' in the same file", name.c_str ()), t);
+          // TODOXXX this is only really necessary if we want to leave open the possibility of statically-scoped semantics in the future...?
+
+          // TODOXXX this cascades into further parse errors as the
+          // parser tries to parse the remaining definition...
+          if (name == "define")
+            throw parse_error (_("attempt to redefine '@define'"), t);
+          if (input.atwords.count("@" + name))
+            session.print_warning (_F("macro redefines built-in operator '@%s'", name.c_str()), t);
+
+          pp_macrodecl* decl = (pp1_namespace[name] = new pp_macrodecl(*t));
+          delete t;
+
+          // determine if the macro takes parameters
+          t = input.scan();
+          if (t && t->type == tok_operator && t->content == "(")
+            do
+              {
+                delete t; // the '(' on the first pass, a ',' on later passes
+
+                t = input.scan ();
+                if (! (t && t->type == tok_identifier))
+                  throw parse_error(_("expected identifier"), t);
+                decl->param_names.push_back(t->content);
+                delete t;
+
+                t = input.scan ();
+                if (t && t->type == tok_operator && t->content == ",")
+                  {
+                    continue;
+                  }
+                else if (t && t->type == tok_operator && t->content == ")")
+                  {
+                    delete t;
+                    t = input.scan();
+                    break;
+                  }
+                else
+                  {
+                    throw parse_error (_("expected ',' or ')'"), t);
+                  }
+              }
+            while (true);
+
+          // (2) identify & consume macro body
+          if (! (t && t->type == tok_operator && t->content == "%("))
+            // TODOXXX support & document heredoc & one-line macros
+            throw parse_error (_("expected '%('"), t);
+          // TODOXXX perhaps 'expected '%(' or '(' if params not seen?
+          delete t;
+
+          t = slurp_pp1_body (decl->body);
+          if (!t)
+            throw parse_error (_("incomplete macro definition - missing '%)'"), decl->tok);
+          delete t;
+
+          // Now loop around to look for a real token.
+          continue;
+        }
+
+      // (potential) macro invocation
+      if (t->type == tok_operator && t->content[0] == '@')
+        {
+          string name = t->content.substr(1); // strip initial '@'
+
+          // check if name refers to a real parameter or macro
+          pp_macrodecl* decl;
+          pp1_activation* act = pp1_state.empty() ? 0 : pp1_state.back();
+          if (act && act->params.find(name) != act->params.end())
+            decl = act->params[name];
+          else if (pp1_namespace.find(name) != pp1_namespace.end())
+            decl = pp1_namespace[name];
+          else // this is an ordinary @operator
+            return t;
+
+          // handle macro invocation
+          pp1_activation *new_act = new pp1_activation(*t, decl);
+          unsigned num_params = decl->param_names.size();
+          delete t; // new_act holds its own copy of the invocation token
+          // (1a) restore parameter invocation closure
+          if (num_params == 0 && decl->parent_act)
+            {
+              // NB: decl->parent_act is always safe since the
+              // parameter decl (if any) comes from an activation
+              // record which is deeper in the stack than new_act.
+
+              // decl is a macro parameter which must be evaluated in
+              // the context of the original point of invocation:
+              new_act->params = decl->parent_act->params;
+              new_act->is_closure = true; // hack to prevent double-freeing params
+              goto expand;
+            }
+
+          // (1b) consume macro parameters (if any)
+          if (num_params == 0)
+            goto expand;
+
+          // for simplicity, we do not allow macro constructs here
+          // -- if we did, we'd have to recursively call scan_pp1()
+          t = next_pp1 ();
+          if (! (t && t->type == tok_operator && t->content == "(")) // t is 0 at EOF / end of macro body
+            {
+              delete new_act;
+              throw parse_error (_F(ngettext
+                                    ("expected '(' in invocation of macro '@%s'"
+                                     " taking %d parameter",
+                                     "expected '(' in invocation of macro '@%s'"
+                                     " taking %d parameters",
+                                     num_params), name.c_str(), num_params), t);
+            }
+
+          // XXX perhaps parse/count the full number of params,
+          // so we can say "expected x, found y params" on error?
+          for (unsigned i = 0; i < num_params; i++)
+            {
+              delete t; // the '(' on the first pass, a ',' on later passes
+
+              // create parameter closure
+              string param_name = decl->param_names[i];
+              pp_macrodecl* p = (new_act->params[param_name]
+                                 = new pp_macrodecl(*new_act->tok, act));
+              // NB: *new_act->tok points to invocation, act is NULL at top level
+
+              t = slurp_pp1_param (p->body);
+
+              // check correct usage of ',' or ')'
+              if (t == 0) // hit unexpected EOF or end of macro
+                {
+                  // XXX could we pop the stack and continue parsing
+                  // the invocation, allowing macros to construct new
+                  // invocations in piecemeal fashion??
+                  const token* orig_t = new token(*new_act->tok);
+                  delete new_act;
+                  throw parse_error (_("could not find end of macro invocation"), orig_t);
+                }
+              if (t->type == tok_operator && t->content == ",")
+                {
+                  if (i + 1 == num_params)
+                    {
+                      delete new_act;
+                      throw parse_error (_F("too many parameters for macro '@%s' (expected %d)", name.c_str(), num_params), t);
+                    }
+                }
+              else if (t->type == tok_operator && t->content == ")")
+                {
+                  if (i + 1 != num_params)
+                    {
+                      delete new_act;
+                      throw parse_error (_F("too few parameters for macro '@%s' (expected %d)", name.c_str(), num_params), t);
+                    }
+                }
+              else
+                {
+                  // XXX this is, incidentally, impossible
+                  delete new_act;
+                  throw parse_error(_("expected ',' or ')' after macro parameter"), t);
+                }
+            }
+
+          delete t; // the closing ')'
+
+          // (2) set up macro expansion
+        expand:
+          pp1_state.push_back (new_act);
+
+          // Now loop around to look for a real token.
+          continue;
+        }
+
+      // Otherwise, we have an ordinary token.
+      return t;
+    }
+}
+
+// Consume a single macro invocation's parameters, heeding nested ( )
+// brackets and stopping on an unbalanced ')' or an unbracketed ','
+// (and returning the final separator token).
+const token*
+parser::slurp_pp1_param (vector<const token*>& param)
+{
+  const token* t = 0;
+  unsigned nesting = 0; // depth of ( ) brackets opened inside this parameter
+  do
+    {
+      t = next_pp1 ();
+
+      if (!t) // EOF or end of macro body
+        break;
+      if (t->type == tok_operator && t->content == "(")
+        ++nesting;
+      else if (nesting && t->type == tok_operator && t->content == ")")
+        --nesting;
+      else if (!nesting && t->type == tok_operator
+               && (t->content == ")" || t->content == ","))
+        break; // separator is returned, not stored
+      param.push_back(t);
+    }
+  while (true);
+  return t; // report ")" or "," or NULL
+}
+
+
+// Consume a macro declaration's body, heeding nested %( %) brackets.
+const token*
+parser::slurp_pp1_body (vector<const token*>& body)
+{
+  const token* t = 0;
+  unsigned nesting = 0; // depth of %( %) brackets opened inside the body
+  do
+    {
+      t = next_pp1 ();
+
+      if (!t) // ran out of input before the closing "%)"
+        break;
+      if (t->type == tok_operator && t->content == "%(")
+        ++nesting;
+      else if (nesting && t->type == tok_operator && t->content == "%)")
+        --nesting;
+      else if (!nesting && t->type == tok_operator && t->content == "%)")
+        break; // closing "%)" is returned, not stored
+      body.push_back(t);
+    }
+  while (true);
+  return t; // report final "%)" or NULL
+}
+
+// Second pass - preprocessor conditional expansion.
+//
 // The basic form is %( CONDITION %?
THEN-TOKENS %: ELSE-TOKENS %) // where CONDITION is: kernel_v[r] COMPARISON-OP "version-string" // or: arch COMPARISON-OP "arch-string" @@ -525,7 +893,7 @@ parser::scan_pp () if (pp == PP_SKIP_THEN || pp == PP_SKIP_ELSE) t = skip_pp (); else - t = input.scan (); + t = scan_pp1 (); if (t == 0) // EOF { @@ -578,9 +946,9 @@ parser::scan_pp () const token *n = NULL; do { const token *l, *op, *r; - l = input.scan (); - op = input.scan (); - r = input.scan (); + l = scan_pp1 (); + op = scan_pp1 (); + r = scan_pp1 (); if (l == 0 || op == 0 || r == 0) throw parse_error (_("incomplete condition after '%('"), t); // NB: consider generalizing to consume all tokens until %?, and @@ -599,7 +967,7 @@ parser::scan_pp () delete op; delete n; - n = input.scan (); + n = scan_pp1 (); if (n && n->type == tok_operator && n->content == "&&") continue; result |= and_result; @@ -636,7 +1004,7 @@ parser::skip_pp () { try { - t = input.scan (); + t = scan_pp1 (); } catch (const parse_error &e) { @@ -955,8 +1323,14 @@ token* lexer::scan () { ate_whitespace = false; // reset for each new token + + // XXX be very sure to restore old_saw_tokens if we return without a token: + bool old_saw_tokens = saw_tokens; + saw_tokens = true; + token* n = new token; n->location.file = current_file; + n->chain = NULL; // important safety dance skip: bool suspended = (cursor_suspend_count > 0); @@ -968,6 +1342,7 @@ skip: if (c < 0) { delete n; + saw_tokens = old_saw_tokens; return 0; } @@ -1255,7 +1630,7 @@ parser::parse () { systemtap_v_seen = 0; const token* t = peek (); - if (! t) // nice clean EOF + if (! t) // nice clean EOF, modulo any preprocessing that occurred break; empty = false; @@ -1316,7 +1691,11 @@ parser::parse () if (empty) { - cerr << _F("Input file '%s' is empty or missing.", input_name.c_str()) << endl; + // vary message depending on whether file was *actually* empty: + cerr << (input.saw_tokens + ? 
_F("Input file '%s' is empty after preprocessing.", input_name.c_str()) + : _F("Input file '%s' is empty or missing.", input_name.c_str())) + << endl; delete f; f = 0; } @@ -1731,9 +2110,8 @@ parser::parse_probe_point () // consume u next (); } - token* new_t = new token; - new_t->location = t->location; - new_t->type = t->type; + // get around const-ness of t: + token* new_t = new token(*t); new_t->content = content; delete t; t = new_t; diff --git a/parse.h b/parse.h index b56714c25..eb9921a04 100644 --- a/parse.h +++ b/parse.h @@ -46,6 +46,7 @@ struct token std::string content; std::string msg; // for tok_junk void make_junk (std::string msg); + const token* chain; // macro invocation that produced this token }; -- 2.43.5