From b8ca56fe055a72979d05dbd79f9f81ec0edbf441 Mon Sep 17 00:00:00 2001
From: Ryan Goldberg <rgoldber@redhat.com>
Date: Tue, 20 Dec 2022 11:52:34 -0500
Subject: [PATCH] Added language server feature: completion

---
 elaborate.h                              |   3 +
 language-server/stap-code-completion.cxx | 478 +++++++++++++++++++++++
 parse.cxx                                |  24 ++
 tapsets.cxx                              |   3 +-
 4 files changed, 507 insertions(+), 1 deletion(-)
 create mode 100644 language-server/stap-code-completion.cxx
diff --git a/elaborate.h b/elaborate.h
index 7bfb439b8..91c12a5b8 100644
--- a/elaborate.h
+++ b/elaborate.h
@@ -391,6 +391,7 @@ match_key
 };
 
 
+class lsp_method_text_document_completion; // Used for the language-server
 class
 match_node
 {
@@ -422,6 +423,8 @@ match_node
 
 private:
   privilege_t privilege;
+
+  friend class lsp_method_text_document_completion;
 };
 
 // ------------------------------------------------------------------------
diff --git a/language-server/stap-code-completion.cxx b/language-server/stap-code-completion.cxx
new file mode 100644
index 000000000..bbb4b8427
--- /dev/null
+++ b/language-server/stap-code-completion.cxx
@@ -0,0 +1,478 @@
+// Language server code completion
+// Copyright (C) 2022 Red Hat Inc.
+//
+// This file is part of systemtap, and is free software.  You can
+// redistribute it and/or modify it under the terms of the GNU General
+// Public License (GPL); either version 2, or (at your option) any
+// later version.
+#include "config.h"
+#include "staptree.h"
+#include "parse.h"
+#include "session.h"
+#include "elaborate.h"
+#include "stap-probe.h"
+#include "stap-language-server.h"
+
+#include <json-c/json.h>
+#include <iostream>
+#include <signal.h>
+#include <sys/select.h>
+#include <filesystem>
+#include <numeric>
+
+/* Utilities */
+
+static string
+pc2str(parse_context pc)
+{
+    switch (pc)
+    {
+    case con_unknown:  return "con_unknown";
+    case con_probe:    return "con_probe";
+    case con_function: return "con_function";
+    case con_global:   return "con_global";
+    case con_embedded: return "con_embedded";
+    }
+    return "undefined context";
+}
+
+static string join_vector(vector<string> vec, string delim)
+{
+    stringstream result;
+    for (auto it = vec.begin(); it != vec.end(); ++it)
+    {
+        result << *it;
+        if (it != vec.end() - 1)
+            result << delim;
+    }
+    return result.str();
+}
+
+static inline bool
+in_body(string &code_snippet)
+{
+    return code_snippet.find('{') != string::npos;
+}
+
+/* Split a probe signature into a vector of component-param pairs
+ * Ex. foo("a").bar(2).baz -> [ {"foo", "a"}, {bar, "2"}, {"baz", ""} ] 
+ */
+void tokenize_probe(const string &str, vector<pair<string, string>> &tokens_and_params)
+{
+    auto lex_probe_component = [&tokens_and_params](string full_token)
+    {
+        size_t p_start = full_token.find('(');
+        if (p_start != string::npos)
+        {
+            tokens_and_params.push_back({
+                full_token.substr(0, p_start),
+                full_token.substr(p_start + 1, full_token.find(')', p_start) - p_start - 1)
+            });
+        }
+        else
+        {
+            tokens_and_params.push_back({full_token, ""});
+        }
+    };
+
+    // Walk the probe signature, until a new component is found
+    // Lex the previous stringified component into a token & (possible) param
+    // It is important to consider if we're in a string since '.' can appear within a string (Ex. libc.so)
+    bool in_quote = false;
+    size_t start = 0;
+    for (size_t pos = 0; pos < str.size(); pos++)
+    {
+        char c = str[pos];
+        if (c == '.' && !in_quote)
+        {
+            lex_probe_component(str.substr(start, pos - start));
+            start = pos + 1;
+        }
+        else if (c == '\"')
+        {
+            in_quote = !in_quote;
+        }
+    }
+    // Get that last component
+    lex_probe_component(str.substr(start));
+}
+
+void lsp_method_text_document_completion::add_completion_item(string text, string type, string docs, string insert_text)
+{
+    lsp_object item;
+    item.insert("label", text);
+    if (!type.empty())
+        item.insert("detail", type);
+    if (!docs.empty())
+        item.insert("documentation", docs);
+    if (!insert_text.empty())
+    {
+        // Use the insertText if provided, this is the text that will be inserted as opposed to label
+        item.insert("insertText", insert_text);
+        // Using textEdit is preferable to insertText, but is not supported by all clients.
+        // insertText is subject to interpretation by the client side
+        // setting range's start and end to insert_position and using insert_text means append
+        // the NEW part of the completion to the original (Ex. "foob<CURSOR>"
+        // would have insert_text "ar" to get "foobar", with insert_position at <CURSOR>)
+        lsp_object text_edit;
+        text_edit.insert("range").insert("start", insert_position);
+        text_edit.insert("range").insert("end", insert_position);
+        text_edit.insert("newText", insert_text);
+        item.insert("textEdit", text_edit);
+    }
+    completion_items.push_back(item);
+}
+
+void lsp_method_text_document_completion::complete_body(string &partial_statement)
+{
+    vector<string> body_components;
+    tokenize(partial_statement, body_components, " \t\n{}");
+
+    // The completion assumes body_components[:-1] are complete and we just want to
+    // complete the final component
+    string to_complete;
+    if (body_components.size() != 0)
+        to_complete = body_components.back();
+    else
+        to_complete = "";
+
+    if (lang_server->verbose >= 2)
+        cerr << "Completing [body]: '" << to_complete << "'" << endl;
+
+    // FIXME: I miss things like print*
+
+    if (!startswith(to_complete, "@"))
+    {
+        for (auto func = lang_server->s->functions.begin(); func != lang_server->s->functions.end(); ++func)
+        {
+            functiondecl *f = func->second;
+            if (startswith(f->unmangled_name, to_complete))
+            {
+                stringstream docs;
+                f->printsig(docs);
+                add_completion_item(f->unmangled_name.to_string(), "function", docs.str());
+            }
+        }
+        for (auto global = lang_server->s->globals.begin(); global != lang_server->s->globals.end(); ++global)
+        {
+            vardecl *g = *global;
+            if (startswith(g->unmangled_name, to_complete))
+            {
+                stringstream docs;
+                g->printsig(docs);
+                add_completion_item(g->unmangled_name.to_string(), "global", docs.str());
+            }
+        }
+        // FIXME: Perhaps I should pull the keywords and atwords from lexer in parse.cxx
+        vector<string> keywords = {"probe", "global", "private", "function", "if", "else", "for", "foreach", "in",
+                                   "limit", "return", "delete", "while", "break", "continue", "next", "string", "long", "try", "catch"};
+        for (auto keyword = keywords.begin(); keyword != keywords.end(); ++keyword)
+            if (startswith(*keyword, to_complete))
+                add_completion_item(*keyword, "keyword");
+    }
+    else
+    {
+        string at_to_complete = to_complete.substr(1); // Remove the @
+        for (auto macro = lang_server->s->library_macros.begin(); macro != lang_server->s->library_macros.end(); ++macro)
+        {
+            macrodecl *m = macro->second;
+            string name = m->tok->content;
+            if (startswith(name, at_to_complete))
+            {
+                stringstream docs;
+                docs << name << "(" << join_vector(m->formal_args, ", ") << ")";
+                // insertText is used here since LS clients have trouble matching @xyz so
+                // when matching @x insert only yz
+                add_completion_item(name, "macro", docs.str(), name.substr(at_to_complete.size()));
+            }
+        }
+        vector<string> atwords = {"cast", "defined", "probewrite", "entry", "perf", "var", "avg", "count", "sum",
+                                  "min", "max", "hist_linear", "hist_log", "const", "variance", "kregister", "uregister", "kderef", "uderef"};
+        for (auto atword = atwords.begin(); atword != atwords.end(); ++atword)
+            if (startswith(*atword, at_to_complete))
+                add_completion_item(*atword, "atword");
+    }
+}
+
+void lsp_method_text_document_completion::complete_probe_signature(string &partial_signature)
+{
+    vector<pair<string, string>> probe_components;
+    tokenize_probe(partial_signature, probe_components);
+
+    // The completion assumes probe_components[:-1] are complete and we just want to
+    // complete the final component
+    string to_complete = probe_components.back().first;
+    if (lang_server->verbose >= 2)
+        cerr << "Completing [probe signature]: '" << to_complete << "'" << endl;
+
+    match_node *node = lang_server->s->pattern_root;
+    // Find the current position within the match_node tree (which will be a parent of the result(s))
+    for (auto ip = probe_components.begin(); ip != probe_components.end(); ++ip)
+    {
+        string comp = ip->first;
+        string param = ip->second;
+
+        match_key key = match_key(interned_string(comp));
+        key.have_parameter = !param.empty();
+        key.parameter_type = param.find('"') != param.npos ? pe_string : pe_long;
+
+        for (match_node::sub_map_iterator_t it = node->sub.begin(); it != node->sub.end(); ++it)
+            if (it->first.str() == key.str() && it->first.parameter_type == key.parameter_type)
+            {
+                node = it->second;
+                break;
+            }
+    }
+
+    // Find the matching node, which completes the component (to_complete)
+    for (match_node::sub_map_iterator_t it = node->sub.begin(); it != node->sub.end(); ++it)
+        if (startswith(it->first.str(), to_complete))
+        {
+            string docs = it->first.str();
+            string comp = it->first.name.to_string();
+
+            // LS clients have trouble distinguishing completion items with the same label
+            // even if their signatures are different, so having foo, foo(" and foo( serves this purpose
+            if (it->first.have_parameter)
+                comp = comp + (it->first.parameter_type == pe_string ? "(\"" : "(");
+
+            add_completion_item(comp, "probe component", docs);
+        }
+}
+
+void lsp_method_text_document_completion::complete_path(string path)
+{
+    // TODO: PATH can have wildcards which means globbing needs to be supported
+    if (path.empty())
+        return;
+
+    path = path.substr(1); // Remove the opening quote (since we're completing strings, there is no closing)
+
+    vector<string> dirs;
+    string partial_path = ""; // The path from the start (not including a slash at idx 0) to the last slash (inclusive)
+    bool absolute;
+
+    size_t last_slash = path.find_last_of('/');
+    string to_complete = path.substr(last_slash + 1);
+
+    if (path[0] == '/')
+    {
+        absolute = true;
+        dirs.push_back("/");
+
+        if (last_slash != string::npos && path.size() >= 1)
+            partial_path = path.substr(1, last_slash);
+    }
+    else
+    {
+        absolute = false;
+
+        const char *PATH = getenv("PATH");
+        if (PATH)
+            tokenize(string(PATH), dirs, string(":"));
+
+        if (last_slash != string::npos)
+            partial_path = path.substr(0, last_slash + 1);
+    }
+
+    for (auto dir : dirs)
+        try
+        {
+            // If dir is a prefix of partial_path then just look at partial_path
+            // otherwise assume that dir is in PATH and partial path is an unexplored tree within dir
+            if (!partial_path.empty() && startswith(partial_path, dir.c_str()))
+                dir = partial_path;
+            else
+                dir += partial_path;
+
+            for (const auto &file : std::filesystem::directory_iterator(dir))
+            {
+                string prefix = dir + (dir.back() != '/' ? "/" : "") + to_complete;
+                if (startswith(file.path(), prefix) && (file.is_directory() || file.is_regular_file()))
+                {
+                    string type = file.is_directory() ? "directory" : "file";
+                    if (absolute)
+                        add_completion_item(file.path(), type, "", string(file.path()).substr(prefix.size()));
+                    else
+                        add_completion_item(file.path().filename(), type, "", string(file.path()).substr(prefix.size()));
+                }
+            }
+        }
+        catch (...)
+        {} // If there is an issue just skip
+}
+
+void lsp_method_text_document_completion::complete_string(string &partial_signature)
+{
+    vector<pair<string, string>> probe_components;
+    tokenize_probe(partial_signature, probe_components);
+
+    // The completion assumes probe_components[:-1] are complete and we just want to
+    // complete the final component
+    string to_complete = probe_components.back().second; // The string param to complete
+    string completion_component = probe_components.back().first;
+    probe_components.pop_back();
+
+    if (lang_server->verbose >= 2)
+        cerr << "Completing [string]: '" << to_complete << "'" << endl;
+
+    // TODO: There are still probe components to handle; see man stapprobes
+    if (completion_component == "process" || completion_component == "procfs" || completion_component == "library") // The PATHlike components
+    {
+        return complete_path(to_complete);
+    }
+    else // Try our luck with wildcard expansion as the param, this logic is the same as that used by stap -L
+    {
+        try
+        {
+            stringstream ss(string("probe ") + partial_signature + "*\") {}");
+            probe *p = parse_synthetic_probe(*lang_server->s, ss, NULL);
+            if (p)
+            {
+                vector<derived_probe *> dps;
+
+                derive_probes(*lang_server->s, p, dps, false /* Not optional */, true /* Rethrow semantic errors */);
+                for (auto it = dps.begin(); it != dps.end(); ++it)
+                {
+                    derived_probe *p = *it;
+                    ostringstream o, docs;
+                    o << *(*it)->sole_location()->components.back()->arg;
+
+                    // TODO: Might want to grab p->locals[j] as well
+                    list<string> args;
+                    p->getargs(args);
+                    if (args.size() > 0)
+                    {
+                        docs << "Target Vars: ";
+                        for (auto ia = args.begin(); ia != args.end(); ++ia)
+                            docs << *ia << " ";
+                    }
+                    // The insertion text is just the tail of the completion
+                    add_completion_item(o.str(), "string", docs.str(), o.str().substr(to_complete.size()));
+                }
+            }
+        }
+        catch (semantic_error&){}
+    }
+}
+
+void lsp_method_text_document_completion::complete(string code)
+{
+    size_t code_start = 0;
+    // Skip magic lines [jupyter client specific]
+    while (code.size() >= 2 && code[0] == '%' && code[1] == '%')
+    {
+        code_start = code.find('\n', code_start) + 1;
+        code = code.substr(code_start);
+    }
+
+    /* Completion relies on parsing the code and seeing where there is a faliure
+     * since the faliure point shows what is expected and what is wrong there
+     * Pass 1 is called to determine the completion rule using:
+     * - The context in which p1 failed
+     * - Some information in particular about the failing token
+    */ 
+    const token *tok = pass_1(*lang_server->s, code);
+
+    if (lang_server->verbose >= 2)
+    {
+        cerr << "Completion context: " << pc2str(lang_server->code_completion_context) << endl;
+        cerr << "Code Block:" << code << endl;
+    }
+
+    switch (lang_server->code_completion_context)
+    {
+    case con_unknown:
+        if (startswith(code, "private "))
+            complete(code.substr(strlen("private ")));
+        else
+            for (string s : {"probe", "global", "private", "function"})
+                if (startswith(s, code)) add_completion_item(s, "keyword");
+        break;
+    case con_probe:
+    case con_function:
+        if (in_body(code))
+        {
+            /* Completion of a function or probe body, which are the same */
+            size_t body_start = code.find('{');
+            string body = (body_start != code.size() - 1) ? code.substr(body_start + 1) : "";
+            complete_body(body);
+        }
+        else if (lang_server->code_completion_context == con_probe && code.size() > strlen("probe"))
+        {
+            //  code is of the form 'probe...' up to and not incluing a possible body start ({, %{)
+            string partial_sig = code.substr(strlen("probe") + 1);
+            if (tok && tok->type == tok_junk && tok->junk_type == tok_junk_unclosed_quote)
+                // The special case of string completion
+                complete_string(partial_sig);
+            else
+                complete_probe_signature(partial_sig);
+        }
+        else
+        {
+            /* snippets of the form 'function ... {' don't have any sort of completion to do
+                likewise neither do snippets of the form 'probe' */
+        }
+        break;
+    case con_global:
+    case con_embedded:
+    default:
+        /* snippets of the form global x = ... don't have any sort of completion to do*/
+        /* TODO: For now we do not support C completion */
+        break;
+    }
+    return;
+}
+
+jsonrpc_response *
+lsp_method_text_document_completion::handle(json_object *p)
+{
+    lsp_object completion_params(p);
+
+    document *doc = lang_server->wspace->get_document(completion_params.extract_substruct("textDocument").extract_string("uri"));
+
+    insert_position = completion_params.extract_substruct("position");
+
+    // Determine the completion snippet (the code block needed for the completion)
+    vector<string> completion_block;
+    vector<string> start_tokens = {"probe", "function", "global", "private"};
+
+    // NB: For the right-strip, DON'T remove spaces since token seperation requires it ex. "probe" vs "probe "
+    auto strip = [](string s)
+        { return s.erase(0, s.find_first_not_of("\t\n\r ")).erase(s.find_last_not_of("\t\n\r") + 1); };
+
+    vector<string> lines = doc->get_lines();
+    int cursor_line_num = insert_position.extract_int("line");
+
+    // The line has the cursor on it so truncate at cursor pos
+    lines.at(cursor_line_num) = lines.at(cursor_line_num).substr(0, insert_position.extract_int("character") + 1);
+
+    // Keep appending lines until a valid start token (or at least the start of one) is found.
+    for (int i = cursor_line_num; i >= 0; i--)
+    {
+        string line = strip(lines[i]);
+        completion_block.push_back(line); // The last step will be to reverse the whole vector and merge it
+        vector<string> words;
+        tokenize(line, words, " ");
+        if (words.size() > 0 && any_of(start_tokens.cbegin(), start_tokens.cend(), [words](string s_token)
+                                       { return startswith(s_token, words[0]); }))
+            break; // Found the context start (the absolute start is handled by the loop condition itself)
+    }
+    reverse(completion_block.begin(), completion_block.end());
+    string code = join_vector(completion_block, "\n");
+
+    //TODO: The lines from wherever i ends to 0 might contain functions/globals/macros which should be included as completions. 
+    // Pass them through a pass1 to get them
+    // BUT, Ex. how to handle syntax errors i.e 2 def but the first of them has an issue I'd still want #2
+
+    lsp_object completion_list;
+    // Allows the client to be more efficient, as it doesn't need to requery when another char is added, just filter with
+    // the same completion list (Ex. completing 'pro' offers 'probe' so don't bother recomputing for 'prob' its the same)
+    completion_list.insert("isIncomplete", false);
+    complete(code);
+    completion_list.insert("items", completion_items);
+
+    jsonrpc_response *res = new jsonrpc_response;
+    res->set_result(completion_list.to_json());
+    return res;
+}
\ No newline at end of file
diff --git a/parse.cxx b/parse.cxx
index fa476d335..912ce65e5 100644
--- a/parse.cxx
+++ b/parse.cxx
@@ -16,6 +16,10 @@
 #include "util.h"
 #include "stringtable.h"
 
+#ifdef HAVE_JSON_C
+#include "language-server/stap-language-server.h"
+#endif
+
 #include <iostream>
 
 #include <fstream>
@@ -1207,6 +1211,11 @@ parser::next ()
     throw PARSE_ERROR (_("unexpected end-of-file"));
 
   last_t = next_t;
+  #ifdef HAVE_JSON_C
+  if(this->session.language_server_mode)
+    this->session.language_server->code_completion_target = last_t;
+  #endif
+
   // advance by zeroing next_t
   next_t = 0;
   return last_t;
@@ -1221,6 +1230,10 @@ parser::peek ()
 
   // don't advance by zeroing next_t
   last_t = next_t;
+  #ifdef HAVE_JSON_C
+  if(this->session.language_server_mode)
+    this->session.language_server->code_completion_target = last_t;
+  #endif
   return next_t;
 }
 
@@ -1233,6 +1246,10 @@ parser::swallow ()
   delete last_t;
   // advance by zeroing next_t
   last_t = next_t = 0;
+  #ifdef HAVE_JSON_C
+  if(this->session.language_server_mode)
+    this->session.language_server->code_completion_target = last_t;
+  #endif
 }
 
 
@@ -1973,6 +1990,13 @@ parser::parse ()
 	{
 	  print_error (pe, errs_as_warnings);
 
+    #ifdef HAVE_JSON_C
+    // This is the spot where the parsing failed i.e the place to complete, so send the data up
+    if(this->session.language_server_mode){
+      this->session.language_server->code_completion_context = context;
+      throw pe;
+    }
+    #endif
           // XXX: do we want tok_junk to be able to force skip_some behaviour?
           if (pe.skip_some) // for recovery
             // Quietly swallow all tokens until the next keyword we can start parsing from.
diff --git a/tapsets.cxx b/tapsets.cxx
index 46b10f26e..2e8a56c5a 100644
--- a/tapsets.cxx
+++ b/tapsets.cxx
@@ -5696,7 +5696,8 @@ dwarf_derived_probe::dwarf_derived_probe(interned_string funcname,
       // Save the local variables for listing mode. If the scope_die is null,
       // local vars aren't accessible, so no need to invoke saveargs (PR10820).
       if (!null_die(scope_die) &&
-          q.sess.dump_mode == systemtap_session::dump_matched_probes_vars)
+          (q.sess.dump_mode == systemtap_session::dump_matched_probes_vars || 
+          q.sess.language_server_mode))
         saveargs(q, scope_die, dwfl_addr);
   }
 
-- 
2.43.5