perlnavigator-server
Version:
Perl language server
674 lines (578 loc) • 25.2 kB
text/typescript
import { PerlDocument, PerlElem, PerlSymbolKind, ParseType, TagKind, ElemSource } from "./types";
import { TextDocument } from "vscode-languageserver-textdocument";
import Uri from "vscode-uri";
import fs = require("fs");
import path = require("path");
import vsctm = require("vscode-textmate");
import oniguruma = require("vscode-oniguruma");
/**
 * Creates the empty PerlDocument that the parse functions fill in for one source file.
 *
 * @param textDocument Document about to be parsed; only its `uri` is read.
 * @returns A PerlDocument whose lookup maps are all empty and whose `uri` is the document's URI.
 */
function init_doc(textDocument: TextDocument): PerlDocument {
    // The file-system path of the document used to be derived here but was never read,
    // so that dead computation has been dropped.
    const perlDoc: PerlDocument = {
        elems: new Map(),
        canonicalElems: new Map(),
        autoloads: new Map(),
        imported: new Map(),
        parents: new Map(),
        uri: textDocument.uri,
    };
    return perlDoc;
}
// Mutable cursor shared by the parse functions while a document is scanned line by line.
type ParserState = {
// Text of the line currently being examined (parseDocument copies codeArray[line_number] here).
stmt: string;
// Zero-based index of the current line within codeArray.
line_number: number;
// Set by localVars and subs when a declaration has not been closed on its line;
// localVars then treats the following line as part of that declaration.
var_continues: boolean;
// Name taken from the most recent `package` or `class` line; stamped onto newly created elements.
package_name: string;
// URI of the document being parsed; stamped onto newly created elements.
uri: string;
// Document that accumulates the parse results.
perlDoc: PerlDocument;
// Requested parse depth. Block end lines are only searched for when this is ParseType.outline.
parseType: ParseType;
// Lines of the document; during parseDocument these are the lines returned by cleanCode.
codeArray: string[];
};
// A line matcher: inspects state.stmt, records what it recognizes, and returns true when it
// handled the line (parseDocument then stops trying further matchers for that line).
type ParseFunc = (state: ParserState) => boolean;
/**
 * Loads a Perl file from disk and parses it.
 *
 * @param uri URI of the file to read.
 * @param parseType How much of the file to analyse.
 * @returns The parsed document, or `undefined` when the file could not be read
 *          (for instance because it does not exist).
 */
export async function parseFromUri(uri: string, parseType: ParseType): Promise<PerlDocument | undefined> {
    const fsPath = Uri.parse(uri).fsPath;

    let text: string;
    try {
        text = await fs.promises.readFile(fsPath, "utf8");
    } catch {
        // Unreadable file: report "nothing parsed" rather than failing.
        return undefined;
    }

    const inMemoryDoc = TextDocument.create(uri, "perl", 1, text);
    return await parseDocument(inMemoryDoc, parseType);
}
/**
 * Parses a document line by line and collects the symbols that the requested parse type asks for.
 *
 * Every line is offered to an ordered list of matchers; the first matcher that returns true
 * claims the line and the remaining matchers are skipped for it.
 *
 * @param textDocument Document to parse.
 * @param parseType Selects which matchers run.
 * @returns The populated PerlDocument.
 */
export async function parseDocument(textDocument: TextDocument, parseType: ParseType): Promise<PerlDocument> {
    // Packages always need to be found to be able to categorize the elements, so they go first.
    const pipeline: ParseFunc[] = [packages];
    if (parseType === ParseType.outline) {
        pipeline.push(subs, labels, constants, fields, imports, dancer);
    } else if (parseType === ParseType.selfNavigation) {
        pipeline.push(knownObj, localVars, subs, labels, constants, fields, imports, autoloads, dancer);
    } else if (parseType === ParseType.refinement) {
        pipeline.push(subs, fields);
    }

    const perlDoc = init_doc(textDocument);
    const state: ParserState = {
        stmt: "",
        line_number: 0,
        package_name: "",
        perlDoc: perlDoc,
        uri: textDocument.uri,
        var_continues: false,
        codeArray: await cleanCode(textDocument, perlDoc, parseType),
        parseType: parseType,
    };

    for (state.line_number = 0; state.line_number < state.codeArray.length; state.line_number++) {
        const line = state.codeArray[state.line_number];
        state.stmt = line;
        if (line) {
            // Empty lines are skipped; otherwise stop at the first matcher that accepts the line.
            for (const matcher of pipeline) {
                if (matcher(state)) break;
            }
        }
    }
    return perlDoc;
}
/**
 * Recognizes a variable that is initialised from a constructor call, either `Class->new`
 * or the indirect `new Class` form, and records the variable with the class as its type detail.
 *
 * @param state Parser state; `state.stmt` is the line under test.
 * @returns true when the line was such a declaration, false otherwise.
 */
function knownObj(state: ParserState): boolean {
    // TODO, allow specifying list of constructor names as config
    const arrowCtor = /^(?:my|our|local|state)\s+(\$\w+)\s*\=\s*([\w\:]+)\-\>new\s*(?:\((?!.*\)\->)|;)/;
    const indirectCtor = /^(?:my|our|local|state)\s+(\$\w+)\s*\=\s*new (\w[\w\:]+)\s*(?:\((?!.*\)\->)|;)/;

    const hit = state.stmt.match(arrowCtor) || state.stmt.match(indirectCtor);
    if (!hit) {
        return false;
    }

    // hit[1] is the variable, hit[2] the class it was constructed from.
    MakeElem(hit[1], PerlSymbolKind.LocalVar, hit[2], state);
    // The line has been consumed here, so no declaration carries over to the next line.
    state.var_continues = false;
    return true;
}
/**
 * Records lexical variables declared on the current line.
 *
 * Handles, in this order: plain `my/our/local/state` declarations (including lines that continue
 * a declaration started earlier), `for/foreach my $x` loop variables with an optional label,
 * `if (my ...)`-style declarations inside a condition, and `catch ($e)` exception variables.
 *
 * @param state Parser state; `state.stmt` is the line under test and `state.var_continues` is updated.
 * @returns true when the line was handled, false otherwise.
 */
function localVars(state: ParserState): boolean {
    const stmt = state.stmt;

    // Either a declaration is still open from a previous line, or this line starts one.
    if (state.var_continues || /^(?:my|our|local|state)\b/.test(stmt)) {
        // The declaration stays open unless the line has a semicolon, signature end, or sub start.
        // Comments can trip this up, which is accepted.
        state.var_continues = !/[\)\=\}\{;]/.test(stmt);

        const declared = stmt
            // Drop the leading declarator keyword, if present
            .replace(/^(my|our|local|state)\s+/, "")
            // Drop any assignment piece. Breaks with signature defaults
            .replace(/\s*=.*/, "")
            // Drop the part where a sub body starts (signatures), while exempting default {} args
            .replace(/\s*(\{[^\}]|\)).*/, "");

        // Whatever is left that looks like $name, @name or %name is a declared variable.
        for (const found of declared.matchAll(/([\$\@\%][\w:]+)\b/g)) {
            MakeElem(found[1], PerlSymbolKind.LocalVar, "", state);
        }
        return true;
    }

    // Lexical loop variables, potentially with a label in front: LABEL: foreach my $foo
    const loopVar = stmt.match(/^(?:(\w+)\s*:(?!\:))?\s*(?:for|foreach)\s+my\s+(\$[\w]+)\b/);
    if (loopVar) {
        if (loopVar[1]) {
            MakeElem(loopVar[1], PerlSymbolKind.Label, "", state);
        }
        MakeElem(loopVar[2], PerlSymbolKind.LocalVar, "", state);
        return true;
    }

    // Declarations inside a condition, e.g. if (my ($foo, $bar) = ...)
    if (/^(?:\}\s*elsif|if|unless|while|until|for)?\s*\(\s*my\b(.*)$/.test(stmt)) {
        // Only the left-hand side of any assignment is scanned for names.
        const leftSide = stmt.replace(/\s*=.*/, "");
        for (const found of leftSide.matchAll(/([\$\@\%][\w]+)\b/g)) {
            MakeElem(found[1], PerlSymbolKind.LocalVar, "", state);
        }
        return true;
    }

    // Try-catch exception variables
    const caught = stmt.match(/^\}?\s*catch\s*\(\s*(\$\w+)\s*\)\s*\{?$/);
    if (caught) {
        MakeElem(caught[1], PerlSymbolKind.LocalVar, "", state);
        return true;
    }

    return false;
}
/**
 * Recognizes `package`, `class` and `role` declarations at the start of a line.
 *
 * `package` and `class` update `state.package_name`; `role` deliberately does not.
 *
 * @param state Parser state; `state.stmt` is the line under test.
 * @returns true when the line was one of these declarations, false otherwise.
 */
function packages(state: ParserState): boolean {
    const stmt = state.stmt;

    const pkg = stmt.match(/^package\s+([\w:]+)/);
    if (pkg) {
        state.package_name = pkg[1];
        MakeElem(state.package_name, PerlSymbolKind.Package, "", state, PackageEndLine(state));
        return true;
    }

    // Class declaration for Object::Pad, Corinna, or Moops
    const cls = stmt.match(/^class\s+([\w:]+)/);
    if (cls) {
        state.package_name = cls[1];
        MakeElem(cls[1], PerlSymbolKind.Class, "", state, PackageEndLine(state));
        return true;
    }

    const role = stmt.match(/^role\s+([\w:]+)/);
    if (role) {
        // Being cautious: a role does not change the current package name.
        MakeElem(role[1], PerlSymbolKind.Role, "", state, SubEndLine(state));
        return true;
    }

    return false;
}
/**
 * Recognizes `sub`, `method` and (only when Function::Parameters has been imported) `fun`
 * declarations, records the parameters found on the declaration line as local variables,
 * and records the sub itself together with its collected signature.
 *
 * @param state Parser state; `state.stmt` is the line under test and `state.var_continues` is updated.
 * @returns true when the line declared a sub or method, false otherwise.
 */
function subs(state: ParserState): boolean {
    const stmt = state.stmt;

    // Capture groups in all three patterns: 1 = keyword, 2 = name, 3 = ":method" marker, 4 = text up to "{"
    const decl =
        stmt.match(/^(?:async\s+)?(sub)\s+([\w:]+)(\s+:method)?([^{]*)/) ||
        stmt.match(/^(?:async\s+)?(method)\s+\$?([\w:]+)()([^{]*)/) ||
        (state.perlDoc.imported.has("Function::Parameters") ? stmt.match(/^(fun)\s+([\w:]+)()([^{]*)/) : null);
    if (!decl) {
        return false;
    }

    const [, keyword, subName, methodMarker, signature] = decl;
    const kind = keyword === "method" || methodMarker ? PerlSymbolKind.LocalMethod : PerlSymbolKind.LocalSub;
    const endLine = SubEndLine(state);

    // The declaration continues onto the next line unless this one ends with ";" or
    // contains a bracket or "=".
    state.var_continues = !(stmt.endsWith(";") || stmt.match(/[\)\=\}\{]/));

    // Variables between the sub name and the opening brace are signature parameters.
    // TODO: Change this to multi-line signatures
    const params: string[] = [];
    for (const found of signature.matchAll(/([\$\@\%][\w:]+)\b/g)) {
        params.push(found[1]);
        MakeElem(found[1], PerlSymbolKind.LocalVar, "", state);
    }

    // Parameters discovered on the following lines are part of the signature as well.
    params.push(...look_ahead_signatures(state));

    MakeElem(subName, kind, "", state, endLine, params);
    return true;
}
/**
 * Scans forward from the current line to collect parameter names of the sub declared there.
 *
 * Two sources are used: the continuation lines of a multi-line signature, and the classic
 * argument-unpacking statements `my (...) = @_`, `my $x = shift` and `my ($x) = shift`.
 * Scanning stops after the line that closes a block or once the look-ahead is more than
 * four lines past the declaration.
 *
 * @param state Parser state; only `line_number` and `codeArray` are read.
 * @returns The variable names found, in order of appearance.
 */
function look_ahead_signatures(state: ParserState): string[] {
    const collected: string[] = [];
    // Appends every $name, @name or %name found in the given text.
    const collect = (text: string): void => {
        for (const hit of text.matchAll(/([\$\@\%][\w:]+)\b/g)) {
            collected.push(hit[0]);
        }
    };

    const firstLine = state.line_number;
    let inSignature = true;

    for (let idx = firstLine; idx < state.codeArray.length; idx++) {
        const line = state.codeArray[idx];
        // Limit depth for speed and accuracy.
        const depth = idx - firstLine;

        if (inSignature) {
            // The signature goes on unless the line ends with ";" or contains a closing/opening bracket.
            inSignature = !(line.endsWith(";") || line.match(/[\)\}\{]/));
            if (depth > 0) {
                // The declaration line itself has already been parsed by the caller.
                // Cut off the part where the sub body starts before looking for names.
                collect(line.replace(/\s*(\{[^\}]|\)).*/, ""));
            }
        }

        const unpack =
            line.match(/(?:^|{)\s*my\s*(\(\s*[\$@%]\w+\s*(?:,\s*[\$@%]\w+\s*)*\))\s*=\s*\@_/) || // my ($foo, $bar) = @_
            line.match(/(?:^|{)\s*my\s+(\s*[\$@%]\w+\s*)=\s*shift\b/) || // my $foo = shift
            line.match(/(?:^|{)\s*my\s*(\(\s*[\$@%]\w+\s*\))\s*=\s*shift\b/); // my ($foo) = shift
        if (unpack) {
            collect(unpack[1]);
        }

        if (depth > 4 || /(?:^|[^{])\}/.test(line)) {
            // Sub has ended (or we looked far enough); don't pick up the next sub's signature.
            break;
        }
    }
    return collected;
}
/**
 * Recognizes phaser blocks (BEGIN, INIT, CHECK, UNITCHECK, END) and labelled blocks.
 *
 * @param state Parser state; `state.stmt` is the line under test.
 * @returns true when the line opened a phaser or a labelled block, false otherwise.
 */
function labels(state: ParserState): boolean {
    // A phaser takes precedence over a generic label.
    const phaser = state.stmt.match(/^(BEGIN|INIT|CHECK|UNITCHECK|END)\s*\{/);
    const hit = phaser || state.stmt.match(/^([a-zA-Z_][a-zA-Z0-9_]*)\s*:[^:].*{\s*$/);
    if (!hit) {
        return false;
    }

    const kind = phaser ? PerlSymbolKind.Phaser : PerlSymbolKind.Label;
    MakeElem(hit[1], kind, "", state, SubEndLine(state));
    return true;
}
/**
 * Recognizes `use constant NAME ...` lines.
 *
 * The constant is recorded as an element, and `constant` itself is recorded as an imported module.
 *
 * @param state Parser state; `state.stmt` is the line under test.
 * @returns true when the line declared a constant, false otherwise.
 */
function constants(state: ParserState): boolean {
    const decl = state.stmt.match(/^use\s+constant\s+(\w+)\b/);
    if (!decl) {
        return false;
    }
    // Constants look like subs (and technically are), so they are tagged as elements.
    MakeElem(decl[1], PerlSymbolKind.Constant, "", state);
    MakeElem("constant", TagKind.UseStatement, "", state);
    return true;
}
/**
 * Recognizes attribute-style declarations: `has`/`field` (Moo, Moose, Object::Pad, Moops, Corinna),
 * `attr` (only when Mars::Class or Venus::Class has been imported) and `around` method modifiers.
 *
 * @param state Parser state; `state.stmt` is the line under test.
 * @returns true when the line was one of these declarations, false otherwise.
 */
function fields(state: ParserState): boolean {
    const stmt = state.stmt;

    const hasDecl = stmt.match(/^(?:has|field)(?:\s+|\()["']?\+?([\$@%]?\w+)\b/);
    if (hasDecl) {
        const attr = hasDecl[1];
        // A name without a sigil is an accessor-style field; with a sigil it is a plain variable.
        const isBareName = /^\w/.test(attr);
        if (isBareName) {
            // A locally defined package/class Foo can be referenced as Foo::attr or foo->attr,
            // which needs the full path. Subs don't need this since they are found at compile time.
            MakeElem(state.package_name + "::" + attr, PerlSymbolKind.PathedField, "", state);
        }
        // TODO: Define new type. Class variables should probably be shown in the Outline view even though lexical variables are not
        MakeElem(attr, isBareName ? PerlSymbolKind.Field : PerlSymbolKind.LocalVar, "", state);
        return true;
    }

    // Mars/Venus attributes
    const imported = state.perlDoc.imported;
    const usesMarsOrVenus = imported.has("Mars::Class") || imported.has("Venus::Class");
    const attrDecl = usesMarsOrVenus ? stmt.match(/^attr\s+["'](\w+)\b/) : null;
    if (attrDecl) {
        const attr = attrDecl[1];
        MakeElem(attr, PerlSymbolKind.Field, "", state);
        MakeElem(state.package_name + "::" + attr, PerlSymbolKind.PathedField, "", state);
        return true;
    }

    // Moo/Moose overriding subs.
    const aroundDecl = stmt.match(/^around\s+["']?(\w+)\b/);
    if (aroundDecl) {
        MakeElem(aroundDecl[1], PerlSymbolKind.LocalSub, "", state);
        return true;
    }

    return false;
}
/**
 * Recognizes `use Module` lines and records the module as explicitly imported.
 *
 * @param state Parser state; `state.stmt` is the line under test.
 * @returns true when the line was a `use` statement, false otherwise.
 */
function imports(state: ParserState): boolean {
    const useLine = state.stmt.match(/^use\s+([\w:]+)\b/);
    if (!useLine) {
        return false;
    }
    // Explicit imports are tracked so they can be used for filtering later.
    MakeElem(useLine[1], TagKind.UseStatement, "", state);
    return true;
}
/**
 * Recognizes assignments of the form `$self->{_name} = ...` (also `||=` and `//=`) and records
 * a `get_name` autoload entry for them.
 *
 * @param state Parser state; `state.stmt` is the line under test.
 * @returns true when the line was such an assignment, false otherwise.
 */
function autoloads(state: ParserState): boolean {
    const assignment = state.stmt.match(/^\$self\->\{\s*(['"]|)_(\w+)\1\s*\}\s*(?:\|\||\/\/)?=/);
    if (!assignment) {
        return false;
    }
    // Common paradigm is for autoloaders to basically just point to the class variable;
    // group 2 is the hash key without its leading underscore.
    MakeElem("get_" + assignment[2], PerlSymbolKind.AutoLoadVar, "", state);
    return true;
}
/**
 * Recognizes web-framework route and hook declarations. Only active when Dancer, Dancer2 or
 * Mojolicious::Lite has been imported by the document.
 *
 * Each recognized declaration is recorded as an HttpRoute element whose name is the
 * request method(s) followed by the route (or just the hook name for hooks).
 *
 * @param state Parser state; `state.stmt` is the line under test.
 * @returns true when the line declared a route or hook, false otherwise.
 */
function dancer(state: ParserState): boolean {
    const imported = state.perlDoc.imported;
    const frameworks = ["Dancer", "Dancer2", "Mojolicious::Lite"];
    if (!frameworks.some((name) => imported.has(name))) {
        return false;
    }

    const stmt = state.stmt;
    let hit: RegExpMatchArray | null;
    let label: string;

    if ((hit = stmt.match(/^(?:any|before\_route)\s+\[([^\]]+)\]\s+(?:=>\s*)?(['"])([^"']+)\2\s*=>\s*sub/))) {
        // Multiple request methods for one path, e.g. any ['get', 'post'] => '/login' => sub {
        // Quotes and whitespace are stripped from the method list.
        const methods = hit[1].replace(/['"\s\n]+/g, "");
        label = `${methods} ${hit[3]}`;
    } else if ((hit = stmt.match(/^(get|any|post|put|patch|delete|del|options|ajax|before_route)\s+(?:[\s\w,\[\]'"]+=>\s*)?(['"])([^'"]+)\2\s*=>\s*sub/))) {
        // Routing paths given as quoted strings
        label = hit[1] + " " + hit[3];
    } else if ((hit = stmt.match(/^(get|any|post|put|patch|delete|del|options|ajax|before_route)\s+(qr\{[^\}]+\})\s+\s*=>\s*sub/))) {
        // Routing paths given as qr{} regexps
        label = hit[1] + " " + hit[2];
    } else if ((hit = stmt.match(/^(?:hook)\s+(['"]|)(\w+)\1\s*=>\s*sub/))) {
        // Hooks
        label = hit[2];
    } else {
        return false;
    }

    MakeElem(label, PerlSymbolKind.HttpRoute, "", state, SubEndLine(state));
    return true;
}
/**
 * Splits the document into lines and prepares them for the line matchers.
 *
 * Every line has its leading and trailing whitespace removed. For self-navigation parsing,
 * comments of the form `# ... $var isa Type` are recorded as canonical type hints on the way.
 * For outline parsing the lines are additionally passed through stripCommentsAndQuotes.
 *
 * @param textDocument Document to read the text from.
 * @param perlDoc Document that receives any type hints found.
 * @param parseType Requested parse depth.
 * @returns One cleaned string per line of the document.
 */
async function cleanCode(textDocument: TextDocument, perlDoc: PerlDocument, parseType: ParseType): Promise<string[]> {
    const rawLines = textDocument.getText().split("\n");

    // MakeElem needs a parser state to know which line and document an element belongs to.
    const scanState: ParserState = {
        stmt: "",
        line_number: 0,
        package_name: "",
        perlDoc: perlDoc,
        uri: textDocument.uri,
        var_continues: false,
        codeArray: rawLines,
        parseType: parseType,
    };

    const collectTypeHints = parseType == ParseType.selfNavigation;
    const trimmedLines: string[] = [];

    rawLines.forEach((rawLine, lineNo) => {
        scanState.line_number = lineNo;
        scanState.stmt = rawLine;

        if (collectTypeHints) {
            const hint = rawLine.match(/#.*(\$\w+) isa ([\w:]+)\b/);
            if (hint) {
                // TODO: Do I need a file or package here? Canonical variables are weird
                MakeElem(hint[1], PerlSymbolKind.Canonical, hint[2], scanState);
            }
        }

        trimmedLines.push(rawLine.replace(/^\s*/, "").replace(/\s*$/, ""));
    });

    // Comments and string contents are only stripped for outline parsing, the one mode in
    // which block end lines are searched for.
    return parseType == ParseType.outline ? await stripCommentsAndQuotes(trimmedLines) : trimmedLines;
}
/**
 * Records one parsed item in the document held by `state`.
 *
 * Depending on `type`, the item ends up in a different place:
 * - use statements go to `imported` (name -> line number),
 * - `TagKind.Canonical2` goes to `parents` (name -> typeDetail),
 * - autoload variables go to `autoloads`,
 * - everything else is appended to `elems`; items with a non-empty `typeDetail` are also stored
 *   in `canonicalElems`, and items of type "1" are stored there only.
 *
 * @param name Symbol name; empty names are ignored.
 * @param type Kind of symbol or tag.
 * @param typeDetail Extra type information, or "" when there is none.
 * @param state Parser state supplying line number, package, URI and target document.
 * @param lineEnd Last line of the symbol; 0 means "same as the current line".
 * @param signature Parameter names, stored on the element only when non-empty.
 */
function MakeElem(name: string, type: PerlSymbolKind | TagKind, typeDetail: string, state: ParserState, lineEnd: number = 0, signature: string[] = []): void {
    // Don't store empty names (shouldn't happen)
    if (!name) return;

    const doc = state.perlDoc;

    if (type == TagKind.UseStatement) {
        // Explicitly loaded module. Helpful for focusing autocomplete results.
        // Not stored as an element.
        doc.imported.set(name, state.line_number);
        return;
    }

    if (type == TagKind.Canonical2) {
        // Not stored as an element.
        doc.parents.set(name, typeDetail);
        return;
    }

    const elem: PerlElem = {
        name: name,
        type: type,
        typeDetail: typeDetail,
        uri: state.uri,
        package: state.package_name,
        line: state.line_number,
        lineEnd: lineEnd == 0 ? state.line_number : lineEnd,
        value: "",
        source: ElemSource.parser,
    };

    if (type == PerlSymbolKind.AutoLoadVar) {
        // Not stored as an element.
        doc.autoloads.set(name, elem);
        return;
    }

    if (signature?.length > 0) {
        elem.signature = signature;
    }

    if (typeDetail.length > 0) {
        // TODO: The canonicalElems don't need to be PerlElems, they might be just a string.
        // Later entries overwrite earlier ones, so the last typed element is the canonical one.
        doc.canonicalElems.set(name, elem);
        if (type == "1") {
            // This object is only intended as the canonical lookup, not for anything else.
            return;
        }
    }

    const bucket = doc.elems.get(name);
    if (bucket) {
        bucket.push(elem);
    } else {
        doc.elems.set(name, [elem]);
    }
}
/**
 * Finds the line on which the block starting at the current line is closed, by counting braces.
 *
 * Only outline parsing searches for the end; every other parse type gets the current line back.
 *
 * @param state Parser state; `line_number`, `codeArray` and `parseType` are read.
 * @param rFilter Optional pattern removed from the first line before braces are counted.
 * @returns Index of the closing line, or the current line when no end was found.
 */
function SubEndLine(state: ParserState, rFilter: RegExp | null = null): number {
    const firstLine = state.line_number;
    if (state.parseType != ParseType.outline) {
        return firstLine;
    }

    let depth = 0;
    let sawOpeningBrace = false;

    for (let idx = firstLine; idx < state.codeArray.length; idx++) {
        let line = state.codeArray[idx];

        if (idx == firstLine) {
            if (rFilter) {
                line = line.replace(rFilter, "");
            }
            // Default argument of empty hash. Other types of hashes may still trip this up
            line = line.replace(/\$\w+\s*=\s*\{\s*\}/, "");
            if (/;\s*$/.test(line)) {
                // "Forward" declaration, such as `sub foo;`
                return idx;
            }
        }

        for (const ch of line) {
            if (ch == "{") {
                sawOpeningBrace = true;
                depth++;
            } else if (ch == "}") {
                depth--;
            }
        }

        // Balance is checked once per line rather than per character: faster, but less accurate.
        if (sawOpeningBrace && depth == 0) {
            return idx;
        }
    }
    return firstLine;
}
/**
 * Finds the last line of the package or class declared on the current line.
 *
 * A brace-delimited package ends where its braces balance. Otherwise it ends on the line before
 * the next `package`/`class` line, or runs to the end of the file when there is none.
 * Only outline parsing searches for the end; every other parse type gets the current line back.
 *
 * @param state Parser state; `line_number`, `codeArray` and `parseType` are read.
 * @returns Line index of the end of the package (the number of lines when it runs to end of file).
 */
function PackageEndLine(state: ParserState) {
    if (state.parseType != ParseType.outline) {
        return state.line_number;
    }

    const lines = state.codeArray;
    let firstLine = state.line_number;

    // A `package Foo;` statement that is not preceded by "{" on its own line may sit inside a
    // block opened at the end of the previous line; in that case brace counting starts there.
    const declLine = lines[firstLine];
    const isStatementForm = /(class|package)[^#]+;/.test(declLine);
    const braceBeforeKeyword = /{.*(class|package)/.test(declLine);
    if (isStatementForm && !braceBeforeKeyword && firstLine > 0 && /\{[^}]*$/.test(lines[firstLine - 1])) {
        firstLine -= 1;
    }

    let depth = 0;
    let sawOpeningBrace = false;
    for (let idx = firstLine; idx < lines.length; idx++) {
        const line = lines[idx];
        for (const ch of line) {
            if (ch == "{") {
                sawOpeningBrace = true;
                depth++;
            } else if (ch == "}") {
                depth--;
            }
        }

        // No block opened by the time a ";" shows up or two lines have passed:
        // there probably is no package block at all.
        if (!sawOpeningBrace && (line.includes(";") || idx - firstLine > 1)) {
            break;
        }

        // Balance is checked once per line rather than per character: faster, but less accurate.
        if (sawOpeningBrace && depth == 0) {
            return idx;
        }
    }

    // TODO: update with class inheritance / version numbers, etc
    // Although should we do with nested packages/classes? (e.g. Pack A -> Pack B {} -> A)
    for (let idx = firstLine + 1; idx < lines.length; idx++) {
        if (/^\s*(class|package)\s+([\w:]+)/.test(lines[idx])) {
            return idx - 1;
        }
    }

    // If we didn't find an end, run until end of file
    return lines.length;
}
// Locate onig.wasm at module load time.
// We first try a path relative to this module's directory, which is needed in webpack.
let onigWasmPath = path.join(__dirname, "./../node_modules/vscode-oniguruma/release/onig.wasm")
if (!fs.existsSync(onigWasmPath)) {
// Otherwise dynamically retrieve the path to onig.wasm (we need to eval the require to stop webpack from
// bundling the wasm, which doesn't work)
onigWasmPath = eval('require.resolve')('vscode-oniguruma/release/onig.wasm');
}
// Read the file synchronously and hand its underlying ArrayBuffer to the WASM loader
const wasmBin = fs.readFileSync(onigWasmPath).buffer;
// Promise that, once the WASM has loaded, resolves to the scanner/string factory object
// that is passed to the TextMate registry as its onigLib.
const vscodeOnigurumaLib = oniguruma.loadWASM(wasmBin).then(() => {
return {
createOnigScanner(patterns: any) {
return new oniguruma.OnigScanner(patterns);
},
createOnigString(s: any) {
return new oniguruma.OnigString(s);
},
};
});
// TextMate grammar registry used by stripCommentsAndQuotes to tokenize Perl source.
const registry = new vsctm.Registry({
onigLib: vscodeOnigurumaLib,
// The requested scope name is not inspected: the Perl grammar file located one directory
// above this module is read and parsed for every request.
loadGrammar: async (scopeName) => {
const grammarpath = path.join(__dirname, "./../perl.tmLanguage.json");
const grammar = await fs.promises.readFile(grammarpath, "utf8");
return vsctm.parseRawGrammar(grammar, grammarpath);
},
});
/**
 * Removes comments and brace-carrying string contents from the given lines so that later
 * brace counting is not confused by them.
 *
 * Lines are tokenized with the Perl TextMate grammar. Tokens in a `comment` scope are dropped.
 * A string token (one that is not string punctuation) is replaced by "___" when it is the
 * first thing kept on its line, is dropped when it contains "{" or "}", and is kept otherwise.
 *
 * @param code Lines to process, in document order (tokenizer state carries over between lines).
 * @returns One stripped string per input line.
 * @throws Error when the Perl grammar cannot be loaded.
 */
async function stripCommentsAndQuotes(code: string[]): Promise<string[]> {
    const grammar = await registry.loadGrammar("source.perl");
    if (!grammar) {
        throw new Error("Couldn't load Textmate grammar");
    }

    const inScope = (scopes: string[], prefix: string): boolean => scopes.some((scope) => scope.startsWith(prefix));

    let stack: vsctm.StateStack | null = vsctm.INITIAL;
    const strippedLines: string[] = [];

    for (const line of code) {
        const { tokens, ruleStack } = grammar.tokenizeLine(line, stack);
        stack = ruleStack;

        let kept = "";
        let cursor = 0;
        for (const token of tokens) {
            const text = line.substring(cursor, token.endIndex);
            cursor = token.endIndex;

            // Comment scopes include regexes and pod too; all of them are removed.
            if (inScope(token.scopes, "comment")) {
                continue;
            }

            const isStringBody = inScope(token.scopes, "string") && !inScope(token.scopes, "punctuation");
            if (isStringBody) {
                if (kept == "") {
                    // The 2nd-Nth lines of multi-line strings should be stripped
                    kept += "___";
                    continue;
                }
                if (text.match(/[\{\}]/)) {
                    // In-line strings that contain {} need to be stripped regardless of position
                    continue;
                }
            }

            kept += text;
        }
        strippedLines.push(kept);
    }
    return strippedLines;
}