UNPKG

ima-parse

Version:

An easy, simple parser that only requires a grammar JSON to output an AST.

171 lines (170 loc) 8.97 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Parser = void 0;
const RuleParser_1 = require("./RuleParser");
const charCodeHelpers_1 = require("../helpers/charCodeHelpers");
/**
 * Character-by-character reader that groups incoming characters into phrases
 * (words, numbers or runs of other valid characters) and hands every phrase to
 * the active RuleParser. Phrases that nothing can match are collected in
 * `brokenContent` instead of being thrown.
 *
 * An instance is single-use: `parseText` may be called exactly once.
 */
class Parser {
    grammar;
    // Lifecycle: "not_started" -> "parsing" -> "done"
    state = "not_started";
    topLevelParser;
    currentParser;
    // Reader
    phrase = "";
    phraseKind = "chars";
    // Position of the NEXT character to be read (starts at line 1, column 1)
    cursor = { ln: 1, col: 1 };
    wordChars = charCodeHelpers_1.defaultWordChars;
    numberChars = charCodeHelpers_1.defaultNumberChars;
    validChars = charCodeHelpers_1.defaultValidChars;
    brokenContent = [];
    /**
     * @param grammar Grammar object. `grammar.TopLevel` is used as the root rule and
     *   `grammar.global` as the fallback rule; `wordChars`, `numberChars` and
     *   `validChars` override the defaults when they are set.
     */
    constructor(grammar) {
        this.grammar = grammar;
        this.topLevelParser = new RuleParser_1.RuleParser(this.grammar.TopLevel, this.grammar);
        this.currentParser = this.topLevelParser;
        if (grammar.wordChars)
            this.wordChars = grammar.wordChars;
        if (grammar.numberChars)
            this.numberChars = grammar.numberChars;
        if (grammar.validChars)
            this.validChars = grammar.validChars;
    }
    /**
     * Returns the root rule parser once parsing has completed.
     * @throws {Error} When parsing has not been started or has not finished yet.
     */
    getTopLevelParser() {
        if (this.state !== "done")
            throw new Error(`Parsing not ${this.state === "not_started" ? "started" : "finished"} yet`);
        return this.topLevelParser;
    }
    /**
     * Parses the complete text, one UTF-16 code unit at a time.
     * @param text The source text to parse.
     * @throws {Error} When this instance has already been used.
     */
    parseText(text) {
        if (this.state !== "not_started")
            throw new Error("A Parser instance can only run once, create a new one instead");
        this.state = "parsing";
        for (let i = 0; i < text.length; i++)
            this.parseChar(text[i]);
        // Flush whatever phrase is still pending at the end of the input
        if (this.phrase)
            this.parseCurrentPhrase(true);
        this.state = "done";
    }
    /**
     * Consumes one character: it either extends the current phrase, forces the
     * current phrase to be parsed first, is skipped as whitespace, or is recorded
     * as an unknown character.
     * @param char A single character of the input.
     */
    parseChar(char) {
        const charCode = char.charCodeAt(0);
        const receivedNewline = (0, charCodeHelpers_1.matchCharCodes)(charCode, charCodeHelpers_1.CharCodes.newline);
        // Check if the current parsed part is in text mode. If so; we give it all characters we receive and continue.
        if (this.currentParser.parsedParts.at(-1)?.textMode) {
            // Snapshot taken before advancing: it is the true start position of this character
            // (also for a newline, where subtracting from the advanced cursor would give column 0).
            const cursorBeforeChar = { ...this.cursor };
            this.phrase = char;
            this.advanceCursor(receivedNewline);
            this.parseCurrentPhrase(true, cursorBeforeChar);
            // A text rule can decide that it's done, after it has received a certain character (which it doesn't include).
            // Example: <node>text</node> where < can be the end of the text but also the start of something else.
            if (!this.currentParser.parsedParts.at(-1)?.ignoredPhrase)
                return;
            // The character was not consumed by the text rule and is handled again below,
            // where the cursor is advanced for it. Rewind so it is not counted twice.
            this.cursor.ln = cursorBeforeChar.ln;
            this.cursor.col = cursorBeforeChar.col;
        }
        const receivedWordChar = (0, charCodeHelpers_1.matchCharCodes)(charCode, ...this.wordChars);
        const receivedNumberChar = (0, charCodeHelpers_1.matchCharCodes)(charCode, ...this.numberChars);
        const receivedValidNonWordChar = !receivedWordChar &&
            !receivedNumberChar &&
            (0, charCodeHelpers_1.matchCharCodes)(charCode, ...this.validChars);
        // Here we try define the phrase OR continue the current phrase, which requires the character to match the type
        if (
        // Word phrases can only start with word characters
        (receivedWordChar && (!this.phrase || this.phraseKind === "word")) ||
            // Number phrases can only contain number characters and words can contain numbers as well
            (receivedNumberChar && (!this.phrase || this.phraseKind === "word" || this.phraseKind === "number")) ||
            // Character phrases can only consist of a set of allowed non-word and non-number characters, nothing else
            (receivedValidNonWordChar && (!this.phrase || this.phraseKind === "chars"))) {
            const phraseKind = this.phrase
                ? this.phraseKind
                : receivedWordChar
                    ? "word"
                    : receivedNumberChar
                        ? "number"
                        : "chars";
            this.addCharAndAdvanceCursor(char, phraseKind);
            // If it's a non-word, we try to parse (characters are more often next to each other), but if it doesn't succeed that's fine.
            if (receivedValidNonWordChar)
                this.parseCurrentPhrase(false);
            return;
        }
        // If we reach this code, we've received a char that can't be added to the phrase, so we must parse the phrase and continue
        if (this.phrase) {
            this.parseCurrentPhrase(true);
            // We're done with the current phrase, let's give the new char a chance
            return this.parseChar(char);
        }
        // From here, we've received a character we cannot parse, which might be a whitespace or an invalid character
        const startCursor = { ...this.cursor };
        this.advanceCursor(receivedNewline);
        // We ignore all whitespace characters
        const receivedInvalidChar = !(0, charCodeHelpers_1.matchCharCodes)(charCode, charCodeHelpers_1.CharCodes.tab, charCodeHelpers_1.CharCodes.space, charCodeHelpers_1.CharCodes.newline);
        if (receivedInvalidChar) {
            const reason = { type: "unknown_character" };
            this.brokenContent.push({ position: { start: startCursor, end: { ...this.cursor } }, reason, content: char });
        }
    }
    /**
     * Moves the cursor past one character.
     * @param newline When truthy the cursor moves to column 1 of the next line,
     *   otherwise one column to the right.
     */
    advanceCursor(newline) {
        if (newline) {
            this.cursor.col = 1;
            this.cursor.ln++;
        }
        else {
            this.cursor.col++;
        }
    }
    /**
     * Appends a character to the pending phrase, records the phrase kind and
     * advances the cursor by one column.
     */
    addCharAndAdvanceCursor(char, phraseKind) {
        this.phrase += char;
        this.phraseKind = phraseKind;
        this.advanceCursor(false);
    }
    /**
     * Offers the pending phrase to the rule parsers.
     * @param hasToSucceed When true the phrase is dropped even if nothing matched
     *   (the failure is reported through `brokenContent` by `parseChars`).
     * @param phraseStartCursor Optional start position of the phrase. Defaults to
     *   the cursor moved back by the phrase length, which is only valid when the
     *   phrase does not cross a line break.
     * @returns The result object produced by `parseChars`.
     */
    parseCurrentPhrase(hasToSucceed, phraseStartCursor = getStartCursor(this.cursor, this.phrase)) {
        const result = this.parseChars(this.phrase, this.phraseKind, phraseStartCursor, hasToSucceed);
        if (result.success || hasToSucceed) {
            this.phrase = "";
            this.phraseKind = "chars";
        }
        return result;
    }
    /**
     * Tries to match a phrase, in order, against the current rule parser, the
     * grammar's global rule, and then every ancestor of the current rule parser.
     * @param chars The phrase text.
     * @param phraseKind "word", "number" or "chars".
     * @param startPos Position of the first character of the phrase.
     * @param hasToSucceed When true, a phrase nothing matched is recorded as broken content.
     * @returns `{ success: true }` when some parser accepted the phrase,
     *   otherwise `{ success: false, error: "" }`.
     */
    parseChars(chars, phraseKind, startPos, hasToSucceed) {
        const parserInput = { chars, phraseKind, startPos, endPos: { ...this.cursor } };
        const parseResult = this.currentParser.parsePhrase(parserInput);
        if (parseResult.success) {
            this.currentParser = parseResult.ruleParser;
            return { success: true };
        }
        const parseTrail = [getParsePartRef(this.currentParser)];
        // The global parser is always the first fallback, mostly for comments. After this, the currentParser is continued again
        const globalParser = new RuleParser_1.RuleParser({ name: "global", definition: [this.grammar.global] }, this.grammar, this.currentParser);
        const globalParseResult = globalParser.parsePhrase(parserInput);
        if (globalParseResult.success) {
            // NOTE(review): bare property read with no visible effect here; kept as-is in case it is a
            // getter with side effects. Confirm whether the global result was meant to be recorded.
            this.currentParser.globalParsedParts;
            globalParseResult.ruleParser.parent = this.currentParser;
            this.currentParser = globalParseResult.ruleParser;
            return { success: true };
        }
        // Keep track of unfinished rules, for the case we have to succeed OR a parent rule can parse it
        let unfinishedRule;
        if (this.currentParser.hasRequiredPartsLeft())
            unfinishedRule = this.currentParser;
        let successfulParentParser = false;
        // Let's try to match it with something from the parent rule, even if there are errors
        for (let parent = this.currentParser.parent; parent; parent = parent.parent) {
            const parentParseResult = parent.parsePhrase(parserInput);
            parseTrail.push(getParsePartRef(parent));
            if (parentParseResult.success) {
                successfulParentParser = true;
                this.currentParser = parentParseResult.ruleParser;
                break;
            }
            // We only care about the deepest failure, not the parents
            if (!unfinishedRule && parent.hasRequiredPartsLeft())
                unfinishedRule = parent;
        }
        // We have to parse this phrase, but nothing was able to match it
        if (hasToSucceed && !successfulParentParser) {
            const reason = { type: "unexpected_phrase", parsedPart: getParsePartRef(this.currentParser) };
            this.brokenContent.push({ position: { start: startPos, end: { ...this.cursor } }, reason, content: chars, parseTrail });
        }
        // We have to parse this phrase OR we found a match but a child of that parser had to finish first
        if (unfinishedRule && (successfulParentParser || hasToSucceed)) {
            const reason = { type: "unfinished_rule", parsedPart: getParsePartRef(unfinishedRule) };
            this.brokenContent.push({ position: { start: startPos, end: { ...this.cursor } }, reason, content: chars, parseTrail });
        }
        // Broken content or not, it's important to return if there was a match somewhere or not
        return successfulParentParser ? { success: true } : { success: false, error: "" };
    }
}
exports.Parser = Parser;
/**
 * Builds a reference to the rule and the last parsed part of a rule parser.
 * @returns `{ rule, part }` where `part` is the index of the last parsed part, or 0 when there is none.
 */
function getParsePartRef(parser) {
    return { rule: parser.rule.name, part: parser.parsedParts.at(-1)?.index || 0 };
}
/**
 * Derives the start position of a phrase from its end position by moving back
 * one column per character. Only valid when the phrase sits on a single line.
 */
function getStartCursor(endPos, chars) {
    return { ln: endPos.ln, col: endPos.col - chars.length };
}