ima-parse
Version:
An easy, simple parser that only requires a Grammar JSON to output an AST.
171 lines (170 loc) • 8.97 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Parser = void 0;
const RuleParser_1 = require("./RuleParser");
const charCodeHelpers_1 = require("../helpers/charCodeHelpers");
/**
 * Character-driven parser front end.
 *
 * Text is fed in one character at a time. Characters are grouped into
 * "phrases" (kind "word", "number" or "chars") and every completed phrase is
 * handed to the currently active RuleParser. Phrases that nothing can match,
 * and rules that were left unfinished, are recorded in `brokenContent`
 * instead of throwing.
 *
 * An instance can run `parseText` exactly once.
 */
class Parser {
    /** Grammar object passed to the constructor. */
    grammar;
    /** Lifecycle state: "not_started", then "parsing", then "done". */
    state = "not_started";
    /** RuleParser created for `grammar.TopLevel`. */
    topLevelParser;
    /** RuleParser that receives the next phrase. */
    currentParser;
    // Reader
    /** Characters collected so far for the phrase being built. */
    phrase = "";
    /** Kind of the phrase being built: "word", "number" or "chars". */
    phraseKind = "chars";
    /** Position of the next character to be read (both values start at 1). */
    cursor = { ln: 1, col: 1 };
    wordChars = charCodeHelpers_1.defaultWordChars;
    numberChars = charCodeHelpers_1.defaultNumberChars;
    validChars = charCodeHelpers_1.defaultValidChars;
    /** Collected problems: `{ position, reason, content, parseTrail? }` entries. */
    brokenContent = [];
    /**
     * @param grammar Grammar definition. `TopLevel` and `global` are read from
     *   it; `wordChars`, `numberChars` and `validChars` override the defaults
     *   when they are set to a truthy value.
     */
    constructor(grammar) {
        this.grammar = grammar;
        this.topLevelParser = new RuleParser_1.RuleParser(this.grammar.TopLevel, this.grammar);
        this.currentParser = this.topLevelParser;
        if (grammar.wordChars)
            this.wordChars = grammar.wordChars;
        if (grammar.numberChars)
            this.numberChars = grammar.numberChars;
        if (grammar.validChars)
            this.validChars = grammar.validChars;
    }
    /**
     * Returns the top level RuleParser once parsing has completed.
     *
     * @returns The RuleParser created for `grammar.TopLevel`.
     * @throws {Error} When `parseText` has not been run to completion yet.
     */
    getTopLevelParser() {
        if (this.state !== "done")
            throw new Error(`Parsing not ${this.state === "not_started" ? "started" : "finished"} yet`);
        return this.topLevelParser;
    }
    /**
     * Parses the complete text, one UTF-16 code unit at a time, and flushes the
     * phrase that is still pending at the end of the input.
     *
     * @param text The text to parse.
     * @throws {Error} When this instance has already been used for parsing.
     */
    parseText(text) {
        if (this.state !== "not_started")
            throw new Error("A Parser instance can only run once, create a new one instead");
        this.state = "parsing";
        for (let i = 0; i < text.length; i++)
            this.parseChar(text[i]);
        if (this.phrase)
            this.parseCurrentPhrase(true);
        this.state = "done";
    }
    /**
     * Processes a single character: forwards it to a text-mode part, appends it
     * to the pending phrase, flushes the pending phrase and retries, or skips it
     * (whitespace) / records it as an unknown character.
     *
     * @param char The character to process.
     */
    parseChar(char) {
        const charCode = char.charCodeAt(0);
        const receivedNewline = (0, charCodeHelpers_1.matchCharCodes)(charCode, charCodeHelpers_1.CharCodes.newline);
        // Check if the current parsed part is in text mode. If so; we give it all characters we receive and continue.
        if (this.currentParser.parsedParts.at(-1)?.textMode) {
            // Remember where this character starts. Deriving the start from the advanced cursor
            // is wrong for a newline, because the cursor has already moved to the next line.
            const charStartCursor = { ...this.cursor };
            this.phrase = char;
            this.advanceCursor(receivedNewline);
            this.parseCurrentPhrase(true, charStartCursor);
            // A text rule can decide that it's done, after it has received a certain character (which it doesn't include).
            // Example: <node>text</node> where < can be the end of the text but also the start of something else.
            if (!this.currentParser.parsedParts.at(-1)?.ignoredPhrase)
                return;
            // The character was not consumed and is processed again below, where the cursor is
            // advanced for it. Rewind first so the character is only counted once.
            this.cursor.ln = charStartCursor.ln;
            this.cursor.col = charStartCursor.col;
        }
        const receivedWordChar = (0, charCodeHelpers_1.matchCharCodes)(charCode, ...this.wordChars);
        const receivedNumberChar = (0, charCodeHelpers_1.matchCharCodes)(charCode, ...this.numberChars);
        const receivedValidNonWordChar = !receivedWordChar && !receivedNumberChar && (0, charCodeHelpers_1.matchCharCodes)(charCode, ...this.validChars);
        // Here we try define the phrase OR continue the current phrase, which requires the character to match the type
        if (
        // Word phrases can only start with word characters
        (receivedWordChar && (!this.phrase || this.phraseKind === "word")) ||
            // Number phrases can only contain number characters and words can contain numbers as well
            (receivedNumberChar && (!this.phrase || this.phraseKind === "word" || this.phraseKind === "number")) ||
            // Character phrases can only consist of a set of allowed non-word and non-number characters, nothing else
            (receivedValidNonWordChar && (!this.phrase || this.phraseKind === "chars"))) {
            const phraseKind = this.phrase ? this.phraseKind : receivedWordChar ? "word" : receivedNumberChar ? "number" : "chars";
            this.addCharAndAdvanceCursor(char, phraseKind);
            // If it's a non-word, we try to parse (characters are more often next to each other), but if it doesn't succeed that's fine.
            if (receivedValidNonWordChar)
                this.parseCurrentPhrase(false);
            return;
        }
        // If we reach this code, we've received a char that can't be added to the phrase, so we must parse the phrase and continue
        if (this.phrase) {
            this.parseCurrentPhrase(true);
            // We're done with the current phrase, let's give the new char a chance
            return this.parseChar(char);
        }
        // From here, we've received a character we cannot parse, which might be a whitespace or an invalid character
        const startCursor = { ...this.cursor };
        this.advanceCursor(receivedNewline);
        // We ignore all whitespace characters
        const receivedInvalidChar = !(0, charCodeHelpers_1.matchCharCodes)(charCode, charCodeHelpers_1.CharCodes.tab, charCodeHelpers_1.CharCodes.space, charCodeHelpers_1.CharCodes.newline);
        if (receivedInvalidChar) {
            const reason = { type: "unknown_character" };
            this.brokenContent.push({ position: { start: startCursor, end: { ...this.cursor } }, reason, content: char });
        }
    }
    /**
     * Moves the cursor past one character.
     *
     * @param newline When truthy the cursor moves to column 1 of the next line,
     *   otherwise it moves one column to the right.
     */
    advanceCursor(newline) {
        if (newline) {
            this.cursor.col = 1;
            this.cursor.ln++;
        }
        else {
            this.cursor.col++;
        }
    }
    /**
     * Appends a character to the pending phrase, stores the phrase kind and
     * advances the cursor by one column.
     *
     * @param char The character to append.
     * @param phraseKind The kind the pending phrase has from now on.
     */
    addCharAndAdvanceCursor(char, phraseKind) {
        this.phrase += char;
        this.phraseKind = phraseKind;
        this.advanceCursor(false);
    }
    /**
     * Hands the pending phrase to `parseChars`. The pending phrase is cleared
     * when parsing succeeded or when `hasToSucceed` is set.
     *
     * @param hasToSucceed Whether a failure to match must be recorded as broken content.
     * @param phraseStartCursor Optional start position of the phrase. Defaults
     *   to the cursor moved back by the phrase length, which is only correct
     *   for a phrase that does not span a line break.
     * @returns The result object produced by `parseChars`.
     */
    parseCurrentPhrase(hasToSucceed, phraseStartCursor = getStartCursor(this.cursor, this.phrase)) {
        const result = this.parseChars(this.phrase, this.phraseKind, phraseStartCursor, hasToSucceed);
        if (result.success || hasToSucceed) {
            this.phrase = "";
            this.phraseKind = "chars";
        }
        return result;
    }
    /**
     * Tries to match a phrase, in order, against the current parser, a fresh
     * parser for the grammar's `global` definition, and every ancestor of the
     * current parser. On a match `currentParser` becomes the parser returned by
     * that match.
     *
     * @param chars The phrase text.
     * @param phraseKind The kind of the phrase.
     * @param startPos Position where the phrase starts.
     * @param hasToSucceed Whether a failure to match must be recorded as broken content.
     * @returns `{ success: true }` when something matched, otherwise
     *   `{ success: false, error: "" }`.
     */
    parseChars(chars, phraseKind, startPos, hasToSucceed) {
        const parserInput = { chars, phraseKind, startPos, endPos: { ...this.cursor } };
        const parseResult = this.currentParser.parsePhrase(parserInput);
        if (parseResult.success) {
            this.currentParser = parseResult.ruleParser;
            return { success: true };
        }
        const parseTrail = [getParsePartRef(this.currentParser)];
        // The global parser is always the first fallback, mostly for comments. After this, the currentParser is continued again
        const globalParser = new RuleParser_1.RuleParser({ name: "global", definition: [this.grammar.global] }, this.grammar, this.currentParser);
        const globalParseResult = globalParser.parsePhrase(parserInput);
        if (globalParseResult.success) {
            // Re-parent the matched parser so that parsing returns to the interrupted parser afterwards
            globalParseResult.ruleParser.parent = this.currentParser;
            this.currentParser = globalParseResult.ruleParser;
            return { success: true };
        }
        // Keep track of unfinished rules, for the case we have to succeed OR a parent rule can parse it
        let unfinishedRule;
        if (this.currentParser.hasRequiredPartsLeft())
            unfinishedRule = this.currentParser;
        let successfulParentParser = false;
        // Let's try to match it with something from the parent rule, even if there are errors
        for (let parent = this.currentParser.parent; parent; parent = parent.parent) {
            const parentParseResult = parent.parsePhrase(parserInput);
            parseTrail.push(getParsePartRef(parent));
            if (parentParseResult.success) {
                successfulParentParser = true;
                this.currentParser = parentParseResult.ruleParser;
                break;
            }
            // We only care about the deepest failure, not the parents
            if (!unfinishedRule && parent.hasRequiredPartsLeft())
                unfinishedRule = parent;
        }
        // We have to parse this phrase, but nothing was able to match it
        if (hasToSucceed && !successfulParentParser) {
            const reason = { type: "unexpected_phrase", parsedPart: getParsePartRef(this.currentParser) };
            this.brokenContent.push({ position: { start: startPos, end: { ...this.cursor } }, reason, content: chars, parseTrail });
        }
        // We have to parse this phrase OR we found a match but a child of that parser had to finish first
        if (unfinishedRule && (successfulParentParser || hasToSucceed)) {
            const reason = { type: "unfinished_rule", parsedPart: getParsePartRef(unfinishedRule) };
            this.brokenContent.push({ position: { start: startPos, end: { ...this.cursor } }, reason, content: chars, parseTrail });
        }
        // Broken content or not, it's important to return if there was a match somewhere or not
        return successfulParentParser ? { success: true } : { success: false, error: "" };
    }
}
// Expose the Parser class as a named CommonJS export.
exports.Parser = Parser;
/**
 * Builds a small reference to the spot a rule parser is currently at.
 *
 * @param parser Object with a `rule.name` and a `parsedParts` array.
 * @returns `{ rule, part }` where `rule` is the rule name and `part` is the
 *   `index` of the last parsed part, or 0 when there is no last part or its
 *   index is falsy.
 */
function getParsePartRef(parser) {
    const ruleName = parser.rule.name;
    const lastParsedPart = parser.parsedParts.at(-1);
    const partIndex = lastParsedPart?.index || 0;
    return { rule: ruleName, part: partIndex };
}
/**
 * Derives the start position of a phrase from the position right after it.
 *
 * The line is copied unchanged and the column is moved back by the number of
 * UTF-16 code units in `chars`.
 *
 * @param endPos `{ ln, col }` position directly after the phrase.
 * @param chars The phrase text.
 * @returns A new `{ ln, col }` object for the start of the phrase.
 */
function getStartCursor(endPos, chars) {
    const { ln, col } = endPos;
    const startCol = col - chars.length;
    return { ln, col: startCol };
}