UNPKG

@palasimi/ipa-cluster

Version:

Cluster words with similar IPA transcriptions together

github.com/palasimi/ipa-cluster

palasimi/ipa-cluster

455 lines • 15.5 kB

JavaScript

"use strict";
// SPDX-License-Identifier: GPL-3.0-or-later
// Copyright (c) 2023 Levi Gruspe
// DSL parser.
Object.defineProperty(exports, "__esModule", { value: true });
exports.parse = exports.Parser = exports.ParseError = void 0;
const ir_1 = require("./ir");
const scopes_1 = require("./scopes");
const tokenizer_1 = require("./tokenizer");

/**
 * Pattern matched by strings that look like language codes: either a lone
 * underscore, or at least two characters made of lowercase ASCII letters and
 * hyphens that start and end with a letter.
 */
const LANGUAGE_CODE_PATTERN = /^(_|([a-z][-a-z]*[a-z]))$/;

/**
 * Exception thrown by parser when it encounters an error during parsing.
 */
class ParseError extends Error {
  /**
   * @param message - Human-readable description of the error
   */
  constructor(message) {
    super(message);
    // Use the runtime class name so that subclasses report their own name.
    this.name = this.constructor.name;
  }
}
exports.ParseError = ParseError;

/**
 * Code parser.
 * Exported only for testing.
 */
class Parser {
  /**
   * Tokenizes the whole input eagerly and starts with an empty top-level
   * scope.
   *
   * @param code - Code to be parsed
   */
  constructor(code) {
    this.index = 0;
    this.scope = new scopes_1.Scope();
    this.tokens = Array.from((0, tokenizer_1.tokenize)(code));
  }

  /**
   * Checks if there are tokens left in the source program.
   *
   * @returns `true` if every token has been consumed
   */
  isDone() {
    return this.index >= this.tokens.length;
  }

  /**
   * Peeks at the token an `offset` number of steps ahead of the current one,
   * without changing the token pointer.
   *
   * @param offset - Number of tokens to skip ahead (defaults to 0)
   * @returns The token at that position, or `undefined` if there is none
   */
  peek(offset = 0) {
    return this.tokens[this.index + offset];
  }

  /**
   * Peeks at the next `count` number of tokens.
   * May return fewer tokens than asked for if there are no tokens left in the
   * source program.
   *
   * @param count - Number of tokens to peek at
   * @returns A new array holding the upcoming tokens
   */
  peekN(count) {
    const end = Math.min(this.index + count, this.tokens.length);
    // Guard against an empty/inverted range: `slice` would interpret a
    // negative `end` as an offset from the end of the array.
    if (!(end > this.index)) {
      return [];
    }
    return this.tokens.slice(this.index, end);
  }

  /**
   * Returns the current token and moves the pointer to the next token.
   *
   * @returns The consumed token, or `undefined` if there are no tokens left
   */
  move() {
    if (this.isDone()) {
      return undefined;
    }
    const token = this.peek();
    this.index++;
    return token;
  }

  /**
   * Consumes a token from the input.
   * The token must match the specified tag.
   *
   * @param tag - Expected tag of the next token
   * @param message - Error message
   * @returns The consumed token
   * @throws ParseError if the next token is missing or has a different tag
   */
  expect(tag, message) {
    const token = this.move();
    if (tagOf(token) !== tag) {
      abort(token, message);
    }
    return token;
  }

  /**
   * Parses a sound value: a brace-enclosed union, a single terminal, or a
   * variable reference.
   *
   * @returns The parsed sound value
   * @throws ParseError if the next token cannot start a sound value
   */
  parseSound() {
    const lookahead = this.peek();
    switch (tagOf(lookahead)) {
      case tokenizer_1.Tag.LeftBrace:
        return this.parseUnionSound();
      case tokenizer_1.Tag.Terminal:
        return this.parseTerminalSound();
      case tokenizer_1.Tag.Variable:
        return this.parseVariableSound();
      default:
        // `abort` always throws, so nothing is returned from this branch.
        abort(lookahead, "expected a sound value");
    }
  }

  /**
   * Parses an assignment statement.
   * In general, assignment statements can be written as `A = B`,
   * where `A` is the variable name and `B` is the sound value to be assigned.
   *
   * This method defines the variable in the current scope.
   *
   * @throws ParseError if the statement is malformed or if the scope refuses
   * the definition (reported as a redefinition)
   */
  parseAssignment() {
    const lhs = this.expect(tokenizer_1.Tag.Variable, "expected a variable name");
    this.expect(tokenizer_1.Tag.Equals, "expected '='");
    const rhs = this.parseSound();
    const ok = this.scope.define(lhs.literal, rhs);
    if (!ok) {
      abort(lhs, `cannot redefine the variable '${lhs.literal}'`);
    }
  }

  /**
   * Parses an IPA segment (a terminal).
   *
   * @returns A one-element array containing the terminal's literal text
   * @throws ParseError if the next token is not a terminal
   */
  parseTerminalSound() {
    const terminal = this.expect(tokenizer_1.Tag.Terminal, "expected an IPA segment");
    return [terminal.literal];
  }

  /**
   * Parses a variable that represents a sound value.
   * The value of the variable is resolved during parsing.
   *
   * @returns The value that the current scope resolves the variable to
   * @throws ParseError if the next token is not a variable, or if scope
   * resolution fails with a `NameError`
   */
  parseVariableSound() {
    const variable = this.expect(tokenizer_1.Tag.Variable, "expected a variable");
    try {
      return this.scope.resolve(variable.literal);
    } catch (error) {
      // Convert name-resolution failures into located parse errors.
      // Anything else is unexpected and propagates unchanged.
      if (error instanceof scopes_1.NameError) {
        abort(variable, error.message);
      }
      throw error;
    }
  }

  /**
   * Parses a sound value that's enclosed by braces.
   * The value may be a null sound (`{}`) or a union of terminals.
   *
   * @returns An array of the terminal literals found between the braces
   * @throws ParseError if a brace is missing or a non-terminal token appears
   * inside the braces
   */
  parseUnionSound() {
    const choices = [];
    this.expect(tokenizer_1.Tag.LeftBrace, "expected '{'");
    while (!this.isDone() && this.peek().tag !== tokenizer_1.Tag.RightBrace) {
      choices.push(...this.parseTerminalSound());
    }
    this.expect(tokenizer_1.Tag.RightBrace, "expected '}'");
    return choices;
  }

  /**
   * Parses a sequence of sounds.
   * Stops (without consuming) at the first token that cannot start a sound
   * value.
   *
   * @returns An array of sound values; possibly empty
   */
  parseSounds() {
    const sounds = [];
    for (;;) {
      switch (tagOf(this.peek())) {
        case tokenizer_1.Tag.LeftBrace:
        case tokenizer_1.Tag.Terminal:
        case tokenizer_1.Tag.Variable:
          sounds.push(this.parseSound());
          break;
        default:
          return sounds;
      }
    }
  }

  /**
   * Parses the sound environment of a rule.
   * In general, a sound environment can be written as `/ A _ B`,
   * where `A` and `B` are sequences of sound values.
   *
   * Note that some rules don't specify a sound environment. In that case
   * nothing is consumed and `explicit` is `false`.
   *
   * @returns An object `{ left, right, explicit }`
   * @throws ParseError if a slash is present but no underscore follows the
   * left context
   */
  parseEnvironment() {
    // Check if there's a slash.
    if (tagOf(this.peek()) !== tokenizer_1.Tag.Slash) {
      return {
        left: [],
        right: [],
        explicit: false,
      };
    }
    this.expect(tokenizer_1.Tag.Slash, "expected '/'");
    const left = this.parseSounds();
    this.expect(tokenizer_1.Tag.Underscore, "expected '_'");
    const right = this.parseSounds();
    return { left, right, explicit: true };
  }

  /**
   * Parses a simple rule (one that has no constraints).
   * Simple rules can be written as:
   * - `a b c ~ d e f` (transformational rules)
   * - `a ~ b / c _ d` (SPE-style)
   *
   * @returns An object `{ left, right }` in which any explicit environment
   * has been folded into both sides
   * @throws ParseError if the rule is malformed, or if an SPE-style rule has
   * more than one sound value on a side or uses `#` outside the environment
   */
  parseSimpleRule() {
    // Remember where each side starts so that errors point at it.
    const leftToken = this.peek();
    const left = this.parseSounds();
    this.expect(tokenizer_1.Tag.Tilde, "expected '~'");
    const rightToken = this.peek();
    const right = this.parseSounds();
    const environment = this.parseEnvironment();

    // Perform some checks.
    if (environment.explicit) {
      // Each side can have at most one sound value in an SPE-style rule.
      if (left.length > 1) {
        abort(leftToken, "too many symbols on the left-hand side of an SPE-style rule");
      }
      if (right.length > 1) {
        abort(rightToken, "too many symbols on the right-hand side of an SPE-style rule");
      }
      // Disallow "#" outside of an environment context in SPE-style rules.
      for (const sound of left) {
        if (sound.includes("#")) {
          abort(leftToken, "unexpected '#' outside a sound environment in an SPE-style rule");
        }
      }
      for (const sound of right) {
        if (sound.includes("#")) {
          abort(rightToken, "unexpected '#' outside a sound environment in an SPE-style rule");
        }
      }
    }

    // Include sound environment information in `left` and `right`.
    // Effectively, this converts SPE-style rules into string rewriting rules.
    return {
      left: [...environment.left, ...left, ...environment.right],
      right: [...environment.left, ...right, ...environment.right],
    };
  }

  /**
   * Parses a simple statement.
   * A simple statement can be an assignment statement or a simple rule
   * (one that has no constraints).
   *
   * @returns An array that contains at most one rule; assignments yield `[]`
   * @throws ParseError if the statement is malformed
   */
  parseSimpleStatement() {
    // We can detect if the next statement is an assignment statement by
    // looking for an equal sign right after the first token.
    if (tagOf(this.peek(1)) === tokenizer_1.Tag.Equals) {
      this.parseAssignment();
      return [];
    }
    // We return an array for convenience,
    // when this method gets called in `parseStatement`.
    return [this.parseSimpleRule()];
  }

  /**
   * Parses language codes.
   * See docstring for `isLanguageCode` for details on what language codes are
   * supposed to look like. An underscore token is recorded as the code `"_"`.
   *
   * @returns An array of language codes; possibly empty
   */
  parseLanguageCodes() {
    const codes = [];
    // Stop parsing when we see something that doesn't look like a language
    // code.
    for (;;) {
      const lookahead = this.peek();
      switch (tagOf(lookahead)) {
        case tokenizer_1.Tag.Underscore:
          codes.push("_");
          this.move();
          break;
        case tokenizer_1.Tag.Terminal:
          if (!isLanguageCode(lookahead.literal)) {
            return codes;
          }
          codes.push(lookahead.literal);
          this.move();
          break;
        default:
          return codes;
      }
    }
  }

  /**
   * Parses a language selector/constraint.
   *
   * A language constraint consists of one or two language codes, and a dot.
   * If the constraint only contains one language, it is assumed that the
   * constraint applies to both sides of a rule.
   *
   * It's the caller's responsibility to check that the next tokens are part of
   * a language constraint.
   *
   * @returns An object `{ left, right }` holding a pair of language codes
   * @throws ParseError if the dot is missing, or if there are zero or more
   * than two language codes
   */
  parseConstraint() {
    const checkpoint = this.peek();
    const codes = this.parseLanguageCodes();
    this.expect(tokenizer_1.Tag.Dot, "expected '.'");

    // Check language codes.
    if (codes.length === 0) {
      abort(checkpoint, "expected a language code");
    }
    if (codes.length > 2) {
      abort(checkpoint, "too many language codes in constraint");
    }

    // A single code applies to both sides.
    const [left, right = left] = codes;
    return { left, right };
  }

  /**
   * Parses a multi-line rule.
   * Every line is a simple statement preceded by a "|".
   * The statements are parsed inside a fresh nested scope.
   *
   * @returns An array of rules
   * @throws ParseError if any nested statement is malformed
   */
  parseMultiLineRule() {
    // Enter new scope.
    this.scope = new scopes_1.Scope(this.scope);
    const rules = [];
    try {
      while (tagOf(this.peek()) === tokenizer_1.Tag.Pipe) {
        this.move();
        rules.push(...this.parseSimpleStatement());
        // Each line may be terminated by a newline (the last one need not be).
        if (tagOf(this.peek()) === tokenizer_1.Tag.Newline) {
          this.move();
        }
      }
    } finally {
      // Leave scope, even when a nested statement throws, so that the parser
      // is never left pointing at the inner scope.
      this.scope = this.scope.outer;
    }
    return rules;
  }

  /**
   * Parses a compound statement.
   * A compound statement is a rule that has language constraints/selectors.
   * It can be a single-line or a multi-line rule.
   *
   * @returns An object `{ constraint, rules }`
   * @throws ParseError if the constraint or any rule is malformed
   */
  parseCompoundStatement() {
    const constraint = this.parseConstraint();
    // Ignore newline.
    if (tagOf(this.peek()) === tokenizer_1.Tag.Newline) {
      this.move();
    }
    const rules = [];
    if (tagOf(this.peek()) === tokenizer_1.Tag.Pipe) {
      rules.push(...this.parseMultiLineRule());
    } else {
      rules.push(this.parseSimpleRule());
    }
    return { constraint, rules };
  }

  /**
   * Parses a statement.
   * A statement is either a simple or a compound statement.
   *
   * Hierarchy of statements:
   * - simple
   *   - assignment
   *   - simple rule (no language constraints)
   * - compound
   *   - single-line rule with language constraints
   *   - multi-line rule with constraints (has nested simple statements)
   *
   * @returns The ruleset defined by the statement
   * @throws ParseError if the statement is malformed
   */
  parseStatement() {
    // Peek at the next three tokens.
    // If there's a dot, then we know that there's an upcoming compound
    // statement.
    const hasDot = this.peekN(3).some((token) => token.tag === tokenizer_1.Tag.Dot);
    if (hasDot) {
      return this.parseCompoundStatement();
    }
    return (0, ir_1.createUnconstrainedRuleset)(this.parseSimpleStatement());
  }

  /**
   * Parses the source program.
   * A program is a sequence of statements, separated by newlines.
   *
   * @returns An object `{ rulesets }`
   * @throws ParseError if the source program has an error
   */
  parseProgram() {
    const rulesets = [];
    while (!this.isDone()) {
      if (this.peek().tag === tokenizer_1.Tag.Newline) {
        this.move();
        continue;
      }
      // Otherwise, the next line must be a statement.
      // We throw out empty rules, which are returned by assignment statements.
      const ruleset = this.parseStatement();
      if (ruleset.rules.length > 0) {
        rulesets.push(ruleset);
      }
    }
    return { rulesets };
  }
}
exports.Parser = Parser;

/**
 * Parses code.
 *
 * @param code - Code to be parsed
 * @returns An intermediate representation if there are no errors in the code
 * @throws ParseError if the code has an error
 */
function parse(code) {
  const parser = new Parser(code);
  return parser.parseProgram();
}
exports.parse = parse;

/**
 * Returns the tag of a token, tolerating a missing token.
 *
 * @param token - A token, or `null`/`undefined` at end of input
 * @returns The token's tag, or `undefined` if there is no token
 */
function tagOf(token) {
  return token == null ? undefined : token.tag;
}

/**
 * Aborts the parser and throws a `ParseError`.
 * This function never returns.
 *
 * @param token - Location of error; `null`/`undefined` means end of input
 * @param message - Error message
 * @throws ParseError always
 */
function abort(token, message) {
  if (token == null) {
    throw new ParseError(`${message}; unexpected end-of-file`);
  }
  const { line, column, literal } = token;
  throw new ParseError(`${message} at line ${line}, column ${column}; found: ${literal}`);
}

/**
 * Checks if string looks like a language code.
 * This doesn't check if the language code is valid.
 *
 * @param text - Candidate string
 * @returns `true` if `text` is `_` or has the shape of a language code
 */
function isLanguageCode(text) {
  return LANGUAGE_CODE_PATTERN.test(text);
}
//# sourceMappingURL=parser.js.map