UNPKG

@palasimi/ipa-cluster

Version:

Cluster words with similar IPA transcriptions together

276 lines 6.75 kB
"use strict"; // SPDX-License-Identifier: GPL-3.0-or-later // Copyright (c) 2023 Levi Gruspe // Code tokenizer. Object.defineProperty(exports, "__esModule", { value: true }); exports.tokenize = exports.Tag = exports.EOFError = void 0; /** * Exception thrown by `Tokenizer` when it runs out of tokens. */ class EOFError extends Error { } exports.EOFError = EOFError; /** * Represents a type of token. */ var Tag; (function (Tag) { Tag[Tag["Dot"] = 0] = "Dot"; Tag[Tag["Equals"] = 1] = "Equals"; Tag[Tag["LeftBrace"] = 2] = "LeftBrace"; Tag[Tag["Newline"] = 3] = "Newline"; Tag[Tag["Pipe"] = 4] = "Pipe"; Tag[Tag["RightBrace"] = 5] = "RightBrace"; Tag[Tag["Slash"] = 6] = "Slash"; Tag[Tag["Tilde"] = 7] = "Tilde"; Tag[Tag["Underscore"] = 8] = "Underscore"; Tag[Tag["Reserved"] = 9] = "Reserved"; // Space-delimited strings Tag[Tag["Terminal"] = 10] = "Terminal"; Tag[Tag["Variable"] = 11] = "Variable"; })(Tag || (exports.Tag = Tag = {})); /** * Map of seperator symbols and their corresponding tags. */ const separators = new Map([ [".", Tag.Dot], ["/", Tag.Slash], ["=", Tag.Equals], ["\n", Tag.Newline], ["_", Tag.Underscore], ["{", Tag.LeftBrace], ["|", Tag.Pipe], ["}", Tag.RightBrace], ["~", Tag.Tilde], ]); /** * Set of reserved symbols. * These symbols aren't used yet by the language. */ const reserved = new Set([ "!", "$", "%", "&", "'", "(", ")", "*", "+", ",", ":", ";", "<", ">", "?", "@", "[", "\\", "]", "^", "`", '"', ]); /** * Special terminal symbols. * These symbols are not allowed to be a substring in variable names and other * terminal symbols (e.g. IPA segments). */ const terminals = new Set(["#"]); /** * Set of word breakpoints. * These symbols cannot be used in strings and variable names. */ const breakpoints = new Set([ "", ...reserved, ...separators.keys(), ...terminals, ]); /** * Code tokenizer. */ class Tokenizer { /** * @param code - Code to be tokenized */ constructor(code) { this.line = 0; this.column = 0; this.index = 0; this.code = code; } /** * Checks if there are characters left in the input. */ isDone() { return this.index >= this.code.length; } /** * Returns the character `offset` steps from the current one. * If there are no characters left, returns an empty string. */ peek(offset = 0) { return this.code[this.index + offset] || ""; } /** * Returns the current character and moves the pointer to the next. * Also updates the line and column numbers if needed. * * If there are no characters left in the input, returns an empty string. */ move() { if (this.isDone()) { return ""; } const char = this.peek(); this.index++; this.column++; if (char === "\n") { this.column = 0; this.line++; } return char; } /** * Moves the pointer past the next word breakpoint. * Returns the string in-between (stripped of whitespace) as a token. */ moveUntilBreakpoint() { while (isSpace(this.peek())) { this.move(); } const line = this.line; const column = this.column; const chars = []; for (;;) { const char = this.peek(); if (isSpace(char) || breakpoints.has(char)) { break; } chars.push(char); this.move(); } const literal = chars.join("").trim(); return { line, column, tag: isName(literal) ? Tag.Variable : Tag.Terminal, literal, }; } /** * Skip to the end of the line. * Does not skip over newlines. */ skip() { for (;;) { const char = this.peek(); if (char === "" || char === "\n") { break; } this.move(); } } /** * Creates a token. */ createToken(tag, literal) { return { line: this.line, column: this.column, tag, literal, }; } /** * Emits the specified token and advances the tokenizer by one step. */ emit(tag, literal) { const token = this.createToken(tag, literal); this.move(); return token; } /** * Emits a word (e.g. strings and variables). */ emitWord() { return this.moveUntilBreakpoint(); } /** * Emits the next token in the code. * Throws an `EOFError` if there are no tokens left. */ nextToken() { // Remove insignificant whitespace. while (isSpace(this.peek())) { this.move(); } // Check if there are tokens left. if (this.isDone()) { throw new EOFError(); } const lookahead = this.peek(); // Ignore comments. if (lookahead === "-" && this.peek(1) === "-") { this.skip(); return this.nextToken(); } // Reserved symbols. if (reserved.has(lookahead)) { return this.emit(Tag.Reserved, lookahead); } // Separators. if (separators.has(lookahead)) { return this.emit(separators.get(lookahead), lookahead); } // Special terminal symbols. if (terminals.has(lookahead)) { return this.emit(Tag.Terminal, lookahead); } // Variable names and terminals. return this.emitWord(); } } /** * Tokenizes code. * Returns an array of `Token`s. */ function tokenize(code) { const tokens = []; const tokenizer = new Tokenizer(code); for (;;) { try { tokens.push(tokenizer.nextToken()); } catch (error) { if (error instanceof EOFError) { break; } throw error; } } return tokens; } exports.tokenize = tokenize; /** * Checks if the string is a whitespace character. * Since newlines are significant, we won't treat them as whitespace. */ function isSpace(text) { if (text === "\n") { return false; } const re = /^\s+$/; return re.test(text); } /** * Checks if the string is a valid variable name. * Variable names must be capitalized, and must only consist of alphanumeric * symbols. */ function isName(text) { const re = /^[A-Z][A-Za-z0-9]*$/; return re.test(text); } //# sourceMappingURL=tokenizer.js.map