@palasimi/ipa-cluster
Version:
Cluster words with similar IPA transcriptions together
276 lines • 6.75 kB
JavaScript
"use strict";
// SPDX-License-Identifier: GPL-3.0-or-later
// Copyright (c) 2023 Levi Gruspe
// Code tokenizer.
Object.defineProperty(exports, "__esModule", { value: true });
exports.tokenize = exports.Tag = exports.EOFError = void 0;
/**
 * Exception thrown by `Tokenizer` when it runs out of tokens.
 *
 * The `name` property is set explicitly so that logs, stack traces and
 * `String(error)` identify the error as `EOFError` instead of a generic
 * `Error`. `instanceof EOFError` checks keep working as before.
 */
class EOFError extends Error {
    /**
     * Accepts the same arguments as the built-in `Error` constructor
     * (an optional message and an optional options object).
     */
    constructor(...args) {
        super(...args);
        this.name = "EOFError";
    }
}
exports.EOFError = EOFError;
/**
 * Enumeration of token types.
 *
 * Every member is stored twice on the object: once as name -> number and
 * once as number -> name. Numbers are assigned from 0 in the order the
 * names are listed below.
 */
var Tag;
(function (members) {
    [
        "Dot",
        "Equals",
        "LeftBrace",
        "Newline",
        "Pipe",
        "RightBrace",
        "Slash",
        "Tilde",
        "Underscore",
        "Reserved",
        // Space-delimited strings
        "Terminal",
        "Variable",
    ].forEach((label, ordinal) => {
        members[label] = ordinal;
        members[ordinal] = label;
    });
})(Tag || (exports.Tag = Tag = {}));
/**
 * Lookup table from each separator symbol to the tag of the token it
 * produces.
 */
const separators = new Map()
    .set(".", Tag.Dot)
    .set("/", Tag.Slash)
    .set("=", Tag.Equals)
    .set("\n", Tag.Newline)
    .set("_", Tag.Underscore)
    .set("{", Tag.LeftBrace)
    .set("|", Tag.Pipe)
    .set("}", Tag.RightBrace)
    .set("~", Tag.Tilde);
/**
 * Set of reserved symbols.
 * The language does not use these symbols yet.
 *
 * Each character of the string below becomes one member of the set.
 */
const reserved = new Set("!$%&'()*+,:;<>?@[\\]^`\"");
/**
 * Special terminal symbols.
 * A symbol in this set may not appear inside a variable name or inside
 * another terminal symbol (e.g. an IPA segment).
 */
const terminals = new Set();
terminals.add("#");
/**
 * Set of word breakpoints: the empty string plus every reserved symbol,
 * separator symbol and special terminal symbol.
 * None of these can be part of a string or a variable name.
 */
const breakpoints = new Set([""]);
for (const symbols of [reserved, separators.keys(), terminals]) {
    for (const symbol of symbols) {
        breakpoints.add(symbol);
    }
}
/**
 * Scanner that reads code one character at a time and produces tokens.
 * Keeps track of the zero-based line and column of the read position.
 */
class Tokenizer {
    /**
     * @param code - Code to be tokenized
     */
    constructor(code) {
        Object.assign(this, { line: 0, column: 0, index: 0, code });
    }
    /**
     * Returns `true` once the read position has reached the end of the
     * input, i.e. when no characters are left.
     */
    isDone() {
        return this.code.length <= this.index;
    }
    /**
     * Looks at the character `offset` steps ahead of the read position
     * without consuming it.
     * Returns an empty string when that position has no character.
     */
    peek(offset = 0) {
        const found = this.code[this.index + offset];
        return found ? found : "";
    }
    /**
     * Consumes and returns the current character.
     * A newline starts a new line at column 0; any other character
     * advances the column by one.
     *
     * Returns an empty string, without advancing, when the input is
     * exhausted.
     */
    move() {
        if (this.isDone()) {
            return "";
        }
        const consumed = this.peek();
        this.index += 1;
        if (consumed === "\n") {
            this.line += 1;
            this.column = 0;
        }
        else {
            this.column += 1;
        }
        return consumed;
    }
    /**
     * Skips leading whitespace, then consumes characters up to (but not
     * including) the next whitespace character or word breakpoint.
     * Returns the consumed text as a token positioned at its first
     * character: a `Variable` if the text is a valid name, otherwise a
     * `Terminal`.
     */
    moveUntilBreakpoint() {
        while (isSpace(this.peek())) {
            this.move();
        }
        // Remember where the word starts before consuming it.
        const { line, column } = this;
        let word = "";
        let next = this.peek();
        while (!isSpace(next) && !breakpoints.has(next)) {
            word += next;
            this.move();
            next = this.peek();
        }
        const literal = word.trim();
        let tag = Tag.Terminal;
        if (isName(literal)) {
            tag = Tag.Variable;
        }
        return { line, column, tag, literal };
    }
    /**
     * Consumes the rest of the current line.
     * Stops in front of the newline (or at the end of the input) without
     * consuming the newline itself.
     */
    skip() {
        while (this.peek() !== "" && this.peek() !== "\n") {
            this.move();
        }
    }
    /**
     * Builds a token located at the current read position.
     */
    createToken(tag, literal) {
        const { line, column } = this;
        return { line, column, tag, literal };
    }
    /**
     * Builds a token at the current position, then consumes one character.
     * Returns the token.
     */
    emit(tag, literal) {
        const pending = this.createToken(tag, literal);
        this.move();
        return pending;
    }
    /**
     * Emits a word (e.g. strings and variables).
     */
    emitWord() {
        return this.moveUntilBreakpoint();
    }
    /**
     * Produces the next token in the code.
     * Throws an `EOFError` if there are no tokens left.
     */
    nextToken() {
        for (;;) {
            // Remove insignificant whitespace.
            while (isSpace(this.peek())) {
                this.move();
            }
            // Check if there are tokens left.
            if (this.isDone()) {
                throw new EOFError();
            }
            // A comment starts with "--" and runs to the end of the line.
            const atComment = this.peek() === "-" && this.peek(1) === "-";
            if (!atComment) {
                break;
            }
            this.skip();
        }
        const symbol = this.peek();
        // Reserved symbols.
        if (reserved.has(symbol)) {
            return this.emit(Tag.Reserved, symbol);
        }
        // Separators.
        if (separators.has(symbol)) {
            return this.emit(separators.get(symbol), symbol);
        }
        // Special terminal symbols.
        if (terminals.has(symbol)) {
            return this.emit(Tag.Terminal, symbol);
        }
        // Variable names and terminals.
        return this.emitWord();
    }
}
/**
 * Tokenizes code.
 * Pulls tokens from a `Tokenizer` until it signals the end of the input
 * with an `EOFError`, and returns them as an array of `Token`s.
 * Any other error is rethrown unchanged.
 */
function tokenize(code) {
    const scanner = new Tokenizer(code);
    const collected = [];
    let exhausted = false;
    while (!exhausted) {
        try {
            collected.push(scanner.nextToken());
        }
        catch (problem) {
            if (!(problem instanceof EOFError)) {
                throw problem;
            }
            exhausted = true;
        }
    }
    return collected;
}
exports.tokenize = tokenize;
/**
 * Checks if the string consists of whitespace.
 * A lone newline is significant to the language, so it is not counted
 * as whitespace. The empty string is not whitespace either.
 */
function isSpace(text) {
    return text !== "\n" && /^\s+$/.test(text);
}
/**
 * Checks if the string is a valid variable name: an uppercase ASCII
 * letter followed by zero or more ASCII letters or digits.
 */
function isName(text) {
    return /^[A-Z][A-Za-z0-9]*$/.test(text);
}
//# sourceMappingURL=tokenizer.js.map