@zsnout/ithkuil
Version:
A set of tools which can generate and parse romanized Ithkuil text and which can generate Ithkuil script from text and JSON data.
138 lines (137 loc) • 5.46 kB
JavaScript
import { ALL_SUPPLETIVE_ADJUNCT_TYPES, has, } from "../generate/index.js";
import { parseWord } from "../parse/index.js";
/**
 * Reports whether a parsed formative is concatenated to the word that follows
 * it, i.e. it has type "UNF/C" and a concatenation type of 1 or 2.
 *
 * @param formative A parse result that has a `root` property.
 * @returns `true` if the next word belongs to the same chain.
 */
function isConcatenatedFormative(formative) {
  return (
    formative.type === "UNF/C" &&
    (formative.concatenationType === 1 || formative.concatenationType === 2)
  );
}

/**
 * Reports whether a non-formative parse result consumes the next raw word as
 * its argument: either the string "SPF:START" or an object whose `type` is one
 * of `ALL_SUPPLETIVE_ADJUNCT_TYPES`.
 *
 * @param result A non-null parse result without a `root` property.
 * @returns `true` if the following word must be taken verbatim.
 */
function opensCarrier(result) {
  return (
    result === "SPF:START" ||
    (typeof result === "object" &&
      "type" in result &&
      has(ALL_SUPPLETIVE_ADJUNCT_TYPES, result.type))
  );
}

/**
 * Parses a single sentence (text without sentence-ending punctuation) into a
 * list of items.
 *
 * The sentence is split into words, each word is passed to `parseWord`, and a
 * small state machine groups the results:
 *
 * - `{ type: "word", word, source, properNoun? }` for a standalone word. A
 *   formative whose root is "s", "SPF:START", and suppletive adjuncts are
 *   treated as carriers: the next word is attached unparsed as `properNoun`.
 * - `{ type: "chain", words, properNoun? }` for a run of concatenated
 *   formatives closed by a non-concatenated formative. If any member has the
 *   root "s", the word after the chain is attached unparsed as `properNoun`.
 * - `{ type: "brokenChain", words }` for a chain that was interrupted by a
 *   non-formative or by the end of the sentence.
 *
 * @param text The sentence to parse.
 * @returns `{ ok: true, value }` with the parsed items, or
 *   `{ ok: false, reason }` if a word is invalid or anything throws.
 */
function parseSentence(text) {
  try {
    const words = text.match(/[\p{ID_Start}\d\u02BC\u0027\u2019'_][\p{ID_Start}\p{ID_Continue}\d\u02BC\u0027\u2019'_]*/gu);
    if (!words) {
      return { ok: true, value: [] };
    }

    const output = [];
    // What the previous word(s) left pending: nothing ("word"), a carrier
    // waiting for its proper noun, or an open concatenation chain.
    let state = { type: "word" };

    for (const word of words) {
      // A pending carrier swallows the next word verbatim; it is never parsed.
      if (state.type === "carrier") {
        output.push({
          type: "word",
          word: state.word,
          source: state.source,
          properNoun: word,
        });
        state = { type: "word" };
        continue;
      }

      // Likewise for a finished chain that contained a carrier root.
      if (state.type === "chain" && state.expects === "properNoun") {
        output.push({
          type: "chain",
          words: state.words,
          properNoun: word,
        });
        state = { type: "word" };
        continue;
      }

      const result = parseWord(word);
      // `== null` is deliberate: it rejects both `null` and `undefined`.
      if (result == null) {
        return { ok: false, reason: `“${word}” is not a valid word.` };
      }

      const isFormative = typeof result === "object" && "root" in result;

      if (!isFormative) {
        // Only formatives can continue a chain, so an open one is broken here.
        if (state.type === "chain") {
          output.push({ type: "brokenChain", words: state.words });
        }
        if (opensCarrier(result)) {
          state = { type: "carrier", word: result, source: word };
        } else {
          output.push({ type: "word", word: result, source: word });
          state = { type: "word" };
        }
        continue;
      }

      const hasCarrierRoot = result.root === "s";
      const continuesChain = isConcatenatedFormative(result);

      if (state.type === "chain") {
        if (hasCarrierRoot) {
          state.includesCarrier = true;
        }
        state.words.push([word, result]);
        if (continuesChain) {
          continue;
        }
        // This formative closes the chain.
        if (state.includesCarrier) {
          state.expects = "properNoun";
        } else {
          output.push({ type: "chain", words: state.words });
          state = { type: "word" };
        }
        continue;
      }

      if (continuesChain) {
        state = {
          type: "chain",
          expects: "formative",
          includesCarrier: hasCarrierRoot,
          words: [[word, result]],
        };
      } else if (hasCarrierRoot) {
        state = { type: "carrier", word: result, source: word };
      } else {
        output.push({ type: "word", word: result, source: word });
        state = { type: "word" };
      }
    }

    // Flush whatever was still pending when the sentence ended.
    if (state.type === "carrier") {
      // A carrier with nothing after it is emitted without `properNoun`.
      output.push({
        type: "word",
        word: state.word,
        source: state.source,
      });
    } else if (state.type === "chain") {
      // NOTE(review): this also covers a complete chain that was only waiting
      // for its proper noun; it is reported as "brokenChain" too, unlike the
      // lone-carrier case above. Confirm that this is intended.
      output.push({ type: "brokenChain", words: state.words });
    }

    return { ok: true, value: output };
  } catch (error) {
    // Any exception (e.g. thrown by `parseWord`) is reported as a failed
    // result instead of propagating to the caller.
    return {
      ok: false,
      reason: error instanceof Error ? error.message : String(error),
    };
  }
}
/**
 * Matches a sentence-juncture affix at the beginning of a word.
 *
 * Group 1 captures what precedes the affix: the start of the input or a single
 * character that cannot be part of a word. Group 2 captures the affix together
 * with the character after it: "çç", "ç" followed by "w" or a vowel, or "çë"
 * followed by a word character.
 *
 * Because "ë" is also listed in the vowel class, the second alternative always
 * wins for "çë…" and captures just "çë"; the replacement callback in
 * `parseSentences` produces the same output either way.
 */
const sentenceJunctureAffix = /(^|[^\p{ID_Start}\p{ID_Continue}\d\u02BC\u0027\u2019'_])(çç|ç[waeiouäëöüìùáéíóúâêôû]|çë[\p{ID_Start}\p{ID_Continue}\d\u02BC\u0027\u2019'_])/gu;

/**
 * Parses romanized Ithkuilic text into a series of items.
 *
 * The text is split into sentences at ".", "!" and "?" as well as in front of
 * any word carrying a sentence-juncture affix (the affix itself is stripped).
 * Blank sentences are skipped. Each remaining sentence is parsed with
 * `parseSentence`, and a `{ type: "sentenceBreak" }` item is inserted between
 * consecutive sentences.
 *
 * @param text The text to be parsed.
 * @returns A `Result` containing an array of `ParsedItem`s, or the first
 *   failed result returned by `parseSentence`.
 */
export function parseSentences(text) {
  const normalized = text
    // Fold the decomposed spelling ("c" + U+0327 COMBINING CEDILLA) into the
    // single precomposed character U+00E7. Escapes are used on purpose: the
    // two spellings look identical, and a literal would silently stop working
    // if an editor or formatter normalized this file.
    .replace(/c\u0327/g, "\u00E7")
    // Turn every juncture affix into an explicit ". " so that the split below
    // sees a sentence boundary, and restore the word's real first letter.
    .replace(sentenceJunctureAffix, (_, previousChar, junctureAffix) => {
      let wordStart;
      if (junctureAffix === "çç") {
        wordStart = "y";
      } else if (junctureAffix.startsWith("çë")) {
        // Drop both "ç" and the "ë" that follows it.
        wordStart = junctureAffix.slice(2);
      } else {
        // Drop only the "ç"; the "w" or vowel belongs to the word.
        wordStart = junctureAffix.slice(1);
      }
      return `${previousChar}. ${wordStart}`;
    });

  const sentences = normalized
    .split(/[.!?]/)
    .filter((sentence) => sentence.trim() !== "");

  const output = [];
  for (const [index, sentence] of sentences.entries()) {
    const result = parseSentence(sentence);
    if (!result.ok) {
      return result;
    }
    if (index > 0) {
      output.push({ type: "sentenceBreak" });
    }
    output.push(...result.value);
  }
  return { ok: true, value: output };
}