UNPKG

@zsnout/ithkuil

Version:

A set of tools which can generate and parse romanized Ithkuil text and which can generate Ithkuil script from text and JSON data.

138 lines (137 loc) 5.46 kB
import { ALL_SUPPLETIVE_ADJUNCT_TYPES, has, } from "../generate/index.js";
import { parseWord } from "../parse/index.js";

/**
 * Reports whether a parsed formative has `type` "UNF/C" together with a
 * `concatenationType` of 1 or 2. Such a formative opens or continues a chain;
 * any other formative terminates one.
 *
 * @param formative A `parseWord` result that has a `root` property.
 * @returns `true` if the formative opens or continues a chain.
 */
function isConcatenated(formative) {
  return (
    formative.type === "UNF/C" &&
    (formative.concatenationType === 1 || formative.concatenationType === 2)
  );
}

/**
 * Parses the words of a single sentence into a list of items.
 *
 * The loop is a small state machine driven by `wordType`:
 * - `"word"`: no pending context; the next word is parsed on its own.
 * - `"carrier"`: the previous word was a formative with root "s", the string
 *   "SPF:START", or an object whose `type` is listed in
 *   `ALL_SUPPLETIVE_ADJUNCT_TYPES`. The next word is taken verbatim as its
 *   `properNoun` and is never passed to `parseWord`.
 * - `"chain"`: a run of concatenated formatives is being collected. When the
 *   chain is closed by a non-concatenated formative and any member had root
 *   "s", the following word is taken verbatim as the chain's `properNoun`.
 *
 * @param text The text of one sentence (no sentence-breaking punctuation).
 * @returns `{ ok: true, value }` with the parsed items, or
 *   `{ ok: false, reason }` if a word is invalid or parsing throws.
 */
function parseSentence(text) {
  try {
    const words = text.match(/[\p{ID_Start}\d\u02BC\u0027\u2019'_][\p{ID_Start}\p{ID_Continue}\d\u02BC\u0027\u2019'_]*/gu);
    if (!words) {
      return { ok: true, value: [] };
    }

    const output = [];
    let wordType = { type: "word" };

    for (const word of words) {
      // A pending carrier consumes the next word as its proper noun.
      if (wordType.type === "carrier") {
        output.push({
          type: "word",
          word: wordType.word,
          source: wordType.source,
          properNoun: word,
        });
        wordType = { type: "word" };
        continue;
      }

      // A finished chain that contained a carrier also consumes a proper noun.
      if (wordType.type === "chain" && wordType.expects === "properNoun") {
        output.push({
          type: "chain",
          words: wordType.words,
          properNoun: word,
        });
        wordType = { type: "word" };
        continue;
      }

      const result = parseWord(word);
      // `== null` deliberately matches both `null` and `undefined`.
      if (result == null) {
        return { ok: false, reason: `“${word}” is not a valid word.` };
      }

      if (typeof result === "object" && "root" in result) {
        // Formative.
        if (wordType.type === "chain") {
          if (result.root === "s") {
            wordType.includesCarrier = true;
          }
          wordType.words.push([word, result]);

          // A non-concatenated formative closes the chain.
          if (!isConcatenated(result)) {
            if (wordType.includesCarrier) {
              wordType.expects = "properNoun";
            } else {
              output.push({ type: "chain", words: wordType.words });
              wordType = { type: "word" };
            }
          }
        } else if (isConcatenated(result)) {
          wordType = {
            type: "chain",
            expects: "formative",
            includesCarrier: result.root === "s",
            words: [[word, result]],
          };
        } else if (result.root === "s") {
          wordType = { type: "carrier", word: result, source: word };
        } else {
          output.push({ type: "word", word: result, source: word });
          wordType = { type: "word" };
        }
      } else {
        // Non-formative: any chain in progress never received its closing
        // formative, so it is reported as broken before handling this word.
        if (wordType.type === "chain") {
          output.push({ type: "brokenChain", words: wordType.words });
        }

        const startsCarrier =
          result === "SPF:START" ||
          (typeof result === "object" &&
            "type" in result &&
            has(ALL_SUPPLETIVE_ADJUNCT_TYPES, result.type));

        if (startsCarrier) {
          wordType = { type: "carrier", word: result, source: word };
        } else {
          output.push({ type: "word", word: result, source: word });
          wordType = { type: "word" };
        }
      }
    }

    // Flush whatever is still pending at the end of the sentence.
    if (wordType.type === "carrier") {
      // Carrier with no following word: emitted without a `properNoun`.
      output.push({
        type: "word",
        word: wordType.word,
        source: wordType.source,
      });
    } else if (wordType.type === "chain") {
      output.push({ type: "brokenChain", words: wordType.words });
    }

    return { ok: true, value: output };
  } catch (error) {
    return {
      ok: false,
      reason: error instanceof Error ? error.message : String(error),
    };
  }
}

// Matches a sentence-juncture prefix at the start of a word. Group 1 is the
// preceding non-word character (or the empty string at the start of the text);
// group 2 is "çç", "ç" plus one of the listed vowels/"w", or "çë" plus one
// word character.
const sentenceJunctureAffix = /(^|[^\p{ID_Start}\p{ID_Continue}\d\u02BC\u0027\u2019'_])(çç|ç[waeiouäëöüìùáéíóúâêôû]|çë[\p{ID_Start}\p{ID_Continue}\d\u02BC\u0027\u2019'_])/gu;

/**
 * Parses romanized Ithkuilic text into a series of items.
 *
 * The text is split on `.`, `!`, and `?`; blank segments are skipped, and a
 * `{ type: "sentenceBreak" }` item is inserted between consecutive sentences.
 * A word-initial sentence-juncture prefix (see `sentenceJunctureAffix`) is
 * rewritten into an explicit ". " break before splitting.
 *
 * @param text The text to be parsed.
 * @returns A `Result` containing an array of `ParsedItem`s. The first sentence
 *   that fails to parse short-circuits and its failure result is returned.
 */
export function parseSentences(text) {
  const normalized = text
    // Fold the decomposed spelling (c + U+0327 COMBINING CEDILLA) into the
    // single precomposed character U+00E7. Both sides are written as escapes
    // because the two spellings render identically and a literal would
    // silently become a no-op if this file were ever Unicode-normalized.
    .replace(/c\u0327/g, "\u00E7")
    .replace(sentenceJunctureAffix, (_, previousChar, junctureAffix) => {
      let remainder;
      if (junctureAffix === "çç") {
        remainder = "y";
      } else if (junctureAffix.startsWith("çë")) {
        // Drop the whole "çë" prefix, keeping whatever follows it.
        remainder = junctureAffix.slice(2);
      } else {
        // Drop only the "ç", keeping the vowel (or "w") that follows it.
        remainder = junctureAffix.slice(1);
      }
      return previousChar + ". " + remainder;
    });

  const output = [];
  let isFirst = true;

  for (const sentence of normalized.split(/[.!?]/g)) {
    if (sentence.trim() === "") {
      continue;
    }

    const result = parseSentence(sentence);
    if (!result.ok) {
      return result;
    }

    if (!isFirst) {
      output.push({ type: "sentenceBreak" });
    }
    output.push(...result.value);
    isFirst = false;
  }

  return { ok: true, value: output };
}