// @lunarisapp/language
// Version: (not specified)
// A utility library for core linguistic breakdown: vowels, consonants, words, and sentences.
// 691 lines (683 loc) • 13.4 kB
// JavaScript
;
// Short aliases for the Object reflection primitives used by the module-interop helpers.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Defines one enumerable accessor property on `target` for every enumerable
 * key of `all`, using the function stored under that key as the getter.
 *
 * The getter is not called here; it only runs when the property is read.
 *
 * @param {object} target - Object that receives the accessor properties.
 * @param {Object<string, Function>} all - Map of property name to getter function.
 */
var __export = (target, all) => {
  for (const exportName in all) {
    const getter = all[exportName];
    __defProp(target, exportName, { enumerable: true, get: getter });
  }
};
/**
 * Mirrors the own properties of `from` onto `to` as live getters.
 *
 * Each copied property reads through to `from[key]` on every access, so later
 * changes to `from` stay visible. Keys that `to` already owns, and the key
 * equal to `except`, are skipped. A copied property is enumerable when the
 * source property is enumerable.
 *
 * @param {object} to - Destination object; also the return value.
 * @param {*} from - Source; ignored unless it is a non-null object or a function.
 * @param {string} [except] - Single key to leave out.
 * @param {PropertyDescriptor} [desc] - Scratch slot for the current source descriptor.
 * @returns {object} `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopyable = (from && typeof from === "object") || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  __getOwnPropNames(from).forEach((key) => {
    if (key === except || __hasOwnProp.call(to, key)) {
      return;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable,
    });
  });
  return to;
};
/**
 * Builds the object to publish as a CommonJS module: a fresh object carrying a
 * non-enumerable `__esModule: true` marker, onto which the properties of `mod`
 * are copied with `__copyProps`.
 *
 * @param {object} mod - Namespace object holding the exports.
 * @returns {object} The new exports object.
 */
var __toCommonJS = (mod) => {
  const exportsObject = __defProp({}, "__esModule", { value: true });
  return __copyProps(exportsObject, mod);
};
// src/index.ts
// Namespace object that collects the package's public exports.
var index_exports = {};
// Each export is registered as a getter thunk. The arrow functions only run
// when the property is read, so they can name bindings that are assigned
// further down in this file.
__export(index_exports, {
consonants: () => consonants,
contractionsRegexSeq: () => contractionsRegexSeq,
getSentences: () => getSentences,
getWords: () => getWords,
languages: () => languages,
removePunctuation: () => removePunctuation,
vowels: () => vowels
});
// Publish a copy of the namespace (tagged with `__esModule`) as the CommonJS module.
module.exports = __toCommonJS(index_exports);
// src/features/languages.ts
/**
 * Language and locale codes covered by this package, in listing order.
 *
 * Codes are either a bare two-letter language code ("en") or a language code
 * with an underscore-separated region or script suffix ("en_GB", "sr_Latn").
 */
var languages = [
  "af", "as", "be", "bg", "ca", "da",
  "de", "de_DE", "de_CH", "de_AT",
  "en", "en_US", "en_GB",
  "eo", "es", "et", "fr", "gl", "hr", "hu", "id", "is", "it",
  "kn", "lt", "lv", "mn", "mr", "nb", "nl", "nn", "pa", "pl",
  "pt", "pt_PT", "pt_BR",
  "ro", "ru", "sa", "sk", "sl", "sq",
  "sr", "sr_Latn",
  "sv", "te", "th", "uk", "zu",
];
// src/features/vowels.ts
// The five basic Latin vowels; shared by reference by every language that adds nothing to them.
var VOWEL_LATIN_GROUP = ["a", "e", "i", "o", "u"];
// Basic Latin vowels plus the umlauted vowels and "y"; shared by the German variants.
var VOWEL_GERMANIC_UMLAUTS_GROUP = [
  ...VOWEL_LATIN_GROUP,
  "\xE4",
  "\xF6",
  "\xFC",
  "y",
];

/**
 * Vowel letters per language code.
 *
 * Keys are language/locale codes; values are arrays of single-letter strings
 * (lowercase for the cased scripts). Languages that use only the basic Latin
 * vowels all point at the same `VOWEL_LATIN_GROUP` array instance, and the
 * German variants share `VOWEL_GERMANIC_UMLAUTS_GROUP`, so mutating one of
 * those entries affects every language that shares it.
 */
var vowels = {
  af: VOWEL_LATIN_GROUP,
  as: ["\u0985", "\u0986", "\u0987", "\u0988", "\u0989", "\u098A", "\u098B", "\u098F", "\u0990", "\u0993", "\u0994"],
  be: ["\u0430", "\u0435", "\u0451", "\u0456", "\u043E", "\u0443", "\u044B", "\u044D", "\u044E", "\u044F"],
  bg: ["\u0430", "\u044A", "\u043E", "\u0443", "\u0435", "\u0438", "\u044E", "\u044F"],
  ca: VOWEL_LATIN_GROUP,
  da: [...VOWEL_LATIN_GROUP, "y", "\xE6", "\xF8", "\xE5"],
  de: VOWEL_GERMANIC_UMLAUTS_GROUP,
  de_DE: VOWEL_GERMANIC_UMLAUTS_GROUP,
  de_CH: VOWEL_GERMANIC_UMLAUTS_GROUP,
  de_AT: VOWEL_GERMANIC_UMLAUTS_GROUP,
  en: VOWEL_LATIN_GROUP,
  en_US: VOWEL_LATIN_GROUP,
  en_GB: VOWEL_LATIN_GROUP,
  eo: VOWEL_LATIN_GROUP,
  es: [...VOWEL_LATIN_GROUP, "\xE1", "\xE9", "\xED", "\xF3", "\xFA", "\xFC"],
  et: [...VOWEL_LATIN_GROUP, "\xF5", "\xE4", "\xF6", "\xFC"],
  fr: [
    "a", "e", "i", "o", "u", "y",
    "\xE0", "\xE2", "\xE6", "\xE8", "\xE9", "\xEA", "\xEB",
    "\xEE", "\xEF", "\xF4", "\u0153", "\xF9", "\xFB", "\xFC",
  ],
  gl: VOWEL_LATIN_GROUP,
  hr: VOWEL_LATIN_GROUP,
  hu: [...VOWEL_LATIN_GROUP, "\xF6", "\xFC"],
  id: VOWEL_LATIN_GROUP,
  // "\xF0" (eth) is deliberately absent: it is a consonant and is listed in
  // the Icelandic consonant table.
  is: [
    "a", "e", "i", "o", "u", "y",
    "\xE1", "\xE9", "\xED", "\xF3", "\xFA", "\xFD", "\xE6", "\xF6",
  ],
  it: VOWEL_LATIN_GROUP,
  kn: [
    "\u0C85", "\u0C86", "\u0C87", "\u0C88", "\u0C89", "\u0C8A", "\u0C8B",
    "\u0C8E", "\u0C8F", "\u0C90", "\u0C92", "\u0C93", "\u0C94",
  ],
  lt: [...VOWEL_LATIN_GROUP, "\u0105", "\u0119", "\u0117", "\u012F", "\u0173", "\u016B"],
  lv: VOWEL_LATIN_GROUP,
  mn: ["\u0430", "\u0435", "\u0438", "\u043E", "\u0443", "\u04AF", "\u04E9", "\u044D", "\u044F", "\u0451", "\u044E"],
  mr: ["\u0905", "\u0906", "\u0907", "\u0908", "\u0909", "\u090A", "\u090B", "\u090F", "\u0910", "\u0913", "\u0914"],
  nb: [...VOWEL_LATIN_GROUP, "y", "\xE6", "\xF8", "\xE5"],
  nl: VOWEL_LATIN_GROUP,
  nn: [...VOWEL_LATIN_GROUP, "y", "\xE6", "\xF8", "\xE5"],
  pa: ["\u0A05", "\u0A06", "\u0A07", "\u0A08", "\u0A09", "\u0A0A", "\u0A0F", "\u0A10", "\u0A13", "\u0A14"],
  pl: ["a", "\u0105", "e", "\u0119", "i", "o", "\xF3", "u", "y"],
  pt: VOWEL_LATIN_GROUP,
  pt_PT: VOWEL_LATIN_GROUP,
  pt_BR: VOWEL_LATIN_GROUP,
  ro: [...VOWEL_LATIN_GROUP, "\u0103", "\xE2", "\xEE"],
  ru: ["\u0430", "\u0435", "\u0451", "\u0438", "\u043E", "\u0443", "\u044B", "\u044D", "\u044E", "\u044F"],
  sa: ["\u0905", "\u0906", "\u0907", "\u0908", "\u0909", "\u090A", "\u090B", "\u090F", "\u0910", "\u0913", "\u0914"],
  sk: [...VOWEL_LATIN_GROUP, "y", "\xE4", "\xF4"],
  sl: VOWEL_LATIN_GROUP,
  sq: [...VOWEL_LATIN_GROUP, "y", "\xEB"],
  sr: ["\u0430", "\u0435", "\u0438", "\u043E", "\u0443"],
  sr_Latn: VOWEL_LATIN_GROUP,
  sv: [...VOWEL_LATIN_GROUP, "y", "\xE5", "\xE4", "\xF6"],
  te: [
    "\u0C05", "\u0C06", "\u0C07", "\u0C08", "\u0C09", "\u0C0A", "\u0C0B",
    "\u0C0E", "\u0C0F", "\u0C10", "\u0C12", "\u0C13", "\u0C14",
  ],
  th: [
    "\u0E30", "\u0E32", "\u0E34", "\u0E35", "\u0E36", "\u0E37", "\u0E38",
    "\u0E39", "\u0E40", "\u0E41", "\u0E42", "\u0E43", "\u0E44", "\u0E45",
  ],
  uk: ["\u0430", "\u0435", "\u0454", "\u0438", "\u0456", "\u0457", "\u043E", "\u0443", "\u044E", "\u044F"],
  zu: VOWEL_LATIN_GROUP,
};
// src/features/consonants.ts
// Basic Latin consonants ("y" is not included); shared by reference by every
// language that adds nothing to them.
var CONSONANT_LATIN_GROUP = [
  "b", "c", "d", "f", "g", "h", "j", "k", "l", "m",
  "n", "p", "q", "r", "s", "t", "v", "w", "x", "z",
];
// Cyrillic consonants shared by reference by be, bg, ru and uk.
var CONSONANT_CYRILLIC_GROUP = [
  "\u0431", "\u0432", "\u0433", "\u0434", "\u0436", "\u0437", "\u0439",
  "\u043A", "\u043B", "\u043C", "\u043D", "\u043F", "\u0440", "\u0441",
  "\u0442", "\u0444", "\u0445", "\u0446", "\u0447", "\u0448", "\u0449",
];
// Devanagari consonants followed by three conjunct sequences; mr and sa each
// receive their own copy of this list.
var CONSONANT_DEVANAGARI_GROUP = [
  "\u0915", "\u0916", "\u0917", "\u0918", "\u0919", "\u091A", "\u091B",
  "\u091C", "\u091D", "\u091E", "\u091F", "\u0920", "\u0921", "\u0922",
  "\u0923", "\u0924", "\u0925", "\u0926", "\u0927", "\u0928", "\u092A",
  "\u092B", "\u092C", "\u092D", "\u092E", "\u092F", "\u0930", "\u0932",
  "\u0935", "\u0936", "\u0937", "\u0938", "\u0939", "\u0933",
  "\u0915\u094D\u200D\u0937",
  "\u0924\u094D\u0930",
  "\u091C\u094D\u091E",
];

/**
 * Consonant letters per language code.
 *
 * Keys are language/locale codes; values are arrays of strings. Most entries
 * are single letters, but some are multi-character sequences (the Hungarian
 * digraphs such as "cs", and the Devanagari conjuncts). Languages that use
 * only the basic Latin or the shared Cyrillic set point at the same group
 * array instance, so mutating one of those entries affects every language
 * that shares it.
 */
var consonants = {
  af: CONSONANT_LATIN_GROUP,
  as: [
    "\u0995", "\u0996", "\u0997", "\u0998", "\u0999", "\u099A", "\u099B",
    "\u099C", "\u099D", "\u099E", "\u099F", "\u09A0", "\u09A1", "\u09A2",
    "\u09A3", "\u09A4", "\u09A5", "\u09A6", "\u09A7", "\u09A8", "\u09AA",
    "\u09AB", "\u09AC", "\u09AD", "\u09AE", "\u09AF", "\u09B0", "\u09B2",
    "\u09B6", "\u09B7", "\u09B8", "\u09B9",
  ],
  be: CONSONANT_CYRILLIC_GROUP,
  bg: CONSONANT_CYRILLIC_GROUP,
  ca: CONSONANT_LATIN_GROUP,
  da: [...CONSONANT_LATIN_GROUP, "\xF0"],
  de: [...CONSONANT_LATIN_GROUP, "\xDF"],
  de_AT: [...CONSONANT_LATIN_GROUP, "\xDF"],
  de_CH: [...CONSONANT_LATIN_GROUP, "\xDF"],
  de_DE: [...CONSONANT_LATIN_GROUP, "\xDF"],
  en: CONSONANT_LATIN_GROUP,
  en_GB: CONSONANT_LATIN_GROUP,
  en_US: CONSONANT_LATIN_GROUP,
  eo: CONSONANT_LATIN_GROUP,
  es: [...CONSONANT_LATIN_GROUP, "\xF1"],
  et: CONSONANT_LATIN_GROUP,
  fr: [...CONSONANT_LATIN_GROUP, "\xE7"],
  gl: CONSONANT_LATIN_GROUP,
  hr: [...CONSONANT_LATIN_GROUP, "\u010D", "\u0107", "\u0111", "\u0161", "\u017E"],
  hu: [...CONSONANT_LATIN_GROUP, "cs", "dz", "gy", "ly", "ny", "sz", "ty", "zs"],
  id: CONSONANT_LATIN_GROUP,
  is: [...CONSONANT_LATIN_GROUP, "\xF0", "\xFE"],
  it: CONSONANT_LATIN_GROUP,
  kn: [
    "\u0C95", "\u0C96", "\u0C97", "\u0C98", "\u0C99", "\u0C9A", "\u0C9B",
    "\u0C9C", "\u0C9D", "\u0C9E", "\u0C9F", "\u0CA0", "\u0CA1", "\u0CA2",
    "\u0CA3", "\u0CA4", "\u0CA5", "\u0CA6", "\u0CA7", "\u0CA8", "\u0CAA",
    "\u0CAB", "\u0CAC", "\u0CAD", "\u0CAE", "\u0CAF", "\u0CB0", "\u0CB2",
    "\u0CB5", "\u0CB6", "\u0CB7", "\u0CB8", "\u0CB9",
  ],
  lt: [...CONSONANT_LATIN_GROUP, "\u010D", "\u0161", "\u017E"],
  lv: CONSONANT_LATIN_GROUP,
  mr: [...CONSONANT_DEVANAGARI_GROUP],
  mn: [
    "\u0431", "\u0432", "\u0433", "\u0434", "\u0436", "\u0437", "\u0439",
    "\u043A", "\u043B", "\u043C", "\u043D", "\u043F", "\u0440", "\u0441",
    "\u0442", "\u0444", "\u0445", "\u0446", "\u0447", "\u0448",
  ],
  nb: [...CONSONANT_LATIN_GROUP, "\xE7"],
  nn: [...CONSONANT_LATIN_GROUP, "\xE7"],
  nl: CONSONANT_LATIN_GROUP,
  pa: [
    "\u0A15", "\u0A16", "\u0A17", "\u0A18", "\u0A19", "\u0A1A", "\u0A1B",
    "\u0A1C", "\u0A1D", "\u0A1E", "\u0A1F", "\u0A20", "\u0A21", "\u0A22",
    "\u0A23", "\u0A24", "\u0A25", "\u0A26", "\u0A27", "\u0A28", "\u0A2A",
    "\u0A2B", "\u0A2C", "\u0A2D", "\u0A2E", "\u0A2F", "\u0A30", "\u0A32",
    "\u0A35", "\u0A36", "\u0A5B", "\u0A38", "\u0A39",
  ],
  pl: [...CONSONANT_LATIN_GROUP, "\u0142", "\u0144", "\u015B", "\u017A", "\u017C", "\u0107"],
  pt: CONSONANT_LATIN_GROUP,
  pt_BR: CONSONANT_LATIN_GROUP,
  pt_PT: CONSONANT_LATIN_GROUP,
  ro: [...CONSONANT_LATIN_GROUP, "\u021B", "\u0219"],
  ru: CONSONANT_CYRILLIC_GROUP,
  sa: [...CONSONANT_DEVANAGARI_GROUP],
  sk: [...CONSONANT_LATIN_GROUP, "\u010D", "\u010F", "\u013E", "\u0148", "\u0161", "\u0165", "\u017E"],
  sl: [...CONSONANT_LATIN_GROUP, "\u010D", "\u0161", "\u017E"],
  sq: [...CONSONANT_LATIN_GROUP, "\xE7"],
  // Serbian Cyrillic: the 25 non-vowel letters in alphabet order. "\u0452"
  // (dje) belongs here; "\u0443" (u) does not, because it is a vowel.
  sr: [
    "\u0431", "\u0432", "\u0433", "\u0434", "\u0452", "\u0436", "\u0437",
    "\u0458", "\u043A", "\u043B", "\u0459", "\u043C", "\u043D", "\u045A",
    "\u043F", "\u0440", "\u0441", "\u0442", "\u045B", "\u0444", "\u0445",
    "\u0446", "\u0447", "\u045F", "\u0448",
  ],
  sr_Latn: [...CONSONANT_LATIN_GROUP, "\u010D", "\u0107", "\u0111", "\u0161", "\u017E"],
  sv: CONSONANT_LATIN_GROUP,
  te: [
    "\u0C15", "\u0C16", "\u0C17", "\u0C18", "\u0C19", "\u0C1A", "\u0C1B",
    "\u0C1C", "\u0C1D", "\u0C1E", "\u0C1F", "\u0C20", "\u0C21", "\u0C22",
    "\u0C23", "\u0C24", "\u0C25", "\u0C26", "\u0C27", "\u0C28", "\u0C2A",
    "\u0C2B", "\u0C2C", "\u0C2D", "\u0C2E", "\u0C2F", "\u0C30", "\u0C32",
    "\u0C35", "\u0C36", "\u0C37", "\u0C38", "\u0C39",
  ],
  th: [
    "\u0E01", "\u0E02", "\u0E03", "\u0E04", "\u0E05", "\u0E06", "\u0E07",
    "\u0E08", "\u0E09", "\u0E0A", "\u0E0B", "\u0E0C", "\u0E0D", "\u0E0E",
    "\u0E0F", "\u0E10", "\u0E11", "\u0E12", "\u0E13", "\u0E14", "\u0E15",
    "\u0E16", "\u0E17", "\u0E18", "\u0E19", "\u0E1A", "\u0E1B", "\u0E1C",
    "\u0E1D", "\u0E1E", "\u0E1F", "\u0E20", "\u0E21", "\u0E22", "\u0E23",
    "\u0E25", "\u0E27", "\u0E28", "\u0E29", "\u0E2A", "\u0E2B", "\u0E2C",
    "\u0E2D", "\u0E2E",
  ],
  uk: CONSONANT_CYRILLIC_GROUP,
  zu: CONSONANT_LATIN_GROUP,
};
// src/features/contractions.ts
/**
 * Regular-expression source fragments describing contraction forms, keyed by
 * language code. Values are plain strings (not RegExp objects) intended to be
 * embedded in a larger pattern; `removePunctuation` joins all of them with "|".
 *
 * NOTE(review): `removePunctuation` places these fragments in a lookahead
 * immediately after an apostrophe. Only the `en` fragment is written as
 * "what follows the apostrophe"; the others describe the text around it.
 * Confirm they behave as intended there.
 */
var contractionsRegexSeq = {
  // it's, don't, you've, I'll
  en: "[tsd]\\b|ve\\b|ll\\b|re\\b",

  // c', j', n', l', d' (e.g., c'est, j'aime)
  fr: "\\b[cjntlsd]'\\b",

  // pa' (e.g., pa'lante)
  es: "\\b(pa')\\b",

  // l', un', da' (e.g., l'amico, un'amica)
  it: "\\b(l'|un'|da')\\b",

  // l', d', m', s' (e.g., l'amor, d'aigua)
  ca: "\\b(l'|d'|m'|s'|t'|n')\\b",

  // d', n', t' (e.g., d'aquela, n'hai)
  gl: "\\b(d'|n'|t'|v'|ll'|m'|s')\\b",

  // l-am, m-a, s-a
  ro: "\\b(l|m|s|\u0163i|d|c|a)-(am|ai|a|au|om|i|em|\u0163i)\\b",

  // 't huis, m'n, d'r, 'n
  nl: "\\b('t|m\u2019n|d\u2019r|'n)\\b",
};
// src/features/removePunctuation.ts
/**
 * Strips every character that is not a Unicode letter, a Unicode number, or
 * whitespace.
 *
 * With `ignoreContractions` set, an ASCII apostrophe (') is kept when the
 * text right after it matches one of the `contractionsRegexSeq` fragments
 * (for example the "t" in "don't"); every other apostrophe is removed along
 * with the rest of the punctuation. The check ignores letter case, so
 * "DON'T" and "don't" are treated alike.
 *
 * @param {string} text - Input text.
 * @param {boolean} [ignoreContractions=false] - Keep apostrophes that belong to contractions.
 * @returns {string} The text without punctuation.
 */
function removePunctuation(text, ignoreContractions = false) {
  if (ignoreContractions) {
    // Built on every call so that changes to the exported table take effect.
    const contractions = Object.values(contractionsRegexSeq).join("|");
    if (contractions) {
      // "i": the fragments are written in lowercase, but the input may be
      // capitalised or upper-case.
      const strayApostrophe = new RegExp(`'(?!${contractions})`, "gi");
      return text
        .replace(strayApostrophe, "")
        .replace(/[^\p{L}\p{N}\s']/gu, "");
    }
  }
  return text.replace(/[^\p{L}\p{N}\s]/gu, "");
}
// src/features/parsers.ts
/**
 * Splits text into lowercase words on runs of whitespace.
 *
 * Empty tokens are dropped, so empty or whitespace-only input yields `[]`, and
 * leading or trailing whitespace does not produce empty-string entries.
 *
 * @param {string} text - Input text.
 * @param {boolean} [isRemovePunctuation=true] - When true, the text is first
 *   passed through `removePunctuation(text, true)`.
 * @returns {string[]} The words, lowercased, in order of appearance.
 */
function getWords(text, isRemovePunctuation = true) {
  const cleaned = isRemovePunctuation ? removePunctuation(text, true) : text;
  return cleaned
    .toLowerCase()
    .split(/\s+/)
    .filter((word) => word !== "");
}
/**
 * Splits text into sentence fragments.
 *
 * A fragment is a run of characters up to the next terminator or line break,
 * followed by any run of terminators and then any line breaks. Terminators
 * are ".", "!", "?" and their CJK counterparts (ideographic full stop,
 * full-width "!" and "?"). Fragments keep their original spacing and trailing
 * line breaks; fragments consisting only of whitespace are dropped.
 *
 * @param {string} text - Input text.
 * @returns {string[]} Sentence fragments in order; `[]` when there are none.
 */
function getSentences(text) {
  // \u3002 = ideographic full stop, \uFF01 = full-width "!", \uFF1F = full-width "?".
  const fragments =
    text.match(/[^.!?\u3002\uFF01\uFF1F\n\r]+[.!?\u3002\uFF01\uFF1F]*[\n\r]*/gu) || [];
  return fragments.filter((fragment) => fragment.trim() !== "");
}
// Annotate the CommonJS export names for ESM import in node:
// The `0 &&` guard short-circuits, so this assignment never runs. The object
// literal is here only so that the export names appear in the source text.
0 && (module.exports = {
consonants,
contractionsRegexSeq,
getSentences,
getWords,
languages,
removePunctuation,
vowels
});