UNPKG

@lunarisapp/language

Version:

A utility library for core linguistic breakdown: vowels, consonants, words, and sentences.

691 lines (683 loc) 13.4 kB
"use strict"; var __defProp = Object.defineProperty; var __getOwnPropDesc = Object.getOwnPropertyDescriptor; var __getOwnPropNames = Object.getOwnPropertyNames; var __hasOwnProp = Object.prototype.hasOwnProperty; var __export = (target, all) => { for (var name in all) __defProp(target, name, { get: all[name], enumerable: true }); }; var __copyProps = (to, from, except, desc) => { if (from && typeof from === "object" || typeof from === "function") { for (let key of __getOwnPropNames(from)) if (!__hasOwnProp.call(to, key) && key !== except) __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); } return to; }; var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // src/index.ts var index_exports = {}; __export(index_exports, { consonants: () => consonants, contractionsRegexSeq: () => contractionsRegexSeq, getSentences: () => getSentences, getWords: () => getWords, languages: () => languages, removePunctuation: () => removePunctuation, vowels: () => vowels }); module.exports = __toCommonJS(index_exports); // src/features/languages.ts var languages = [ "af", "as", "be", "bg", "ca", "da", "de", "de_DE", "de_CH", "de_AT", "en", "en_US", "en_GB", "eo", "es", "et", "fr", "gl", "hr", "hu", "id", "is", "it", "kn", "lt", "lv", "mn", "mr", "nb", "nl", "nn", "pa", "pl", "pt", "pt_PT", "pt_BR", "ro", "ru", "sa", "sk", "sl", "sq", "sr", "sr_Latn", "sv", "te", "th", "uk", "zu" ]; // src/features/vowels.ts var VOWEL_LATIN_GROUP = ["a", "e", "i", "o", "u"]; var VOWEL_GERMANIC_UMLAUTS_GROUP = [ ...VOWEL_LATIN_GROUP, "\xE4", "\xF6", "\xFC", "y" ]; var vowels = { af: VOWEL_LATIN_GROUP, as: ["\u0985", "\u0986", "\u0987", "\u0988", "\u0989", "\u098A", "\u098B", "\u098F", "\u0990", "\u0993", "\u0994"], be: ["\u0430", "\u0435", "\u0451", "\u0456", "\u043E", "\u0443", "\u044B", "\u044D", "\u044E", "\u044F"], bg: ["\u0430", "\u044A", "\u043E", "\u0443", "\u0435", "\u0438", "\u044E", "\u044F"], ca: VOWEL_LATIN_GROUP, da: [...VOWEL_LATIN_GROUP, "y", "\xE6", "\xF8", "\xE5"], de: VOWEL_GERMANIC_UMLAUTS_GROUP, de_DE: VOWEL_GERMANIC_UMLAUTS_GROUP, de_CH: VOWEL_GERMANIC_UMLAUTS_GROUP, de_AT: VOWEL_GERMANIC_UMLAUTS_GROUP, en: VOWEL_LATIN_GROUP, en_US: VOWEL_LATIN_GROUP, en_GB: VOWEL_LATIN_GROUP, eo: VOWEL_LATIN_GROUP, es: [...VOWEL_LATIN_GROUP, "\xE1", "\xE9", "\xED", "\xF3", "\xFA", "\xFC"], et: [...VOWEL_LATIN_GROUP, "\xF5", "\xE4", "\xF6", "\xFC"], fr: [ "a", "e", "i", "o", "u", "y", "\xE0", "\xE2", "\xE6", "\xE8", "\xE9", "\xEA", "\xEB", "\xEE", "\xEF", "\xF4", "\u0153", "\xF9", "\xFB", "\xFC" ], gl: VOWEL_LATIN_GROUP, hr: VOWEL_LATIN_GROUP, hu: [...VOWEL_LATIN_GROUP, "\xF6", "\xFC"], id: VOWEL_LATIN_GROUP, is: [ "a", "e", "i", "o", "u", "y", "\xE1", "\xF0", "\xE9", "\xED", "\xF3", "\xFA", "\xFD", "\xE6", "\xF6" ], it: VOWEL_LATIN_GROUP, kn: [ "\u0C85", "\u0C86", "\u0C87", "\u0C88", "\u0C89", "\u0C8A", "\u0C8B", "\u0C8E", "\u0C8F", "\u0C90", "\u0C92", "\u0C93", "\u0C94" ], lt: [...VOWEL_LATIN_GROUP, "\u0105", "\u0119", "\u0117", "\u012F", "\u0173", "\u016B"], lv: VOWEL_LATIN_GROUP, mn: ["\u0430", "\u0435", "\u0438", "\u043E", "\u0443", "\u04AF", "\u04E9", "\u044D", "\u044F", "\u0451", "\u044E"], mr: ["\u0905", "\u0906", "\u0907", "\u0908", "\u0909", "\u090A", "\u090B", "\u090F", "\u0910", "\u0913", "\u0914"], nb: [...VOWEL_LATIN_GROUP, "y", "\xE6", "\xF8", "\xE5"], nl: VOWEL_LATIN_GROUP, nn: [...VOWEL_LATIN_GROUP, "y", "\xE6", "\xF8", "\xE5"], pa: ["\u0A05", "\u0A06", "\u0A07", "\u0A08", "\u0A09", "\u0A0A", "\u0A0F", "\u0A10", "\u0A13", "\u0A14"], pl: ["a", "\u0105", "e", "\u0119", "i", "o", "\xF3", "u", "y"], pt: VOWEL_LATIN_GROUP, pt_PT: VOWEL_LATIN_GROUP, pt_BR: VOWEL_LATIN_GROUP, ro: [...VOWEL_LATIN_GROUP, "\u0103", "\xE2", "\xEE"], ru: ["\u0430", "\u0435", "\u0451", "\u0438", "\u043E", "\u0443", "\u044B", "\u044D", "\u044E", "\u044F"], sa: ["\u0905", "\u0906", "\u0907", "\u0908", "\u0909", "\u090A", "\u090B", "\u090F", "\u0910", "\u0913", "\u0914"], sk: [...VOWEL_LATIN_GROUP, "y", "\xE4", "\xF4"], sl: VOWEL_LATIN_GROUP, sq: [...VOWEL_LATIN_GROUP, "y", "\xEB"], sr: ["\u0430", "\u0435", "\u0438", "\u043E", "\u0443"], sr_Latn: VOWEL_LATIN_GROUP, sv: [...VOWEL_LATIN_GROUP, "y", "\xE5", "\xE4", "\xF6"], te: [ "\u0C05", "\u0C06", "\u0C07", "\u0C08", "\u0C09", "\u0C0A", "\u0C0B", "\u0C0E", "\u0C0F", "\u0C10", "\u0C12", "\u0C13", "\u0C14" ], th: [ "\u0E30", "\u0E32", "\u0E34", "\u0E35", "\u0E36", "\u0E37", "\u0E38", "\u0E39", "\u0E40", "\u0E41", "\u0E42", "\u0E43", "\u0E44", "\u0E45" ], uk: ["\u0430", "\u0435", "\u0454", "\u0438", "\u0456", "\u0457", "\u043E", "\u0443", "\u044E", "\u044F"], zu: VOWEL_LATIN_GROUP }; // src/features/consonants.ts var CONSONANT_LATIN_GROUP = [ "b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z" ]; var CONSONANT_CYRILLIC_GROUP = [ "\u0431", "\u0432", "\u0433", "\u0434", "\u0436", "\u0437", "\u0439", "\u043A", "\u043B", "\u043C", "\u043D", "\u043F", "\u0440", "\u0441", "\u0442", "\u0444", "\u0445", "\u0446", "\u0447", "\u0448", "\u0449" ]; var consonants = { af: CONSONANT_LATIN_GROUP, as: [ "\u0995", "\u0996", "\u0997", "\u0998", "\u0999", "\u099A", "\u099B", "\u099C", "\u099D", "\u099E", "\u099F", "\u09A0", "\u09A1", "\u09A2", "\u09A3", "\u09A4", "\u09A5", "\u09A6", "\u09A7", "\u09A8", "\u09AA", "\u09AB", "\u09AC", "\u09AD", "\u09AE", "\u09AF", "\u09B0", "\u09B2", "\u09B6", "\u09B7", "\u09B8", "\u09B9" ], be: CONSONANT_CYRILLIC_GROUP, bg: CONSONANT_CYRILLIC_GROUP, ca: CONSONANT_LATIN_GROUP, da: [...CONSONANT_LATIN_GROUP, "\xF0"], de: [...CONSONANT_LATIN_GROUP, "\xDF"], de_AT: [...CONSONANT_LATIN_GROUP, "\xDF"], de_CH: [...CONSONANT_LATIN_GROUP, "\xDF"], de_DE: [...CONSONANT_LATIN_GROUP, "\xDF"], en: CONSONANT_LATIN_GROUP, en_GB: CONSONANT_LATIN_GROUP, en_US: CONSONANT_LATIN_GROUP, eo: CONSONANT_LATIN_GROUP, es: [...CONSONANT_LATIN_GROUP, "\xF1"], et: CONSONANT_LATIN_GROUP, fr: [...CONSONANT_LATIN_GROUP, "\xE7"], gl: CONSONANT_LATIN_GROUP, hr: [...CONSONANT_LATIN_GROUP, "\u010D", "\u0107", "\u0111", "\u0161", "\u017E"], hu: [ ...CONSONANT_LATIN_GROUP, "cs", "dz", "gy", "ly", "ny", "sz", "ty", "zs" ], id: CONSONANT_LATIN_GROUP, is: [...CONSONANT_LATIN_GROUP, "\xF0", "\xFE"], it: CONSONANT_LATIN_GROUP, kn: [ "\u0C95", "\u0C96", "\u0C97", "\u0C98", "\u0C99", "\u0C9A", "\u0C9B", "\u0C9C", "\u0C9D", "\u0C9E", "\u0C9F", "\u0CA0", "\u0CA1", "\u0CA2", "\u0CA3", "\u0CA4", "\u0CA5", "\u0CA6", "\u0CA7", "\u0CA8", "\u0CAA", "\u0CAB", "\u0CAC", "\u0CAD", "\u0CAE", "\u0CAF", "\u0CB0", "\u0CB2", "\u0CB5", "\u0CB6", "\u0CB7", "\u0CB8", "\u0CB9" ], lt: [...CONSONANT_LATIN_GROUP, "\u010D", "\u0161", "\u017E"], lv: CONSONANT_LATIN_GROUP, mr: [ "\u0915", "\u0916", "\u0917", "\u0918", "\u0919", "\u091A", "\u091B", "\u091C", "\u091D", "\u091E", "\u091F", "\u0920", "\u0921", "\u0922", "\u0923", "\u0924", "\u0925", "\u0926", "\u0927", "\u0928", "\u092A", "\u092B", "\u092C", "\u092D", "\u092E", "\u092F", "\u0930", "\u0932", "\u0935", "\u0936", "\u0937", "\u0938", "\u0939", "\u0933", "\u0915\u094D\u200D\u0937", "\u0924\u094D\u0930", "\u091C\u094D\u091E" ], mn: [ "\u0431", "\u0432", "\u0433", "\u0434", "\u0436", "\u0437", "\u0439", "\u043A", "\u043B", "\u043C", "\u043D", "\u043F", "\u0440", "\u0441", "\u0442", "\u0444", "\u0445", "\u0446", "\u0447", "\u0448" ], nb: [...CONSONANT_LATIN_GROUP, "\xE7"], nn: [...CONSONANT_LATIN_GROUP, "\xE7"], nl: CONSONANT_LATIN_GROUP, pa: [ "\u0A15", "\u0A16", "\u0A17", "\u0A18", "\u0A19", "\u0A1A", "\u0A1B", "\u0A1C", "\u0A1D", "\u0A1E", "\u0A1F", "\u0A20", "\u0A21", "\u0A22", "\u0A23", "\u0A24", "\u0A25", "\u0A26", "\u0A27", "\u0A28", "\u0A2A", "\u0A2B", "\u0A2C", "\u0A2D", "\u0A2E", "\u0A2F", "\u0A30", "\u0A32", "\u0A35", "\u0A36", "\u0A5B", "\u0A38", "\u0A39" ], pl: [...CONSONANT_LATIN_GROUP, "\u0142", "\u0144", "\u015B", "\u017A", "\u017C", "\u0107"], pt: CONSONANT_LATIN_GROUP, pt_BR: CONSONANT_LATIN_GROUP, pt_PT: CONSONANT_LATIN_GROUP, ro: [...CONSONANT_LATIN_GROUP, "\u021B", "\u0219"], ru: CONSONANT_CYRILLIC_GROUP, sa: [ "\u0915", "\u0916", "\u0917", "\u0918", "\u0919", "\u091A", "\u091B", "\u091C", "\u091D", "\u091E", "\u091F", "\u0920", "\u0921", "\u0922", "\u0923", "\u0924", "\u0925", "\u0926", "\u0927", "\u0928", "\u092A", "\u092B", "\u092C", "\u092D", "\u092E", "\u092F", "\u0930", "\u0932", "\u0935", "\u0936", "\u0937", "\u0938", "\u0939", "\u0933", "\u0915\u094D\u200D\u0937", "\u0924\u094D\u0930", "\u091C\u094D\u091E" ], sk: [...CONSONANT_LATIN_GROUP, "\u010D", "\u010F", "\u013E", "\u0148", "\u0161", "\u0165", "\u017E"], sl: [...CONSONANT_LATIN_GROUP, "\u010D", "\u0161", "\u017E"], sq: [...CONSONANT_LATIN_GROUP, "\xE7"], sr: [ "\u0431", "\u0432", "\u0433", "\u0434", "\u0436", "\u0437", "\u0458", "\u043A", "\u043B", "\u0459", "\u043C", "\u043D", "\u045A", "\u043F", "\u0440", "\u0441", "\u0442", "\u045B", "\u0443", "\u0444", "\u0445", "\u0446", "\u0447", "\u045F", "\u0448" ], sr_Latn: [...CONSONANT_LATIN_GROUP, "\u010D", "\u0107", "\u0111", "\u0161", "\u017E"], sv: CONSONANT_LATIN_GROUP, te: [ "\u0C15", "\u0C16", "\u0C17", "\u0C18", "\u0C19", "\u0C1A", "\u0C1B", "\u0C1C", "\u0C1D", "\u0C1E", "\u0C1F", "\u0C20", "\u0C21", "\u0C22", "\u0C23", "\u0C24", "\u0C25", "\u0C26", "\u0C27", "\u0C28", "\u0C2A", "\u0C2B", "\u0C2C", "\u0C2D", "\u0C2E", "\u0C2F", "\u0C30", "\u0C32", "\u0C35", "\u0C36", "\u0C37", "\u0C38", "\u0C39" ], th: [ "\u0E01", "\u0E02", "\u0E03", "\u0E04", "\u0E05", "\u0E06", "\u0E07", "\u0E08", "\u0E09", "\u0E0A", "\u0E0B", "\u0E0C", "\u0E0D", "\u0E0E", "\u0E0F", "\u0E10", "\u0E11", "\u0E12", "\u0E13", "\u0E14", "\u0E15", "\u0E16", "\u0E17", "\u0E18", "\u0E19", "\u0E1A", "\u0E1B", "\u0E1C", "\u0E1D", "\u0E1E", "\u0E1F", "\u0E20", "\u0E21", "\u0E22", "\u0E23", "\u0E25", "\u0E27", "\u0E28", "\u0E29", "\u0E2A", "\u0E2B", "\u0E2C", "\u0E2D", "\u0E2E" ], uk: CONSONANT_CYRILLIC_GROUP, zu: CONSONANT_LATIN_GROUP }; // src/features/contractions.ts var contractionsRegexSeq = { en: "[tsd]\\b|ve\\b|ll\\b|re\\b", // it's, don't, you've, I'll fr: "\\b[cjntlsd]'\\b", // c', j', n', l', d' (e.g., c'est, j'aime) es: "\\b(pa')\\b", // pa' (e.g., pa'lante) it: "\\b(l'|un'|da')\\b", // l', un', da' (e.g., l'amico, un'amica) ca: "\\b(l'|d'|m'|s'|t'|n')\\b", // l', d', m', s' (e.g., l'amor, d'aigua) gl: "\\b(d'|n'|t'|v'|ll'|m'|s')\\b", // d', n', t' (e.g., d'aquela, n'hai) ro: "\\b(l|m|s|\u0163i|d|c|a)-(am|ai|a|au|om|i|em|\u0163i)\\b", // l-am, m-a, s-a nl: "\\b('t|m\u2019n|d\u2019r|'n)\\b" // 't huis, m'n, d'r, 'n }; // src/features/removePunctuation.ts function removePunctuation(text, ignoreContractions = false) { if (ignoreContractions) { const contractions = Object.values(contractionsRegexSeq).join("|"); if (contractions) { const antiContractionsRegex = new RegExp(`'(?!${contractions})`, "g"); text = text.replace(antiContractionsRegex, '"'); return text.replace(/[^\p{L}\p{N}\s']/gu, ""); } } return text.replace(/[^\p{L}\p{N}\s]/gu, ""); } // src/features/parsers.ts function getWords(text, isRemovePunctuation = true) { if (isRemovePunctuation) { text = removePunctuation(text, true); } return text.toLowerCase().split(/\s+/g); } function getSentences(text) { return text.match(/[^.!?。!?\n\r]+[.!?。!?]*[\n\r]*/gu) || []; } // Annotate the CommonJS export names for ESM import in node: 0 && (module.exports = { consonants, contractionsRegexSeq, getSentences, getWords, languages, removePunctuation, vowels });