UNPKG

@erkanarslan/turkish-inflection

Version:
289 lines (288 loc) 9.29 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.inflectWord = exports.inflectText = exports.inflect = exports.addLenitionExceptions = void 0;
var data_1 = require("./data");
var data_2 = require("./data");
Object.defineProperty(exports, "addLenitionExceptions", { enumerable: true, get: function () { return data_2.addLenitionExceptions; } });

// Suffixes accepted after the "--" markup, in their front-vowel (e / i) spelling.
var suffixes = ['e', 'in', 'i', 'den', 'de', 'ler', 'mi'];
var vowels = ['a', 'e', 'ı', 'i', 'o', 'ö', 'u', 'ü'];
var nonContinuantFortisConsonants = ['p', 'ç', 't', 'k'];
var fortisConsonants = ['p', 'ç', 't', 'k', 'f', 'h', 's', 'ş'];

// transformMap[suffixType][lastVowelOfWord] -> vowel the suffix takes.
var transformMap = {
    i: { a: 'ı', e: 'i', ı: 'ı', i: 'i', o: 'u', ö: 'ü', u: 'u', ü: 'ü' },
    e: { a: 'a', e: 'e', ı: 'a', i: 'e', o: 'a', ö: 'e', u: 'a', ü: 'e' },
};

// Softened counterpart of each final consonant. The "k2" / "K2" keys are
// looked up when the 'k' is preceded by a consonant (k -> g instead of k -> ğ).
var lenitionMap = { p: 'b', ç: 'c', t: 'd', k: 'ğ', k2: 'g', P: "B", Ç: 'C', T: 'D', K: 'Ğ', K2: 'G' };

// Back-vowel spellings of the suffixes (e -> a, i -> ı), plus the capital
// dotted-İ spellings, which the case-insensitive flag does not match to 'i'.
var alternatives = suffixes.map(function (s) { return s.replace('e', 'a').replace('i', 'ı'); })
    .concat(['İN', 'İ', 'Mİ']);
var markupPattern = new RegExp("--(" + suffixes.concat(alternatives).join("|") + ")", 'i');
var vowelPattern = new RegExp(vowels.join("|"), 'gi');
var wordPattern = /(?:[a-z]|[A-Z]|ç|Ç|ğ|Ğ|ı|İ|ö|Ö|ş|Ş|ü|Ü)+/;
var numberPattern = /\d+/;

/**
 * Convenience entry point that dispatches on the type of the second argument.
 *
 * @param text   A single word (when `option` is a string) or a text containing "--suffix" markup.
 * @param option A suffix string, or an optional interpolation map for `inflectText`.
 * @returns The inflected word or text.
 */
function inflect(text, option) {
    if (typeof option == 'string') {
        return inflectWord(text, option);
    }
    return inflectText(text, option);
}
exports.inflect = inflect;

/**
 * Replaces "{{key}}" placeholders with the values from `interpolation`, then
 * resolves every "--suffix" markup by inflecting the word in front of it.
 *
 * @param text          Text containing placeholders and/or "--suffix" markup.
 * @param interpolation Optional map of placeholder name -> replacement value.
 * @returns The text with placeholders substituted and markup resolved.
 */
function inflectText(text, interpolation) {
    if (interpolation) {
        for (var key in interpolation) {
            var marker = "{{" + key + "}}";
            // split/join substitutes every occurrence in one pass. It inserts the
            // value literally (no "$&"-style replacement patterns) and cannot loop
            // forever when the value itself contains the marker.
            text = text.split(marker).join(String(interpolation[key]));
        }
    }
    // Offset from which markup is searched. It only becomes non-zero when the
    // text starts with a markup that has no word in front of it.
    var searchFrom = 0;
    while (true) {
        var match = text.slice(searchFrom).match(markupPattern);
        if (!match)
            break;
        var index = searchFrom + match.index;
        var suffix = match[1];
        if (index === 0) {
            // Nothing precedes the markup, so there is no word to inflect. Leave it
            // untouched and keep scanning after it instead of re-matching it forever.
            searchFrom = match[0].length;
            continue;
        }
        var parts = splitLastWord(text.slice(0, index));
        // "+ 2" skips the two dashes that introduce the suffix.
        text = parts[0] + inflectWord(parts[1], suffix) + text.slice(index + suffix.length + 2);
    }
    return text;
}
exports.inflectText = inflectText;

/**
 * Appends a suffix to a word, adapting the suffix vowel to the word's last
 * vowel and applying assimilation, buffer letters and consonant lenition.
 *
 * @param word   Word to inflect. Its original casing is kept in the result.
 * @param suffix One of the supported suffixes, in any vowel variant. If it
 *               contains an upper-case vowel the produced suffix is upper-cased.
 * @returns The word followed by the transformed suffix.
 */
function inflectWord(word, suffix) {
    var upperCase = isUpperCase(suffix);
    var normalizedSuffix = normalizeSuffix(suffix);
    var normalizedWord = normalizeWord(word);
    // Transform suffix
    var suffixType = getSuffixType(normalizedSuffix);
    var lastVowel = getLastVowel(normalizedWord);
    var suffixVowel = transformMap[suffixType][lastVowel];
    suffix = normalizedSuffix.replace(suffixType, suffixVowel);
    // Fortitive assimilation (Ex: market + de -> markette)
    suffix = applyFortitiveAssimilation(normalizedWord, suffix);
    suffix = addBufferLetter(normalizedWord, suffix, normalizedSuffix);
    // Transform word
    // Consonant lenition (Ex: bıçak + ı -> bıçağı)
    word = applyLenition(word, normalizedSuffix);
    if (upperCase) {
        suffix = toUpperCase(suffix);
    }
    return word + suffix;
}
exports.inflectWord = inflectWord;

/**
 * Checks and transforms suffix for fortitive assimilation.
 *
 * Ex: market + de -> markette
 *
 * @param word   Normalized word.
 * @param suffix Suffix with its vowel already adapted.
 * @returns Transformed suffix
 */
function applyFortitiveAssimilation(word, suffix) {
    var lastLetter = word[word.length - 1];
    // A trailing space means the suffix is written as a separate word.
    var suffixIsASeparateWord = lastLetter === ' ';
    if (suffix.startsWith('d') && !suffixIsASeparateWord && wordEndsWith(fortisConsonants, word)) {
        // Convert first letter from 'd' to 't'
        return suffix.replace('d', 't');
    }
    return suffix;
}

/**
 * Checks and transforms word for consonant lenition.
 *
 * Ex: bıçak + ı -> bıçağı
 *
 * @param word   Word in its original casing.
 * @param suffix Normalized suffix.
 * @returns Transformed word
 */
function applyLenition(word, suffix) {
    // An empty word has no final consonant to soften.
    if (!word) {
        return word;
    }
    var last = word[word.length - 1];
    if (!startsWithVowel(suffix) || !isNonContinuantFortisConsonant(last)) {
        return word;
    }
    var vowelCount = countVowels(word);
    // NOTE(review): the lookup uses the word exactly as passed in; presumably the
    // exception list holds lower-case entries - confirm against ./data.
    var isException = data_1.lenitionExceptions.includes(word);
    /* When to apply lenition?
     * - If word has 2+ syllables, lenition is applied.
     * - If word has only 1 syllable, lenition is NOT applied.
     * - There are exceptions to both rules.
     */
    var shouldApplyLenition = vowelCount === 1 ? isException : (vowelCount > 1 && !isException);
    if (!shouldApplyLenition) {
        return word;
    }
    // If last letter is 'k' and previous letter is a consonant, 'k' becomes 'g' instead of 'ğ'.
    var previous = word[word.length - 2];
    if (isConsonant(previous)) {
        if (last === 'k')
            last = 'k2';
        if (last === 'K')
            last = 'K2';
    }
    return word.slice(0, word.length - 1) + lenitionMap[last];
}

/**
 * Counts the vowels of a word, treating capital 'İ' and 'I' as vowels too.
 *
 * @param word Word in any casing.
 * @returns Number of vowel letters in the word.
 */
function countVowels(word) {
    var letters = toTurkishLowerCase(word);
    var count = 0;
    for (var i = 0; i < letters.length; i++) {
        if (vowels.includes(letters[i])) {
            count++;
        }
    }
    return count;
}

/**
 * Lower-cases text, mapping every 'İ' to 'i' and every 'I' to 'ı' first so
 * that the default case conversion does not turn dotless capitals into 'i'.
 *
 * @param text Text in any casing.
 * @returns Lower-cased text.
 */
function toTurkishLowerCase(text) {
    return text.replace(/İ/g, 'i').replace(/I/g, 'ı').toLowerCase();
}

/**
 * Checks and adds buffer letter to suffix if needed.
 *
 * Ex: araba + a -> arabaya
 * Ex: araba + ın -> arabanın
 *
 * @param word             Normalized word.
 * @param suffix           Suffix with its vowel already adapted.
 * @param normalizedSuffix Suffix in its normalized (e / i) spelling.
 * @returns Transformed suffix
 */
function addBufferLetter(word, suffix, normalizedSuffix) {
    if (!startsWithVowel(normalizedSuffix) || !wordEndsWith(vowels, word)) {
        return suffix;
    }
    // The single-vowel suffixes take 'y'; the remaining vowel-initial suffix takes 'n'.
    var buffer = (normalizedSuffix === 'i' || normalizedSuffix === 'e') ? 'y' : 'n';
    return buffer + suffix;
}

/**
 * Finds the last vowel of a normalized word.
 *
 * @param word Normalized (lower-case) word.
 * @returns The last vowel, or "e" when the word contains none.
 */
function getLastVowel(word) {
    for (var i = word.length - 1; i >= 0; i--) {
        if (vowels.includes(word[i])) {
            return word[i];
        }
    }
    /* If word has no vowel, we can read the letters by putting an "e" to their end.
     * Ex: "PTT" reads "petete"
     */
    return "e";
}

/**
 * Produces the lower-case form of a word that the suffix rules operate on.
 * A word ending in a number is replaced by the last spoken word of that
 * number, followed by whatever trailed the digits.
 *
 * @param word Word in its original form.
 * @returns Normalized word.
 */
function normalizeWord(word) {
    // If word is numeric, convert the number to text and use it to get vowel
    if (isWordNumeric(word)) {
        var num = getNumberFromWord(word);
        var lastPart = word.slice(word.lastIndexOf(num) + num.length);
        return getLastWordOfNumber(num) + lastPart;
    }
    return toTurkishLowerCase(word);
}

/**
 * Tells whether the last letter-or-digit character of the word is a digit.
 *
 * @param word Word to inspect.
 * @returns true when a digit is met before any letter, scanning from the end.
 */
function isWordNumeric(word) {
    for (var i = word.length - 1; i >= 0; i--) {
        var letter = word[i];
        if (wordPattern.test(letter)) {
            return false;
        }
        if (numberPattern.test(letter)) {
            return true;
        }
    }
    return false;
}

/**
 * Extracts the last run of digits from a word.
 *
 * @param word Word known to contain at least one digit.
 * @returns The last digit run, in its original order.
 */
function getNumberFromWord(word) {
    // Matching on the reversed string makes the first match the last digit run.
    var reversed = word.split("").reverse().join("");
    var match = reversed.match(numberPattern);
    return match[0].split("").reverse().join("");
}

/**
 * Returns the last word of a number's spoken form.
 *
 * @param num String of digits.
 * @returns The word that ends the number when read aloud.
 */
function getLastWordOfNumber(num) {
    // TODO Add million, billion, etc.
    if (num.length > 3 && num.endsWith("000"))
        return "bin";
    if (num.length > 2 && num.endsWith("00"))
        return "yüz";
    // Ordered so that the two-digit endings are tested before single digits.
    var endings = [
        ["90", "doksan"], ["80", "seksen"], ["70", "yetmiş"], ["60", "altmış"], ["50", "elli"],
        ["40", "kırk"], ["30", "otuz"], ["20", "yirmi"], ["10", "on"],
        ["9", "dokuz"], ["8", "sekiz"], ["7", "yedi"], ["6", "altı"], ["5", "beş"],
        ["4", "dört"], ["3", "üç"], ["2", "iki"], ["1", "bir"],
    ];
    for (var i = 0; i < endings.length; i++) {
        if (num.endsWith(endings[i][0])) {
            return endings[i][1];
        }
    }
    return "sıfır";
}

/**
 * Reduces a suffix to its lower-case front-vowel spelling (a -> e, ı -> i).
 *
 * @param suffix Suffix in any vowel variant and casing.
 * @returns Normalized suffix.
 */
function normalizeSuffix(suffix) {
    return suffix.replace('İ', 'i')
        .toLowerCase()
        .replace("a", "e")
        .replace("ı", "i");
}

/**
 * @param suffix Normalized suffix.
 * @returns 'i' when the suffix contains an 'i', otherwise 'e'.
 */
function getSuffixType(suffix) {
    return suffix.includes('i') ? 'i' : 'e';
}

/** @returns true when the first character of `text` is a lower-case vowel. */
function startsWithVowel(text) {
    return vowels.includes(text[0]);
}

/** @returns true when `letter` is one of p, ç, t, k in either case. */
function isNonContinuantFortisConsonant(letter) {
    return nonContinuantFortisConsonants.includes(letter.toLowerCase());
}

/** @returns true for every character that `isVowel` rejects. */
function isConsonant(letter) {
    return !isVowel(letter);
}

/** @returns true when the lower-cased `letter` is in the vowel list. */
function isVowel(letter) {
    return vowels.includes(letter.toLowerCase());
}

/** @returns true when the suffix contains one of the capital vowels A, E, I, İ. */
function isUpperCase(suffix) {
    return /A|E|I|İ/.test(suffix);
}

/**
 * Splits text into everything before its last word, and that last word.
 * The final character always belongs to the word, even when it is not a
 * letter or digit (for example an apostrophe or a space before the markup).
 *
 * @param text Text that precedes a "--suffix" markup.
 * @returns Two-element array: [text before the word, the word].
 */
function splitLastWord(text) {
    var i;
    for (i = text.length - 2; i >= 0; i--) {
        if (!wordPattern.test(text[i]) && !numberPattern.test(text[i])) {
            break;
        }
    }
    // Start the word AFTER the separator found at i. Keeping the separator in
    // the word made the lenition exception lookup miss (" kap" vs "kap").
    var start = Math.max(i + 1, 0);
    return [text.slice(0, start), text.slice(start)];
}

/**
 * Tests the last letter of a word against a list of letters, skipping any
 * trailing characters that are not letters.
 *
 * @param letters Lower-case letters to test against.
 * @param word    Word to inspect.
 * @returns true when the word's last letter is in `letters`.
 */
function wordEndsWith(letters, word) {
    // Get last word character
    var lastWordChar = '';
    for (var i = word.length - 1; i >= 0; i--) {
        if (wordPattern.test(word[i])) {
            lastWordChar = word[i];
            break;
        }
    }
    return letters.includes(lastWordChar.toLowerCase());
}

/**
 * Upper-cases a suffix, mapping its first 'i' to dotted 'İ' beforehand.
 *
 * @param word Lower-case suffix.
 * @returns Upper-cased suffix.
 */
function toUpperCase(word) {
    return word.replace('i', 'İ').toUpperCase();
}