@erkanarslan/turkish-inflection
Version:
A library to inflect Turkish words
289 lines (288 loc) • 9.29 kB
JavaScript
"use strict";
// Flags the exports object with `__esModule` (presumably emitted by a TypeScript/Babel-style
// compiler — confirm against the build setup).
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare every export slot; the real values are assigned further down the file.
exports.inflectWord = exports.inflectText = exports.inflect = exports.addLenitionExceptions = void 0;
// Both variables hold the same "./data" module: data_1 is read for `lenitionExceptions`,
// data_2 backs the re-exported `addLenitionExceptions` getter below.
var data_1 = require("./data");
var data_2 = require("./data");
Object.defineProperty(exports, "addLenitionExceptions", { enumerable: true, get: function () { return data_2.addLenitionExceptions; } });
// Base spellings of the supported suffixes. normalizeSuffix() maps 'a' -> 'e' and 'ı' -> 'i',
// so other spellings of a suffix are reduced to one of these.
var suffixes = ['e', 'in', 'i', 'den', 'de', 'ler', 'mi'];
// Lower-case Turkish vowels.
var vowels = ['a', 'e', 'ı', 'i', 'o', 'ö', 'u', 'ü'];
// Final consonants that applyLenition() may soften (see lenitionMap).
var nonContinuantFortisConsonants = ['p', 'ç', 't', 'k'];
// Final consonants after which applyFortitiveAssimilation() turns a leading 'd' of the suffix into 't'.
var fortisConsonants = ['p', 'ç', 't', 'k', 'f', 'h', 's', 'ş'];
// transformMap[suffixType][lastVowelOfWord] -> vowel to use inside the suffix.
// suffixType is 'i' or 'e' (see getSuffixType).
var transformMap = {
i: { a: 'ı', e: 'i', ı: 'ı', i: 'i', o: 'u', ö: 'ü', u: 'u', ü: 'ü' },
e: { a: 'a', e: 'e', ı: 'a', i: 'e', o: 'a', ö: 'e', u: 'a', ü: 'e' },
};
// Replacement for the final consonant when lenition applies. The 'k2'/'K2' keys are
// selected by applyLenition() when the 'k' is preceded by a consonant (k -> g instead of ğ).
var lenitionMap = {
p: 'b', ç: 'c', t: 'd', k: 'ğ', k2: 'g',
P: "B", Ç: 'C', T: 'D', K: 'Ğ', K2: 'G'
};
// Alternative spellings accepted in markup: each base suffix with its first 'e' replaced by 'a'
// and its first 'i' replaced by 'ı', plus three upper-case forms written with dotted 'İ'.
// NOTE(review): the explicit 'İ' forms are presumably needed because the regex `i` flag does not
// treat 'İ' as the upper case of 'i' — confirm.
var alternatives = suffixes.map(function (s) { return s.replace('e', 'a').replace('i', 'ı'); })
.concat(['İN', 'İ', 'Mİ']);
// Matches "--" followed by any accepted suffix spelling (case-insensitive); group 1 is the suffix.
var markupPattern = new RegExp("--(" + suffixes.concat(alternatives).join("|") + ")", 'i');
// Matches every vowel in a string (global, case-insensitive); applyLenition() uses it to count vowels.
var vowelPattern = new RegExp(vowels.join("|"), 'gi');
// One or more ASCII letters or Turkish-specific letters.
var wordPattern = /(?:[a-z]|[A-Z]|ç|Ç|ğ|Ğ|ı|İ|ö|Ö|ş|Ş|ü|Ü)+/;
// One or more decimal digits.
var numberPattern = /\d+/;
/**
 * Entry point that dispatches on the type of the second argument.
 *
 * @param text Word (when `option` is a string) or text containing markup.
 * @param option A suffix string, in which case `text` is inflected as a single word via
 *   inflectWord; any other value is forwarded unchanged to inflectText.
 * @returns The inflected word or text.
 */
function inflect(text, option) {
    var isSingleWordCall = typeof option === 'string';
    return isSingleWordCall ? inflectWord(text, option) : inflectText(text, option);
}
// Publish `inflect` on the module's exports object.
exports.inflect = inflect;
/**
 * Inflects every `word--suffix` markup found in a text.
 *
 * First, each `{{key}}` placeholder is replaced with the matching value from `interpolation`.
 * Then every occurrence of markupPattern ("--" followed by a suffix) is replaced by the
 * inflected form of the word that precedes it.
 *
 * Ex: "{{city}}--de", { city: "Ankara" } -> "Ankarada"
 *
 * @param text Text that may contain placeholders and suffix markup.
 * @param interpolation Optional object mapping placeholder names to replacement values.
 * @returns Text with placeholders filled in and markup inflected.
 */
function inflectText(text, interpolation) {
    if (interpolation) {
        for (var key in interpolation) {
            var marker = "{{" + key + "}}";
            var replacement = String(interpolation[key]);
            // A value that contains its own marker can never be fully substituted;
            // do a single pass in that case instead of looping forever.
            var selfReferential = replacement.indexOf(marker) !== -1;
            do {
                // split/join inserts the value literally. String.prototype.replace would
                // interpret "$&", "$$", "$`" and "$'" inside the value.
                text = text.split(marker).join(replacement);
            } while (!selfReferential && text.indexOf(marker) !== -1);
        }
    }
    // Markup located before this offset has no preceding word and is left untouched.
    var searchFrom = 0;
    while (true) {
        var match = text.slice(searchFrom).match(markupPattern);
        if (!match) {
            break;
        }
        var index = searchFrom + match.index;
        var suffix = match[1];
        if (index === 0) {
            // Markup at the very start of the text has nothing to attach to. Skip past it;
            // retrying the same unchanged text would never terminate.
            searchFrom = match[0].length;
            continue;
        }
        var parts = splitLastWord(text.slice(0, index));
        // "+ 2" skips the leading "--" of the markup.
        text = parts[0] + inflectWord(parts[1], suffix) + text.slice(index + suffix.length + 2);
    }
    return text;
}
// Publish `inflectText` on the module's exports object.
exports.inflectText = inflectText;
/**
 * Appends a suffix to a single word, adapting both to each other.
 *
 * The suffix vowel is chosen from transformMap based on the last vowel of the word, then
 * fortitive assimilation (Ex: market + de -> markette) and a buffer letter are applied to
 * the suffix, and consonant lenition (Ex: bıçak + ı -> bıçağı) is applied to the word.
 * The finished suffix is upper-cased when isUpperCase() reports the requested suffix as
 * upper case.
 *
 * @param word Word to inflect, in its original letter case.
 * @param suffix Requested suffix in any accepted spelling.
 * @returns The word followed by the adapted suffix.
 */
function inflectWord(word, suffix) {
    var wantsUpperCase = isUpperCase(suffix);
    var baseSuffix = normalizeSuffix(suffix);
    var lookupWord = normalizeWord(word);
    // Pick the suffix vowel that agrees with the last vowel of the word.
    var harmonyType = getSuffixType(baseSuffix);
    var harmonyVowel = transformMap[harmonyType][getLastVowel(lookupWord)];
    var ending = baseSuffix.replace(harmonyType, harmonyVowel);
    // Suffix adjustments are decided on the normalized word.
    ending = applyFortitiveAssimilation(lookupWord, ending);
    ending = addBufferLetter(lookupWord, ending, baseSuffix);
    // Lenition works on the original word so its letter case is preserved.
    var stem = applyLenition(word, baseSuffix);
    return stem + (wantsUpperCase ? toUpperCase(ending) : ending);
}
// Publish `inflectWord` on the module's exports object.
exports.inflectWord = inflectWord;
/**
 * Hardens the first letter of the suffix from 'd' to 't' when the word ends with a
 * fortis consonant.
 *
 * Ex: market + de -> markette
 *
 * The suffix is returned unchanged when it does not start with 'd', when the word's last
 * character is a space (the suffix is then written as a separate word), or when the last
 * letter of the word is not one of fortisConsonants.
 *
 * @param word Normalized word
 * @param suffix Suffix after vowel harmony
 * @returns Transformed suffix
 */
function applyFortitiveAssimilation(word, suffix) {
    var endsWithSpace = word[word.length - 1] === ' ';
    if (!suffix.startsWith('d')) {
        return suffix;
    }
    if (endsWithSpace) {
        return suffix;
    }
    if (!wordEndsWith(fortisConsonants, word)) {
        return suffix;
    }
    // Only the leading 'd' is affected: String.prototype.replace with a string pattern
    // touches the first occurrence only.
    return suffix.replace('d', 't');
}
/**
 * Checks and transforms word for consonant lenition.
 *
 * Ex: bıçak + ı -> bıçağı
 * Ex: KİTAP + I -> KİTABI
 *
 * Lenition is only considered when the suffix starts with a vowel and the word ends with
 * one of p, ç, t, k (either case). The letter case of the word is preserved.
 *
 * @param word Word in its original letter case
 * @param suffix Normalized suffix
 * @returns Transformed word
 */
function applyLenition(word, suffix) {
    // Nothing to soften in an empty word. This also avoids calling toLowerCase() on the
    // `undefined` last letter further down.
    if (word.length === 0) {
        return word;
    }
    var last = word[word.length - 1];
    if (!startsWithVowel(suffix) || !isNonContinuantFortisConsonant(last)) {
        return word;
    }
    // Lower-case with Turkish rules before inspecting letters. The case-insensitive
    // vowelPattern does not match 'İ', and the default toLowerCase() turns 'İ' into
    // 'i' + U+0307, so upper-case words would otherwise be analysed incorrectly.
    var lowered = word.replace(/İ/g, 'i').replace(/I/g, 'ı').toLowerCase();
    var vowelCount = (lowered.match(vowelPattern) || []).length;
    // The exact spelling is checked first, as before. The lower-cased spelling is checked too
    // so that "Kap"/"KAP" behave like "kap".
    // NOTE(review): this assumes the exception list is stored in lower case — confirm.
    var isException = data_1.lenitionExceptions.includes(word)
        || data_1.lenitionExceptions.includes(lowered);
    /* When to apply lenition?
     * - If word has 2+ syllables, lenition is applied.
     * - If word has only 1 syllable, lenition is NOT applied.
     * - Words in the exception list behave the opposite way.
     */
    var shouldApplyLenition = vowelCount == 1 ? isException : vowelCount > 1 && !isException;
    if (!shouldApplyLenition) {
        return word;
    }
    // If last letter is 'k' and previous letter is a consonant, 'k' becomes 'g' instead of 'ğ'.
    var previous = lowered[lowered.length - 2];
    var key = last;
    if (vowels.indexOf(previous) === -1) {
        if (last == 'k') {
            key = 'k2';
        }
        if (last == 'K') {
            key = 'K2';
        }
    }
    return word.slice(0, word.length - 1) + lenitionMap[key];
}
/**
 * Prefixes the suffix with a buffer consonant when a vowel-final word meets a
 * vowel-initial suffix.
 *
 * Ex: araba + a -> arabaya
 * Ex: araba + ın -> arabanın
 *
 * The buffer is 'y' when the normalized suffix is exactly 'i' or 'e', and 'n' for every
 * other vowel-initial suffix.
 *
 * @param word Normalized word
 * @param suffix Suffix after vowel harmony and assimilation
 * @param normalizedSuffix Suffix as returned by normalizeSuffix
 * @returns Transformed suffix
 */
function addBufferLetter(word, suffix, normalizedSuffix) {
    var needsBuffer = startsWithVowel(normalizedSuffix) && wordEndsWith(vowels, word);
    if (!needsBuffer) {
        return suffix;
    }
    var isBareVowelSuffix = normalizedSuffix === 'i' || normalizedSuffix === 'e';
    return (isBareVowelSuffix ? 'y' : 'n') + suffix;
}
/**
 * Returns the vowel closest to the end of the word.
 *
 * @param word Normalized (lower-case) word
 * @returns The last character of the word that is listed in `vowels`, or "e" when the
 *   word contains none.
 */
function getLastVowel(word) {
    var position = word.length;
    while (position-- > 0) {
        var letter = word[position];
        if (vowels.includes(letter)) {
            return letter;
        }
    }
    /* A word without vowels is read letter by letter, each followed by an "e".
     * Ex: "PTT" reads "petete"
     */
    return "e";
}
/**
 * Produces the lower-case form of a word that is used to choose suffix letters.
 *
 * - If the word is numeric (see isWordNumeric), the number is replaced by the last word of
 *   its Turkish reading, followed by whatever came after the number in the input.
 *   Anything before the number is dropped.
 * - Otherwise the word is lower-cased with Turkish rules: İ -> i and I -> ı.
 *
 * @param word Word in its original form
 * @returns Normalized word
 */
function normalizeWord(word) {
    // If word is numeric, convert the number to text and use it to get vowel
    if (isWordNumeric(word)) {
        var num = getNumberFromWord(word);
        var trailing = word.slice(word.lastIndexOf(num) + num.length);
        return getLastWordOfNumber(num) + trailing;
    }
    // Global patterns are required: replace() with a string pattern converts only the FIRST
    // occurrence, which left later 'I' letters to be lower-cased to 'i' (wrong vowel
    // harmony for words such as "IŞIK") and later 'İ' letters as 'i' + U+0307.
    return word.replace(/İ/g, 'i').replace(/I/g, 'ı').toLowerCase();
}
/**
 * Tells whether the word should be read as a number.
 *
 * Characters are inspected from the end. The answer is decided by the first character that
 * is either a letter (not numeric) or a digit (numeric); other characters such as
 * apostrophes and spaces are skipped.
 *
 * @param word Word to inspect
 * @returns true when a digit is found before any letter, scanning backwards
 */
function isWordNumeric(word) {
    for (var position = word.length; position > 0; position--) {
        var character = word.charAt(position - 1);
        if (wordPattern.test(character)) {
            return false;
        }
        if (numberPattern.test(character)) {
            return true;
        }
    }
    return false;
}
/**
 * Extracts the last run of digits from a word.
 *
 * The word is reversed so that the first match of numberPattern is the run closest to the
 * end; the match is then reversed back into reading order.
 *
 * @param word Word that contains at least one digit
 * @returns The last digit sequence of the word
 */
function getNumberFromWord(word) {
    var flip = function (value) {
        return value.split("").reverse().join("");
    };
    var found = flip(word).match(numberPattern);
    return flip(found[0]);
}
/**
 * Returns the last word of the Turkish reading of a number. Only this word matters for
 * choosing suffix letters.
 *
 * Ex: "1990" -> "doksan", "2000" -> "bin", "3000000" -> "milyon", "7" -> "yedi"
 *
 * @param num Number written as a string of digits
 * @returns Last word of the number's reading; "sıfır" when nothing else applies
 */
function getLastWordOfNumber(num) {
    // Round magnitudes, largest first. A magnitude applies when the number ends with its
    // zeros and has at least one more digit in front of them.
    var magnitudes = [
        ["000000000000", "trilyon"],
        ["000000000", "milyar"],
        ["000000", "milyon"],
        ["000", "bin"],
        ["00", "yüz"]
    ];
    for (var m = 0; m < magnitudes.length; m++) {
        var zeros = magnitudes[m][0];
        if (num.length > zeros.length && num.endsWith(zeros)) {
            return magnitudes[m][1];
        }
    }
    var tens = ["on", "yirmi", "otuz", "kırk", "elli", "altmış", "yetmiş", "seksen", "doksan"];
    var ones = ["bir", "iki", "üç", "dört", "beş", "altı", "yedi", "sekiz", "dokuz"];
    var lastDigit = num.charAt(num.length - 1);
    var tensDigit = num.charAt(num.length - 2);
    // A number ending in a non-zero digit followed by 0 is read with the word for that ten.
    if (lastDigit === "0" && /^[1-9]$/.test(tensDigit)) {
        return tens[Number(tensDigit) - 1];
    }
    if (/^[1-9]$/.test(lastDigit)) {
        return ones[Number(lastDigit) - 1];
    }
    return "sıfır";
}
/**
 * Reduces a suffix to its lower-case base spelling.
 *
 * The first 'İ' becomes 'i', the result is lower-cased, then the first 'a' becomes 'e' and
 * the first 'ı' becomes 'i'.
 *
 * Ex: "DAN" -> "den", "ın" -> "in"
 *
 * @param suffix Suffix in any accepted spelling
 * @returns Normalized suffix
 */
function normalizeSuffix(suffix) {
    var lowered = suffix.replace('İ', 'i').toLowerCase();
    var withFrontE = lowered.replace("a", "e");
    return withFrontE.replace("ı", "i");
}
/**
 * Determines which row of transformMap applies to a suffix.
 *
 * @param suffix Normalized suffix
 * @returns 'i' when the suffix contains the letter 'i', otherwise 'e'
 */
function getSuffixType(suffix) {
    if (suffix.includes('i')) {
        return 'i';
    }
    return 'e';
}
/**
 * Tells whether the first character of the text is one of the lower-case `vowels`.
 *
 * @param text Text to inspect (expected to be lower case)
 * @returns true when text[0] is listed in `vowels`; false for an empty string
 */
function startsWithVowel(text) {
    var firstCharacter = text[0];
    return vowels.indexOf(firstCharacter) !== -1;
}
/**
 * Tells whether a letter is one of p, ç, t, k in either letter case.
 *
 * @param letter Single letter
 * @returns true when the lower-cased letter is listed in nonContinuantFortisConsonants
 */
function isNonContinuantFortisConsonant(letter) {
    var lowered = letter.toLowerCase();
    return nonContinuantFortisConsonants.indexOf(lowered) >= 0;
}
/**
 * Tells whether a letter is NOT a vowel. Every character that isVowel() rejects counts as
 * a consonant here, including digits and punctuation.
 *
 * @param letter Single character
 * @returns Negation of isVowel(letter)
 */
function isConsonant(letter) {
    if (isVowel(letter)) {
        return false;
    }
    return true;
}
/**
 * Tells whether a letter is a Turkish vowel, in either letter case.
 *
 * @param letter Single letter
 * @returns true when the letter, lower-cased with Turkish rules, is listed in `vowels`
 */
function isVowel(letter) {
    // Map the Turkish upper-case pair explicitly before lower-casing: the default
    // toLowerCase() turns 'İ' into 'i' followed by U+0307, which is not in `vowels`.
    var lowered = letter.replace(/İ/g, 'i').replace(/I/g, 'ı').toLowerCase();
    return vowels.includes(lowered);
}
/**
 * Tells whether a suffix was written in upper case, judged by the presence of an
 * upper-case vowel that can occur in a suffix (A, E, I or İ).
 *
 * @param suffix Suffix as written in the input
 * @returns true when the suffix contains A, E, I or İ
 */
function isUpperCase(suffix) {
    return suffix.search(/A|E|I|İ/) !== -1;
}
/**
 * Splits a text into everything before its last word and the last word itself.
 *
 * The final character always belongs to the word, whatever it is, so a trailing space or
 * apostrophe stays attached (Ex: "gördün " or "Ahmet'"). From there the word extends
 * backwards over letters and digits and stops at the first other character.
 *
 * Ex: "bir kap" -> ["bir ", "kap"]
 *
 * @param text Text that ends with the word to inflect
 * @returns Two-element array: [text before the word, the word]
 */
function splitLastWord(text) {
    var i;
    for (i = text.length - 2; i >= 0; i--) {
        if (!wordPattern.test(text[i]) && !numberPattern.test(text[i])) {
            break;
        }
    }
    // `i` is the index of the delimiter in front of the word (or negative when there is
    // none), so the word starts one position later. Cutting at `i` itself would leave the
    // delimiter glued to the word (" kap") and break exact-match lookups on the word.
    var cut = Math.max(i + 1, 0);
    return [text.slice(0, cut), text.slice(cut)];
}
/**
 * Tells whether the last letter of a word is one of the given letters.
 *
 * Trailing characters that are not letters (apostrophes, spaces, digits, ...) are skipped
 * when looking for the last letter.
 *
 * @param letters Lower-case letters to test against
 * @param word Word to inspect
 * @returns true when the lower-cased last letter is in `letters`; false when the word
 *   contains no letter at all
 */
function wordEndsWith(letters, word) {
    var lastWordChar = '';
    var position = word.length;
    // Walk backwards until a letter is found or the word is exhausted.
    while (lastWordChar === '' && position > 0) {
        position--;
        if (wordPattern.test(word[position])) {
            lastWordChar = word[position];
        }
    }
    return letters.includes(lastWordChar.toLowerCase());
}
/**
 * Upper-cases text with Turkish rules for the letter i.
 *
 * Every dotted 'i' becomes 'İ' before the default upper-casing runs; dotless 'ı' is
 * turned into 'I' by String.prototype.toUpperCase itself.
 *
 * Ex: "nin" -> "NİN", "dan" -> "DAN"
 *
 * @param word Lower-case text
 * @returns Upper-case text
 */
function toUpperCase(word) {
    // A global pattern is required: replace() with a string pattern converts only the first
    // 'i', leaving any later one to be upper-cased to dotless 'I'.
    return word.replace(/i/g, 'İ').toUpperCase();
}