// unicode-to-plain-text
// Version:
// Convert fancy Unicode text to plain ASCII with smart language preservation
// 80 lines (79 loc) • 3.36 kB
// JavaScript
;
// Define an `__esModule: true` property on the CommonJS `exports` object
// (presumably the ES-module interop marker emitted by the compiler that
// produced this file — confirm against the build setup).
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the export as `undefined`; the real function is assigned to
// `exports.normalizeCasing` right after it is defined further down.
exports.normalizeCasing = void 0;
// Minimum share (0..1) of a word's ASCII letters that must be in one case for
// that case to count as "strong" (compared against upperRatio / lowerRatio).
const STRONG_CASE_THRESHOLD = 0.75;
/**
 * Normalizes the letter casing of every whitespace-delimited word in `text`.
 *
 * The text is tokenized on runs of whitespace (spaces, tabs, line breaks, ...)
 * rather than on single space characters only, so words on adjacent lines or
 * separated by tabs are analysed independently instead of being fused into one
 * token. Every separator run is kept verbatim, which means the result has
 * exactly the same spacing as the input; for text that only uses plain spaces
 * the output is identical to splitting and re-joining on `' '`.
 *
 * @param {string} text - Text whose words should be case-normalized.
 * @returns {string} The text with each word passed through `normalizeWord`.
 */
const normalizeCasing = (text) => text
    // The capturing group makes split() keep the separators: even indices are
    // words (possibly empty), odd indices are the whitespace runs between them.
    .split(/(\s+)/)
    .map((token, index) => (index % 2 === 1 ? token : normalizeWord(token)))
    .join('');
exports.normalizeCasing = normalizeCasing;
/**
 * Normalizes the casing of a single word.
 *
 * The word is returned untouched when it is empty, a single character, has at
 * most one ASCII letter, or when its case pattern is already proper-cased or
 * uniformly cased. Otherwise the decision is delegated to `applyNormalization`.
 *
 * @param {string} word - One token of the input text.
 * @returns {string} The word, possibly re-cased.
 */
const normalizeWord = (word) => {
    // Nothing to analyse in empty or one-character tokens.
    const isTrivial = !word || word.length <= 1;
    if (isTrivial) {
        return word;
    }

    // A casing pattern needs at least two letters to be meaningful.
    const asciiLetters = getLetters(word);
    if (asciiLetters.length <= 1) {
        return word;
    }

    const pattern = getCasePattern(word, asciiLetters);
    const alreadyConsistent = pattern.isProper || pattern.isAllSame;
    return alreadyConsistent
        ? word
        : applyNormalization(word, asciiLetters, pattern);
};
/**
 * Collects the ASCII letters (a-z, A-Z) of a word in their original order.
 *
 * @param {string} word - The word to scan.
 * @returns {string[]} One array entry per ASCII letter found; digits,
 *   punctuation and non-ASCII characters are skipped.
 */
const getLetters = (word) => {
    const collected = [];
    // Iterating the string walks it by code point.
    for (const symbol of word) {
        if (/[a-zA-Z]/.test(symbol)) {
            collected.push(symbol);
        }
    }
    return collected;
};
/**
 * Summarizes how upper- and lowercase ASCII letters are distributed in a word.
 *
 * @param {string} word - The raw word, possibly containing non-letter characters.
 * @param {string[]} letters - The ASCII letters of `word`, in order. Expected
 *   to be non-empty; with an empty array both ratios evaluate to NaN.
 * @returns {{
 *   isProper: boolean,
 *   isAllSame: boolean,
 *   uppercaseCount: number,
 *   lowercaseCount: number,
 *   upperRatio: number,
 *   lowerRatio: number,
 *   isStrongUpper: boolean,
 *   isStrongLower: boolean,
 *   isBalanced: boolean
 * }} Case statistics:
 *   - `isProper`: the first ASCII letter is uppercase and everything after it
 *     is unchanged by `toLowerCase()`.
 *   - `isAllSame`: the whole word equals its own upper- or lowercase form.
 *   - `isStrongUpper` / `isStrongLower`: the respective ratio reaches
 *     `STRONG_CASE_THRESHOLD`.
 *   - `isBalanced`: the upper- and lowercase counts differ by at most one.
 */
const getCasePattern = (word, letters) => {
    const uppercaseCount = letters.filter((c) => c === c.toUpperCase()).length;
    const lowercaseCount = letters.length - uppercaseCount;
    const upperRatio = uppercaseCount / letters.length;
    const lowerRatio = lowercaseCount / letters.length;
    // search() yields a UTF-16 code-unit offset, which is what word[i] and
    // word.slice() expect. A code-point index (e.g. from Array.from) would be
    // off by one for every astral character (emoji, fancy Unicode letters)
    // that precedes the first ASCII letter.
    const firstLetterIndex = word.search(/[a-zA-Z]/);
    const hasUpperFirst = firstLetterIndex >= 0 && /[A-Z]/.test(word[firstLetterIndex]);
    const restAfterFirst = word.slice(firstLetterIndex + 1);
    const isProper = hasUpperFirst && restAfterFirst === restAfterFirst.toLowerCase();
    return {
        isProper,
        isAllSame: word === word.toUpperCase() || word === word.toLowerCase(),
        uppercaseCount,
        lowercaseCount,
        upperRatio,
        lowerRatio,
        isStrongUpper: upperRatio >= STRONG_CASE_THRESHOLD,
        isStrongLower: lowerRatio >= STRONG_CASE_THRESHOLD,
        isBalanced: Math.abs(uppercaseCount - lowercaseCount) <= 1,
    };
};
/**
 * Re-cases a word whose casing is mixed, using the statistics in `pattern`.
 *
 * Rules are tried in this order:
 *   1. Strongly uppercase  -> whole word uppercased.
 *   2. Strongly lowercase  -> whole word lowercased.
 *   3. First ASCII letter is uppercase and more than half of the letters are
 *      lowercase -> keep everything up to and including that first letter,
 *      lowercase the remainder.
 *   4. Balanced counts with at least three letters -> follow the case bias of
 *      the interior letters, falling back to the bias of the first/last letter.
 *   5. Otherwise the word is returned unchanged.
 *
 * @param {string} word - The raw word to re-case.
 * @param {string[]} letters - The ASCII letters of `word`, in order.
 * @param {object} pattern - Case statistics for `word`; this function reads
 *   `isStrongUpper`, `isStrongLower`, `lowerRatio` and `isBalanced`.
 * @returns {string} The re-cased word, or `word` itself if no rule applies.
 */
const applyNormalization = (word, letters, pattern) => {
    if (pattern.isStrongUpper)
        return word.toUpperCase();
    if (pattern.isStrongLower)
        return word.toLowerCase();
    // search() yields a UTF-16 code-unit offset, matching word[i] and slice().
    // A code-point index would point at the wrong unit whenever an astral
    // character (emoji, fancy Unicode letter) precedes the first ASCII letter.
    const firstLetterIndex = word.search(/[a-zA-Z]/);
    if (firstLetterIndex >= 0 && /[A-Z]/.test(word[firstLetterIndex]) && pattern.lowerRatio > 0.5) {
        const splitAt = firstLetterIndex + 1;
        return `${word.slice(0, splitAt)}${word.slice(splitAt).toLowerCase()}`;
    }
    if (pattern.isBalanced && letters.length >= 3) {
        const interiorBias = getInteriorCaseBias(letters);
        if (interiorBias === 'lower')
            return word.toLowerCase();
        if (interiorBias === 'upper')
            return word.toUpperCase();
        const boundaryBias = getBoundaryCaseBias(letters);
        if (boundaryBias)
            return boundaryBias === 'upper' ? word.toUpperCase() : word.toLowerCase();
    }
    return word;
};
/**
 * Determines which case dominates among the interior letters of a word,
 * i.e. all letters except the first and the last one.
 *
 * @param {string[]} letters - The letters of a word, in order.
 * @returns {'lower'|'upper'|null} The dominant interior case, or `null` when
 *   upper- and lowercase interior letters are tied (including when there are
 *   no interior letters at all).
 */
const getInteriorCaseBias = (letters) => {
    // Running tally: +1 for each uppercase interior letter, -1 for each lowercase one.
    let tally = 0;
    for (let i = 1; i < letters.length - 1; i += 1) {
        const letter = letters[i];
        tally += letter === letter.toUpperCase() ? 1 : -1;
    }
    if (tally < 0) {
        return 'lower';
    }
    if (tally > 0) {
        return 'upper';
    }
    return null;
};
/**
 * Determines the case shared by the first and last letter of a word.
 *
 * @param {string[]} letters - The letters of a word, in order; must contain
 *   at least one entry (an empty array makes the first lookup throw).
 * @returns {'upper'|'lower'|null} `'upper'` when both boundary letters are
 *   uppercase, `'lower'` when both are lowercase, `null` when they disagree.
 */
const getBoundaryCaseBias = (letters) => {
    const isUpper = (letter) => letter === letter.toUpperCase();
    const startsUpper = isUpper(letters[0]);
    const endsUpper = isUpper(letters[letters.length - 1]);
    // Disagreeing boundaries give no usable signal.
    if (startsUpper !== endsUpper) {
        return null;
    }
    return startsUpper ? 'upper' : 'lower';
};