UNPKG

normalize-vietnamese

Version:

A TypeScript library for Vietnamese text processing including accent normalization, text masking, and string utilities

github.com/nvminh461/normalize-vietnamese

nvminh461/normalize-vietnamese

228 lines • 10 kB

JavaScript

"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); const slugify_1 = __importDefault(require("slugify")); class Str { static normalize(text) { if (typeof text !== "string") { return text; } slugify_1.default.extend({ ".": " ", "-": " ", }); return (0, slugify_1.default)(text, { lower: true, locale: "vi", trim: true, replacement: " ", remove: /[^\w\s]+/g, }); } /** * Normalize Vietnamese accent marks according to Vietnamese grammar rules * @param text - The text to normalize * @returns The normalized text with proper accent placement */ static normalizeVietnameseAccent(text) { if (typeof text !== "string") { return text; } // Map of characters with their base forms and accent marks const accentMap = { a: { base: "a", marks: ["à", "á", "ả", "ã", "ạ"] }, ă: { base: "ă", marks: ["ằ", "ắ", "ẳ", "ẵ", "ặ"] }, â: { base: "â", marks: ["ầ", "ấ", "ẩ", "ẫ", "ậ"] }, e: { base: "e", marks: ["è", "é", "ẻ", "ẽ", "ẹ"] }, ê: { base: "ê", marks: ["ề", "ế", "ể", "ễ", "ệ"] }, i: { base: "i", marks: ["ì", "í", "ỉ", "ĩ", "ị"] }, o: { base: "o", marks: ["ò", "ó", "ỏ", "õ", "ọ"] }, ô: { base: "ô", marks: ["ồ", "ố", "ổ", "ỗ", "ộ"] }, ơ: { base: "ơ", marks: ["ờ", "ớ", "ở", "ỡ", "ợ"] }, u: { base: "u", marks: ["ù", "ú", "ủ", "ũ", "ụ"] }, ư: { base: "ư", marks: ["ừ", "ứ", "ử", "ữ", "ự"] }, y: { base: "y", marks: ["ỳ", "ý", "ỷ", "ỹ", "ỵ"] }, A: { base: "A", marks: ["À", "Á", "Ả", "Ã", "Ạ"] }, Ă: { base: "Ă", marks: ["Ằ", "Ắ", "Ẳ", "Ẵ", "Ặ"] }, Â: { base: "Â", marks: ["Ầ", "Ấ", "Ẩ", "Ẫ", "Ậ"] }, E: { base: "E", marks: ["È", "É", "Ẻ", "Ẽ", "Ẹ"] }, Ê: { base: "Ê", marks: ["Ề", "Ế", "Ể", "Ễ", "Ệ"] }, I: { base: "I", marks: ["Ì", "Í", "Ỉ", "Ĩ", "Ị"] }, O: { base: "O", marks: ["Ò", "Ó", "Ỏ", "Õ", "Ọ"] }, Ô: { base: "Ô", marks: ["Ồ", "Ố", "Ổ", "Ỗ", "Ộ"] }, Ơ: { base: "Ơ", marks: ["Ờ", "Ớ", "Ở", "Ỡ", "Ợ"] }, U: { base: "U", marks: ["Ù", "Ú", "Ủ", "Ũ", "Ụ"] }, Ư: { base: "Ư", marks: ["Ừ", "Ứ", "Ử", "Ữ", "Ự"] }, Y: { base: "Y", marks: ["Ỳ", "Ý", "Ỷ", "Ỹ", "Ỵ"] }, }; // Function to extract accent mark from a character const extractAccent = (char) => { for (const [base, data] of Object.entries(accentMap)) { const markIndex = data.marks.indexOf(char); if (markIndex !== -1) { return { base, mark: markIndex }; } } return { base: char, mark: -1 }; }; // Function to apply accent mark to a character const applyAccent = (base, mark) => { if (mark === -1) return base; const data = accentMap[base]; return data ? data.marks[mark] : base; }; // Function to check if word has final consonant const hasFinalConsonant = (word) => { const lastChar = word[word.length - 1]; const { base } = extractAccent(lastChar); return !accentMap[base] && !accentMap[base.toLowerCase()]; }; // Function to normalize accent for a word const normalizeWord = (word) => { // Correct accent on "gi" and "qu" const correctChars = []; const vowels = []; // Special consonant clusters that should be treated as single consonants const specialConsonants = ["gi", "qu"]; let accentIndex = -1; for (let i = 0; i < word.length; i++) { const char = word[i]; const { base, mark } = extractAccent(char); let isSpecialConsonant = false; if (i === 0 && word.length > 2) { // Check if this is part of a special consonant cluster "gi" and "qu" const nextChar = extractAccent(word[i + 1]); const currentPair = char + nextChar.base; for (const special of specialConsonants) { if (currentPair.toLowerCase() === special) { isSpecialConsonant = true; // Skip the next character as it's part of the consonant cluster correctChars.push(currentPair); i++; if (nextChar.mark !== -1) { accentIndex = nextChar.mark; } break; } } } // Only add as vowel if it's not part of a special consonant cluster if (!isSpecialConsonant) { if (accentMap[base] || accentMap[base.toLowerCase()]) { vowels.push({ char: base.toLowerCase(), index: i, accent: mark, }); correctChars.push(applyAccent(base, accentIndex !== -1 ? accentIndex : mark)); accentIndex = -1; } else { correctChars.push(base); } } } const correctedWord = correctChars.join(""); if (vowels.length === 0) return correctedWord; // Find the accent mark to preserve let accentToPreserve = -1; let currentAccentIndex = -1; for (let i = 0; i < vowels.length; i++) { if (vowels[i].accent !== -1) { accentToPreserve = vowels[i].accent; currentAccentIndex = i; break; } } // If no accent mark found, return original word if (accentToPreserve === -1) return correctedWord; // Determine target vowel position based on rules let targetIndex = 0; // Exception rule: 'ê' and 'ơ' have priority regardless of position const priorityVowels = ["ê", "ơ"]; for (let i = 0; i < vowels.length; i++) { if (priorityVowels.includes(vowels[i].char)) { targetIndex = i; break; } } // If no priority vowel found, apply normal rules if (targetIndex === 0 && !priorityVowels.includes(vowels[0].char)) { if (vowels.length === 1) { // Rule 1: Single vowel - place accent on the vowel targetIndex = 0; } else if (vowels.length === 2) { // Rule 2: Two vowels (diphthong) - place accent on first vowel // But if there's a final consonant, treat as triphthong if (hasFinalConsonant(correctedWord)) { targetIndex = 1; // Move to second vowel } else { targetIndex = 0; // Stay on first vowel } } else if (vowels.length >= 3) { // Rule 2: Three vowels (triphthong) - move accent to second vowel targetIndex = 1; } } // If accent is already in the correct position, return original word if (currentAccentIndex === targetIndex) { return correctedWord; } // Apply accent to target vowel const result = correctedWord.split(""); const targetVowel = vowels[targetIndex]; const originalChar = result[targetVowel.index]; const { base } = extractAccent(originalChar); result[targetVowel.index] = applyAccent(base, accentToPreserve); // Remove accents from other vowels for (let i = 0; i < vowels.length; i++) { if (i !== targetIndex) { const vowel = vowels[i]; const originalChar = result[vowel.index]; const { base } = extractAccent(originalChar); result[vowel.index] = base; } } return result.join(""); }; // Split text into words and normalize each word return text .normalize("NFC") .split(/(\s+|\p{P}+)/u) .map((part) => { // Only process non-whitespace parts if (/\s|\p{P}/u.test(part)) { return part; } return normalizeWord(part); }) .join(""); } } Str.mask = (text, start = 0, end = 0) => { const length = text.length; const _start = start < 0 ? length + start : start; const _end = end <= 0 ? length + end : end; if (!length || _start < 0 || _start >= length || _end <= 0 || _end > length || _start >= _end) { return text; } const maskedLength = _end - _start; return (text.substring(0, _start) + "*".repeat(maskedLength) + text.substring(_end)); }; exports.default = Str; //# sourceMappingURL=string.js.map