/*
 * lao-word-splitter
 * Version: (not specified)
 * A utility to split Lao language sentences into individual words.
 * Original listing metadata: 344 lines (343 loc), 17.8 kB, JavaScript.
 */
// CommonJS module prologue (looks like compiler-emitted output — NOTE(review): confirm the build tool).
"use strict";
// Flag the exports object with `__esModule: true` so ES-module interop helpers can recognise it.
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder for the export; the real class is assigned after its definition below.
exports.LaoWordSplitter = void 0;
/**
 * Rule-based splitter that segments Lao text into syllable-level words.
 *
 * The splitter walks the input one UTF-16 code unit at a time and decides, for
 * every character, whether it continues the word being built or starts a new
 * one. The rules cover leading vowels, vowel/tone marks, consonant clusters
 * (consonant + 'ວ', consonant + 'ຣ', digraphs with 'ຫ'), the repetition mark
 * 'ໆ', whitespace, and runs of non-Lao characters.
 */
class LaoWordSplitter {
    constructor() {
        // Base consonants; 'ໜ' and 'ໝ' are treated as distinct consonants.
        this.LAO_CONSONANTS = new Set([
            'ກ', 'ຂ', 'ຄ', 'ງ', 'ຈ', 'ສ', 'ຊ', 'ຍ', 'ດ', 'ຕ', 'ຖ', 'ທ', 'ນ',
            'ບ', 'ປ', 'ຜ', 'ຝ', 'ພ', 'ຟ', 'ມ', 'ຢ', 'ຣ', 'ລ', 'ວ', 'ຫ', 'ອ', 'ຮ',
            'ໜ', 'ໝ'
        ]);
        // Vowels written before their consonant; they always open a new word.
        // Examples: ເກມ, ແປ, ໂຕ, ໄປ, ໃນ
        this.LEADING_VOWELS = new Set(['ເ', 'ແ', 'ໂ', 'ໄ', 'ໃ']);
        // Characters that attach to a preceding consonant: vowels, tones, signs.
        // Examples: ກະ, ນາ, ດິນ, ຊີມ, ກຶກ, ປື, ຮຸກ, ງູ
        this.MIDDLE_CHARS = new Set([
            'ະ', 'າ', 'ິ', 'ີ', 'ຶ', 'ື', 'ຸ', 'ູ', // Base vowels
            'ໍ', // Vowel O / AM
            'ຳ', // Vowel AM (also often acts as middle)
            '່', '້', '໊', '໋', // Tones
            'ຼ', // Ligation mark (LO)
            '໌', // Cancellation mark (KAN)
            'ຽ', // Vowel IA
            'ັ', // Vowel sign MAI KANG
            'ົ' // Vowel sign MAI KON
        ]);
        // Consonants that can follow 'ຫ' to form a digraph.
        // 'ນ' and 'ມ' cover the decomposed spellings of 'ໜ' and 'ໝ' (ຫນ້າ, ຫມາ).
        // Examples: ຫງາຍ, ຫຍ້າ, ຫນ້າ, ຫມາ, ຫລາຍ, ຫວ້າ, ຫຣິດ
        this.DIGRAPH_FOLLOWERS = new Set(['ງ', 'ຍ', 'ນ', 'ມ', 'ລ', 'ວ', 'ຣ']);
        // Consonants that form a cluster with a following 'ວ' (ກວ, ຂວ, ຄວ).
        this.CONSONANT_V_LEADERS = new Set(['ກ', 'ຂ', 'ຄ']);
        // Consonants that form a cluster with a following 'ຣ' (ທຣ, ປຣ, ກຣ, ບຣ, ຟຣ).
        this.CONSONANT_R_LEADERS = new Set(['ທ', 'ປ', 'ກ', 'ບ', 'ຟ']);
        // Repetition mark; always emitted as its own word (ແດງໆ => ແດງ, ໆ).
        this.MAI_YAMOK = 'ໆ';
    }

    /**
     * Checks whether the first code unit of `char` lies in the Lao Unicode block.
     *
     * Examples:
     * isLaoCharacter('ກ') => true
     * isLaoCharacter('a') => false
     * isLaoCharacter('') => false
     *
     * @param {string} char Character to test (only its first code unit is read).
     * @returns {boolean} True when the code unit is within U+0E80..U+0EFF.
     */
    isLaoCharacter(char) {
        if (!char) {
            return false;
        }
        const charCode = char.charCodeAt(0);
        return charCode >= 0x0E80 && charCode <= 0x0EFF;
    }

    /**
     * Checks whether `char` is whitespace, using the same definition as
     * `String.prototype.trim` (space, tab, line terminators, NBSP, ...), so the
     * per-character separator rule agrees with the empty-input check in `split`.
     *
     * @param {string} char A single, non-empty character.
     * @returns {boolean} True when trimming the character leaves nothing.
     */
    isWhitespace(char) {
        return char.trim().length === 0;
    }

    /**
     * Flushes the word being built (if any) and returns the start of the next one.
     *
     * Examples:
     * addWordToResult('ປະ', words, 'ເ') => 'ເ' and words = ['ປະ']
     * addWordToResult('', words) => '' and words is unchanged
     *
     * @param {string} currentWord Word accumulated so far; pushed when non-empty.
     * @param {string[]} resultArray Output list, mutated in place.
     * @param {string} [newWord=''] Initial content of the next word.
     * @returns {string} `newWord`.
     */
    addWordToResult(currentWord, resultArray, newWord = '') {
        if (currentWord.length > 0) {
            resultArray.push(currentWord);
        }
        return newWord;
    }

    /**
     * Shared implementation of the cluster helpers: the last `tailLength`
     * characters of `currentWord` belong to the syllable that `char` continues,
     * so everything before them is flushed as a finished word.
     *
     * A prefix made up only of leading vowels is never flushed: a leading vowel
     * cannot stand alone, so it stays attached to the cluster that follows it
     * (ແຂ + ວ => ແຂວ, not ແ | ຂວ).
     *
     * @param {string} currentWord Word accumulated so far.
     * @param {number} tailLength Number of trailing characters to keep with `char`.
     * @param {string} char Character being appended.
     * @param {string[]} resultArray Output list, mutated in place.
     * @returns {string} The new word in progress.
     */
    splitBeforeTail(currentWord, tailLength, char, resultArray) {
        const prefix = currentWord.slice(0, -tailLength);
        const prefixIsOnlyLeadingVowels = [...prefix].every((c) => this.LEADING_VOWELS.has(c));
        if (prefix.length === 0 || prefixIsOnlyLeadingVowels) {
            return currentWord + char;
        }
        resultArray.push(prefix);
        return currentWord.slice(-tailLength) + char;
    }

    /**
     * Handles a middle character arriving after a consonant + 'ວ' cluster.
     *
     * Examples:
     * handleConsonantVSequence('ຈົນກວ', '່', words) => 'ກວ່' and words = ['ຈົນ']
     * handleConsonantVSequence('ຄວ', 'າ', words) => 'ຄວາ' and words = []
     *
     * @param {string} currentWord Word ending in the two-character cluster.
     * @param {string} char Middle character being appended.
     * @param {string[]} resultArray Output list, mutated in place.
     * @returns {string} The new word in progress.
     */
    handleConsonantVSequence(currentWord, char, resultArray) {
        return this.splitBeforeTail(currentWord, 2, char, resultArray);
    }

    /**
     * Handles a middle character arriving after a consonant + 'ຣ' cluster.
     *
     * Examples:
     * handleConsonantRSequence('ທຣ', 'ັ', words) => 'ທຣັ' and words = []
     * handleConsonantRSequence('ກຣ', 'າ', words) => 'ກຣາ' and words = []
     *
     * @param {string} currentWord Word ending in the two-character cluster.
     * @param {string} char Middle character being appended.
     * @param {string[]} resultArray Output list, mutated in place.
     * @returns {string} The new word in progress.
     */
    handleConsonantRSequence(currentWord, char, resultArray) {
        return this.splitBeforeTail(currentWord, 2, char, resultArray);
    }

    /**
     * Handles a middle character arriving after a 'ຫ' digraph.
     *
     * Examples:
     * handleDigraphSequence('ຫວ', 'າ', words) => 'ຫວາ' and words = []
     * handleDigraphSequence('ເຫລ', 'ື', words) => 'ເຫລື' and words = []
     *
     * @param {string} currentWord Word ending in the two-character digraph.
     * @param {string} char Middle character being appended.
     * @param {string[]} resultArray Output list, mutated in place.
     * @returns {string} The new word in progress.
     */
    handleDigraphSequence(currentWord, char, resultArray) {
        return this.splitBeforeTail(currentWord, 2, char, resultArray);
    }

    /**
     * Handles a middle character arriving after a single consonant: the
     * consonant starts a new word and everything before it is flushed.
     *
     * Examples:
     * handleRegularMiddleChar('ເທດລ', 'າ', words) => 'ລາ' and words = ['ເທດ']
     *
     * @param {string} currentWord Word ending in the consonant.
     * @param {string} char Middle character being appended.
     * @param {string[]} resultArray Output list, mutated in place.
     * @returns {string} The new word in progress.
     */
    handleRegularMiddleChar(currentWord, char, resultArray) {
        return this.splitBeforeTail(currentWord, 1, char, resultArray);
    }

    /**
     * Handles 'ວ' or 'ອ' sitting between two consonants: the preceding
     * consonant starts a new word and everything before it is flushed.
     *
     * Examples:
     * handleVaOrOSequence('ສຶ່ງສ', 'ວ', words) => 'ສວ' and words = ['ສຶ່ງ']
     * handleVaOrOSequence('ແຂ', 'ວ', words) => 'ແຂວ' and words = []
     *
     * @param {string} currentWord Word ending in the consonant before 'ວ'/'ອ'.
     * @param {string} char The 'ວ' or 'ອ' being appended.
     * @param {string[]} resultArray Output list, mutated in place.
     * @returns {string} The new word in progress.
     */
    handleVaOrOSequence(currentWord, char, resultArray) {
        return this.splitBeforeTail(currentWord, 1, char, resultArray);
    }

    /**
     * Removes every zero width space (U+200B) from the text.
     *
     * @param {string} text Input text.
     * @returns {string} The text without U+200B characters.
     */
    removeZeroWidthSpaces(text) {
        return text.replace(/\u200B/g, '');
    }

    /**
     * Attaches a middle character (vowel, tone, sign) to the word in progress,
     * flushing the previous syllable first when the character opens a new one.
     *
     * @param {string} currentWord Word accumulated so far (may be empty).
     * @param {string} char A member of `MIDDLE_CHARS`.
     * @param {string[]} resultArray Output list, mutated in place.
     * @returns {string} The new word in progress.
     */
    attachMiddleChar(currentWord, char, resultArray) {
        // Unusual input: a middle character with nothing before it starts a word.
        if (currentWord.length === 0) {
            return char;
        }
        const lastChar = currentWord[currentWord.length - 1];
        const secondLastChar = currentWord.length > 1 ? currentWord[currentWord.length - 2] : '';
        // Stacked marks (ສຶ + ່) and leading-vowel syllables (ເປ + ັ) just grow.
        if (this.MIDDLE_CHARS.has(lastChar) || this.LEADING_VOWELS.has(secondLastChar)) {
            return currentWord + char;
        }
        // Two-consonant clusters keep both consonants with the new syllable.
        if (lastChar === 'ວ' && this.CONSONANT_V_LEADERS.has(secondLastChar)) {
            return this.handleConsonantVSequence(currentWord, char, resultArray);
        }
        if (lastChar === 'ຣ' && this.CONSONANT_R_LEADERS.has(secondLastChar)) {
            return this.handleConsonantRSequence(currentWord, char, resultArray);
        }
        if (secondLastChar === 'ຫ' && this.DIGRAPH_FOLLOWERS.has(lastChar)) {
            return this.handleDigraphSequence(currentWord, char, resultArray);
        }
        // A lone consonant before the mark is the initial of a new syllable.
        if (this.LAO_CONSONANTS.has(lastChar)) {
            return this.handleRegularMiddleChar(currentWord, char, resultArray);
        }
        return currentWord + char;
    }

    /**
     * Splits a Lao sentence into words based on syllable structure rules.
     *
     * Whitespace of any kind separates words and is never part of the output.
     * Runs of non-Lao characters (digits, Latin text) are kept together as one
     * word. Zero width spaces are removed before splitting.
     *
     * Examples:
     * split("ປະເທດລາວເປັນສິ່ງສວຍງາມ") => ["ປະ", "ເທດ", "ລາວ", "ເປັນ", "ສິ່ງ", "ສວຍ", "ງາມ"]
     * split("ຈົນກວ່າຈະ") => ["ຈົນ", "ກວ່າ", "ຈະ"]
     * split("ຫວຽດນາມ") => ["ຫວຽດ", "ນາມ"]
     * split("ພາສາລາວ 101") => ["ພາ", "ສາ", "ລາວ", "101"]
     * split("ແຂວງ") => ["ແຂວງ"]
     *
     * @param {string} sentence The Lao sentence to be segmented.
     * @returns {string[]} The segmented words; empty for empty or blank input.
     */
    split(sentence) {
        if (!sentence || sentence.trim().length === 0) {
            return [];
        }
        const text = this.removeZeroWidthSpaces(sentence);
        const words = [];
        let currentWord = '';
        for (let i = 0; i < text.length; i++) {
            const char = text[i];
            const lastChar = currentWord.length > 0 ? currentWord[currentWord.length - 1] : '';
            const lastCharWasLao = this.isLaoCharacter(lastChar);

            // Any whitespace (space, tab, newline, NBSP, ...) ends the current word.
            if (this.isWhitespace(char)) {
                currentWord = this.addWordToResult(currentWord, words);
                continue;
            }
            // Non-Lao characters: flush on a Lao -> non-Lao transition, else extend the run.
            if (!this.isLaoCharacter(char)) {
                currentWord = lastCharWasLao
                    ? this.addWordToResult(currentWord, words, char)
                    : currentWord + char;
                continue;
            }
            // The repetition mark is always its own word.
            if (char === this.MAI_YAMOK) {
                currentWord = this.addWordToResult(currentWord, words, this.MAI_YAMOK);
                continue;
            }
            // Leading vowels open a new word, except a doubled 'ເ' typed in place of 'ແ'.
            if (this.LEADING_VOWELS.has(char)) {
                const isDoubledE = char === 'ເ' && lastChar === 'ເ';
                currentWord = isDoubledE
                    ? currentWord + char
                    : this.addWordToResult(currentWord, words, char);
                continue;
            }
            // Non-Lao -> Lao transition without a separator ("RFAລາວ").
            if (!lastCharWasLao && currentWord.length > 0) {
                currentWord = this.addWordToResult(currentWord, words, char);
                continue;
            }
            if (this.MIDDLE_CHARS.has(char)) {
                currentWord = this.attachMiddleChar(currentWord, char, words);
                continue;
            }
            // 'ວ' / 'ອ' between two consonants acts as a vowel of a new syllable
            // whose initial is the preceding consonant (ສິ່ງສ + ວ + ຍ => ສິ່ງ | ສວຍ).
            if ((char === 'ວ' || char === 'ອ') && this.LAO_CONSONANTS.has(lastChar)) {
                const nextChar = i + 1 < text.length ? text[i + 1] : '';
                if (this.LAO_CONSONANTS.has(nextChar)) {
                    currentWord = this.handleVaOrOSequence(currentWord, char, words);
                    continue;
                }
            }
            // Default: the character continues the current word.
            currentWord += char;
        }
        // Flush whatever is left once the input is exhausted.
        if (currentWord.length > 0) {
            words.push(currentWord);
        }
        return words;
    }
}
// Publish the class on the CommonJS exports object under the name `LaoWordSplitter`.
exports.LaoWordSplitter = LaoWordSplitter;