lao-word-splitter
Version:
A utility to split Lao language sentences into individual words
103 lines (102 loc) • 4.13 kB
TypeScript
/**
* Splits Lao text into individual words.
*
* Per the original documentation, this class handles Lao-specific rules for
* word segmentation, including consonant clusters, leading vowels, and special
* character sequences.
*
* NOTE(review): this is a declaration only. The private members below carry no
* type information and no implementation is visible here, so the helper
* descriptions are taken from their documented examples rather than from code.
* The only public entry point is `split`.
*/
export declare class LaoWordSplitter {
/** Presumably the set of Lao consonant characters; type and contents are not shown here. */
private readonly LAO_CONSONANTS;
/** Presumably the vowels written before their consonant (e.g. 'ເ'); verify against the implementation. */
private readonly LEADING_VOWELS;
/** Presumably the characters that can only appear mid-syllable (vowel signs, tone marks); verify against the implementation. */
private readonly MIDDLE_CHARS;
/** Presumably the consonants that may follow 'ຫ' to form a digraph; verify against the implementation. */
private readonly DIGRAPH_FOLLOWERS;
/** Presumably the Lao repetition mark (mai yamok, 'ໆ'); verify against the implementation. */
private readonly MAI_YAMOK;
/**
* Checks whether a character is part of the Lao script.
*
* Examples:
* isLaoCharacter('ກ') => true
* isLaoCharacter('a') => false
* isLaoCharacter('1') => false
*/
private isLaoCharacter;
/**
* Helper that appends the current word to the result array and starts a new word.
*
* In the examples the arguments are (current word, result array, first
* character of the next word); the value after `=>` is the new current word,
* and `words` shows the result array after the call.
*
* Examples:
* addWordToResult('ປະ', words, 'ເ') => 'ເ' and words = ['ປະ']
* addWordToResult('ລາວ', words, 'ເ') => 'ເ' and words = ['ປະ', 'ລາວ']
*/
private addWordToResult;
/**
* Helper that handles consonant + 'ວ' sequences.
*
* In the examples the arguments are (current word, incoming character, result
* array). When the current word holds more than the consonant + 'ວ' pair, the
* leading part is pushed to `words` and the pair continues as the new word.
*
* Examples:
* "ຈົນກວ່າຈະ" when encountering '່' after 'ກວ':
* handleConsonantVSequence('ຈົນກວ', '່', words) => 'ກວ່' and words = ['ຈົນ']
*
* "ຄວາມຮັກ" when encountering 'າ' after 'ຄວ':
* handleConsonantVSequence('ຄວ', 'າ', words) => 'ຄວາ' and words = []
*
* "ຂວາງາມ" when encountering 'າ' after 'ຂວ':
* handleConsonantVSequence('ຂວ', 'າ', words) => 'ຂວາ' and words = []
*/
private handleConsonantVSequence;
/**
* Helper that handles consonant + 'ຣ' sequences.
*
* In the examples the incoming character is appended to the consonant + 'ຣ'
* pair and nothing is pushed to `words`.
*
* Examples:
* "ທຣັມ" when encountering 'ັ' after 'ຣ':
* handleConsonantRSequence('ທຣ', 'ັ', words) => 'ທຣັ' and words = []
*
* "ປຣິນເຕີ" when encountering 'ິ' after 'ຣ':
* handleConsonantRSequence('ປຣ', 'ິ', words) => 'ປຣິ' and words = []
*
* "ກຣາມ" when encountering 'າ' after 'ຣ':
* handleConsonantRSequence('ກຣ', 'າ', words) => 'ກຣາ' and words = []
*/
private handleConsonantRSequence;
/**
* Helper that handles digraphs beginning with 'ຫ'.
*
* Examples:
* "ຫວານຫລາຍ" when encountering 'າ' after 'ຫວ':
* handleDigraphSequence('ຫວ', 'າ', words) => 'ຫວາ' and words = []
*/
private handleDigraphSequence;
/**
* Helper that handles regular middle-character processing.
*
* In the example the last consonant of the current word is split off to start
* the new word together with the incoming character, and the remainder is
* pushed to `words`.
*
* Examples:
* "ເທດລາວ" when encountering 'າ' after 'ລ':
* handleRegularMiddleChar('ເທດລ', 'າ', words) => 'ລາ' and words = ['ເທດ']
*/
private handleRegularMiddleChar;
/**
* Helper that handles 'ວ' or 'ອ' appearing between consonants.
*
* Examples:
* "ສຶ່ງສວຍງາມ" when encountering "ວ" after "ສ" and followed by "ຍ":
* handleVaOrOSequence('ສຶ່ງສ', 'ວ', words) => 'ສວ' and words = ['ສຶ່ງ']
*/
private handleVaOrOSequence;
/**
* Removes zero-width spaces from the input.
*
* NOTE(review): zero-width characters are invisible, so the input and output
* in the example below look identical when rendered. Presumably this targets
* U+200B; confirm the exact code points against the implementation.
*
* Examples:
* removeZeroWidthSpaces("ສະບາຍດີ") => "ສະບາຍດີ"
*/
private removeZeroWidthSpaces;
/**
* Splits a Lao language sentence into individual words based on syllable structure rules.
*
* @param sentence The Lao sentence to be segmented.
* @returns An array of segmented words.
*
* Per the last example, a run of non-Lao characters such as digits is returned
* as its own element and the separating space is not included in the output.
*
* Examples:
* split("ປະເທດລາວເປັນສິ່ງສວຍງາມ") => ["ປະ", "ເທດ", "ລາວ", "ເປັນ", "ສິ່ງ", "ສວຍ", "ງາມ"]
* split("ຈົນກວ່າຈະ") => ["ຈົນ", "ກວ່າ", "ຈະ"]
* split("ຫວຽດນາມ") => ["ຫວຽດ", "ນາມ"]
* split("ພາສາລາວ 101") => ["ພາ", "ສາ", "ລາວ", "101"]
*/
split(sentence: string): string[];
}