UNPKG

lao-word-splitter

Version:

A utility to split Lao language sentences into individual words

103 lines (102 loc) 4.13 kB
/**
 * Word segmenter for Lao-script text.
 *
 * Handles Lao-specific segmentation rules, including consonant clusters,
 * leading vowels, and special character sequences. `split` is the only
 * public member; everything else is private and is declared here without a
 * type or signature, so the argument roles described below are taken from
 * the documented examples only.
 */
export declare class LaoWordSplitter {
    // Private read-only data used by the segmentation rules. Their types and
    // contents are not visible in this declaration; judging by the names they
    // are presumably character tables/constants (verify in the implementation).
    private readonly LAO_CONSONANTS;
    private readonly LEADING_VOWELS;
    private readonly MIDDLE_CHARS;
    private readonly DIGRAPH_FOLLOWERS;
    private readonly MAI_YAMOK;

    /**
     * Tells whether a character belongs to the Lao script.
     *
     * @example
     * isLaoCharacter('ກ') // => true
     * isLaoCharacter('a') // => false
     * isLaoCharacter('1') // => false
     */
    private isLaoCharacter;

    /**
     * Pushes a finished word onto the result array and begins a new word.
     *
     * In the examples the arguments are the finished word, the result array
     * and the character that starts the next word; the return value is the
     * new word in progress.
     *
     * @example
     * addWordToResult('ປະ', words, 'ເ')  // => 'ເ', words = ['ປະ']
     * addWordToResult('ລາວ', words, 'ເ') // => 'ເ', words = ['ປະ', 'ລາວ']
     */
    private addWordToResult;

    /**
     * Handles a consonant followed by 'ວ'.
     *
     * @example
     * // "ຈົນກວ່າຈະ", on reaching '່' after 'ກວ':
     * handleConsonantVSequence('ຈົນກວ', '່', words) // => 'ກວ່', words = ['ຈົນ']
     *
     * // "ຄວາມຮັກ", on reaching 'າ' after 'ຄວ':
     * handleConsonantVSequence('ຄວ', 'າ', words) // => 'ຄວາ', words = []
     *
     * // "ຂວາງາມ", on reaching 'າ' after 'ຂວ':
     * handleConsonantVSequence('ຂວ', 'າ', words) // => 'ຂວາ', words = []
     */
    private handleConsonantVSequence;

    /**
     * Handles a consonant followed by 'ຣ'.
     *
     * @example
     * // "ທຣັມ", on reaching 'ັ' after 'ຣ':
     * handleConsonantRSequence('ທຣ', 'ັ', words) // => 'ທຣັ', words = []
     *
     * // "ປຣິນເຕີ", on reaching 'ິ' after 'ຣ':
     * handleConsonantRSequence('ປຣ', 'ິ', words) // => 'ປຣິ', words = []
     *
     * // "ກຣາມ", on reaching 'າ' after 'ຣ':
     * handleConsonantRSequence('ກຣ', 'າ', words) // => 'ກຣາ', words = []
     */
    private handleConsonantRSequence;

    /**
     * Handles digraphs that begin with 'ຫ'.
     *
     * @example
     * // "ຫວານຫລາຍ", on reaching 'າ' after 'ຫວ':
     * handleDigraphSequence('ຫວ', 'າ', words) // => 'ຫວາ', words = []
     */
    private handleDigraphSequence;

    /**
     * Handles the regular (non-special-case) processing of a middle character.
     *
     * @example
     * // "ເທດລາວ", on reaching 'າ' after 'ລ':
     * handleRegularMiddleChar('ເທດລ', 'າ', words) // => 'ລາ', words = ['ເທດ']
     */
    private handleRegularMiddleChar;

    /**
     * Handles 'ວ' or 'ອ' appearing between consonants.
     *
     * @example
     * // "ສຶ່ງສວຍງາມ", on reaching "ວ" after "ສ" and followed by "ຍ":
     * handleVaOrOSequence('ສຶ່ງສ', 'ວ', words) // => 'ສວ', words = ['ສຶ່ງ']
     */
    private handleVaOrOSequence;

    /**
     * Removes zero width spaces from the text.
     *
     * @example
     * // The input literal is the original documentation's example of text
     * // containing zero width spaces.
     * removeZeroWidthSpaces("ສະ​ບາຍ​ດີ") // => "ສະບາຍດີ"
     */
    private removeZeroWidthSpaces;

    /**
     * Segments a Lao sentence into individual words using syllable structure
     * rules.
     *
     * @param sentence The Lao sentence to be segmented.
     * @returns An array of segmented words.
     *
     * @example
     * split("ປະເທດລາວເປັນສິ່ງສວຍງາມ") // => ["ປະ", "ເທດ", "ລາວ", "ເປັນ", "ສິ່ງ", "ສວຍ", "ງາມ"]
     * split("ຈົນກວ່າຈະ")             // => ["ຈົນ", "ກວ່າ", "ຈະ"]
     * split("ຫວຽດນາມ")              // => ["ຫວຽດ", "ນາມ"]
     * split("ພາສາລາວ 101")          // => ["ພາ", "ສາ", "ລາວ", "101"]
     */
    split(sentence: string): Array<string>;
}