UNPKG

lao-grammar-checker

Version:

A utility to check grammar and split Lao language sentences into individual words

154 lines (153 loc) 6.29 kB
/**
 * One segmented word together with its position in the text.
 *
 * Both indices are zero-based and `endIndex` is inclusive: in the examples
 * documented on `LaoWordSplitter.split`, a single space is reported with
 * `startIndex: 4, endIndex: 4`, and the two-character word "ປະ" with
 * `startIndex: 0, endIndex: 1`.
 */
export interface LaoWordInfo {
    /** The text of the segmented word (a space is reported as its own word). */
    word: string;
    /** Zero-based index of the word's first character. */
    startIndex: number;
    /** Zero-based index of the word's last character (inclusive). */
    endIndex: number;
}

/**
 * Splits Lao text into individual words.
 *
 * This class handles various Lao language specific rules for word
 * segmentation, including consonant clusters, leading vowels, and special
 * character sequences.
 *
 * Only `split` is public. The private members below are listed without
 * types or signatures because this is a declaration file; the call shapes
 * described in their comments are taken from the documented examples only.
 */
export declare class LaoWordSplitter {
    // Private character tables. Their values are not visible in this
    // declaration; presumably they drive the segmentation rules named in the
    // helper methods below — confirm against the implementation.
    private readonly LAO_CONSONANTS;
    private readonly LEADING_VOWELS;
    private readonly MIDDLE_CHARS;
    private readonly DIGRAPH_FOLLOWERS;
    private readonly MAI_YAMOK;

    /**
     * Checks if a character is part of Lao script.
     *
     * Examples:
     *   isLaoCharacter('ກ') => true
     *   isLaoCharacter('a') => false
     *   isLaoCharacter('1') => false
     */
    private isLaoCharacter;

    /**
     * Helper that adds a finished word to the result array and starts a new
     * word. Also tracks the position of words in the original text.
     *
     * In the examples it is called with the finished word, the result array,
     * two indices and the character that starts the next word; it returns the
     * new word and its start index.
     *
     * Examples:
     *   addWordToResult('ປະ', words, 2, 0, 'ເ') =>
     *     returns { word: 'ເ', startIndex: 2 } and
     *     words = [{ word: 'ປະ', startIndex: 0, endIndex: 1 }]
     *
     *   addWordToResult('ລາວ', words, 8, 5, 'ເ') =>
     *     returns { word: 'ເ', startIndex: 8 } and
     *     words = [..., { word: 'ລາວ', startIndex: 5, endIndex: 7 }]
     */
    private addWordToResult;

    /**
     * Helper that handles consonant + 'ວ' sequences.
     * Also tracks the position of words in the original text.
     *
     * In the examples it is called with the word built so far, the incoming
     * character, the result array and two indices; it returns the word to
     * continue with and its start index, and may push a completed word.
     *
     * Examples:
     *   "ຈົນກວ່າຈະ" when encountering '່' after 'ກວ':
     *   handleConsonantVSequence('ຈົນກວ', '່', words, 5, 0) =>
     *     returns { word: 'ກວ່', startIndex: 3 } and
     *     words = [{ word: 'ຈົນ', startIndex: 0, endIndex: 2 }]
     *
     *   "ຄວາມຮັກ" when encountering 'າ' after 'ຄວ':
     *   handleConsonantVSequence('ຄວ', 'າ', words, 2, 0) =>
     *     returns { word: 'ຄວາ', startIndex: 0 } and words = []
     *
     *   "ຂວາງາມ" when encountering 'າ' after 'ຂວ':
     *   handleConsonantVSequence('ຂວ', 'າ', words, 2, 0) =>
     *     returns { word: 'ຂວາ', startIndex: 0 } and words = []
     */
    private handleConsonantVSequence;

    /**
     * Helper that handles consonant + 'ຣ' sequences.
     * Also tracks the position of words in the original text.
     *
     * The examples use the same call shape as handleConsonantVSequence.
     *
     * Examples:
     *   "ທຣັມ" when encountering 'ັ' after 'ຣ':
     *   handleConsonantRSequence('ທຣ', 'ັ', words, 2, 0) =>
     *     returns { word: 'ທຣັ', startIndex: 0 } and words = []
     *
     *   "ປຣິນເຕີ" when encountering 'ິ' after 'ຣ':
     *   handleConsonantRSequence('ປຣ', 'ິ', words, 2, 0) =>
     *     returns { word: 'ປຣິ', startIndex: 0 } and words = []
     *
     *   "ກຣາມ" when encountering 'າ' after 'ຣ':
     *   handleConsonantRSequence('ກຣ', 'າ', words, 2, 0) =>
     *     returns { word: 'ກຣາ', startIndex: 0 } and words = []
     */
    private handleConsonantRSequence;

    /**
     * Helper that handles digraphs with 'ຫ'.
     * Also tracks the position of words in the original text.
     *
     * Examples:
     *   "ຫວານຫລາຍ" when encountering 'າ' after 'ຫວ':
     *   handleDigraphSequence('ຫວ', 'າ', words, 2, 0) =>
     *     returns { word: 'ຫວາ', startIndex: 0 } and words = []
     *
     *   "ຫລັງຈາກ" when encountering 'ັ' after 'ຫລ':
     *   handleDigraphSequence('ຫລ', 'ັ', words, 2, 0) =>
     *     returns { word: 'ຫລັ', startIndex: 0 } and words = []
     */
    private handleDigraphSequence;

    /**
     * Helper that handles regular middle character processing.
     * Also tracks the position of words in the original text.
     *
     * Examples:
     *   "ເທດລາວ" when encountering 'າ' after 'ລ':
     *   handleRegularMiddleChar('ເທດລ', 'າ', words, 5, 0) =>
     *     returns { word: 'ລາ', startIndex: 4 } and
     *     words = [{ word: 'ເທດ', startIndex: 0, endIndex: 3 }]
     *
     *   "ພາສາ" when encountering 'າ' after 'ສ':
     *   handleRegularMiddleChar('ພາສ', 'າ', words, 3, 0) =>
     *     returns { word: 'ສາ', startIndex: 2 } and
     *     words = [{ word: 'ພາ', startIndex: 0, endIndex: 1 }]
     *
     * NOTE(review): the indices in the first example (5 / 4 / 3) do not match
     * the zero-based, inclusive convention used by the second example and by
     * the other helpers; for the six-character input "ເທດລາວ" that convention
     * would give 4 / 3 / 2. Confirm against the implementation.
     */
    private handleRegularMiddleChar;

    /**
     * Helper that handles 'ວ' or 'ອ' between consonants.
     * Also tracks the position of words in the original text.
     *
     * Examples:
     *   "ສຶ່ງສວຍງາມ" when encountering "ວ" after "ສ" and followed by "ຍ":
     *   handleVaOrOSequence('ສຶ່ງສ', 'ວ', words, 5, 0) =>
     *     returns { word: 'ສວ', startIndex: 4 } and
     *     words = [{ word: 'ສຶ່ງ', startIndex: 0, endIndex: 3 }]
     *
     *   "ຂອງກິນ" when encountering "ອ" after "ຂ" and followed by "ງ":
     *   handleVaOrOSequence('ຂ', 'ອ', words, 1, 0) =>
     *     returns { word: 'ຂອ', startIndex: 0 } and words = []
     */
    private handleVaOrOSequence;

    /**
     * Removes zero width spaces from a string.
     *
     * Example (the invisible zero width space is written as \u200B here so
     * that it is readable):
     *   removeZeroWidthSpaces("ສະ\u200Bບາຍ\u200Bດີ") => "ສະບາຍດີ"
     */
    private removeZeroWidthSpaces;

    /**
     * Splits a Lao language sentence into individual words based on syllable
     * structure rules, and reports the start and end indices of each word.
     * Space characters are treated as separate words.
     *
     * NOTE(review): this class also declares removeZeroWidthSpaces; whether
     * the reported indices refer to the text before or after zero width
     * spaces are removed is not visible in this declaration — confirm.
     *
     * @param sentence The Lao sentence to be segmented.
     * @returns An array of segmented words with their indices.
     *
     * @example
     * split("ປະເທດລາວ") => [
     *   { word: "ປະ", startIndex: 0, endIndex: 1 },
     *   { word: "ເທດ", startIndex: 2, endIndex: 4 },
     *   { word: "ລາວ", startIndex: 5, endIndex: 7 }
     * ]
     *
     * @example
     * split("ພາສາ ລາວ") => [
     *   { word: "ພາ", startIndex: 0, endIndex: 1 },
     *   { word: "ສາ", startIndex: 2, endIndex: 3 },
     *   { word: " ", startIndex: 4, endIndex: 4 },
     *   { word: "ລາວ", startIndex: 5, endIndex: 7 }
     * ]
     */
    split(sentence: string): LaoWordInfo[];
}