lao-grammar-checker
Version:
A utility to check grammar and split Lao language sentences into individual words
164 lines (163 loc) • 6.68 kB
TypeScript
/**
* Type declarations for the Lao word splitter.
* The LaoWordSplitter class splits Lao text into individual words.
* It handles various Lao language specific rules for word segmentation,
* including consonant clusters, leading vowels, and special character sequences.
*/

/**
* One segmented word together with its position in the text it was taken from.
*
* Going by the examples documented on LaoWordSplitter.split, both indices are
* zero-based offsets into the input string and `endIndex` is inclusive: it is
* the index of the word's last character, so
* `endIndex - startIndex + 1 === word.length` in those examples.
*/
export interface LaoWordInfo {
/** The text of the segmented word. */
word: string;
/** Zero-based index of the word's first character in the original text. */
startIndex: number;
/** Zero-based index of the word's last character in the original text (inclusive, per the documented examples). */
endIndex: number;
}
/**
* Splits Lao text into individual words.
*
* The documented helpers cover Lao-specific segmentation rules: leading vowels,
* consonant + 'ວ' and consonant + 'ຣ' sequences, digraphs starting with 'ຫ',
* and 'ວ' / 'ອ' appearing between consonants.
*
* This is a declaration only. Private members are listed without types or
* signatures, so the notes on them below are based on their names and on the
* documented examples, not on the implementation.
*/
export declare class LaoWordSplitter {
/*
* Private character tables used by the segmentation rules. Their values and
* types are not visible in this declaration; the descriptions are inferred
* from the names only -- TODO confirm against the implementation.
*/
/** Presumably the Lao consonant characters. */
private readonly LAO_CONSONANTS;
/** Presumably the vowels written before their consonant (e.g. 'ເ' in the examples below). */
private readonly LEADING_VOWELS;
/** Presumably the vowel / tone marks that attach to a preceding consonant (e.g. 'າ', 'ັ', '່' in the examples below). */
private readonly MIDDLE_CHARS;
/** Presumably the consonants that may follow 'ຫ' to form a digraph (e.g. 'ວ', 'ລ' in the examples below). */
private readonly DIGRAPH_FOLLOWERS;
/** Presumably the repetition mark 'ໆ' -- TODO confirm. */
private readonly MAI_YAMOK;
/**
* Checks if a character is part of Lao script
*
* Examples:
* isLaoCharacter('ກ') => true
* isLaoCharacter('a') => false
* isLaoCharacter('1') => false
*/
private isLaoCharacter;
/**
* Helper function to add a word to the result array and start a new word
* Also tracks the position of words in the original text
*
* Argument order as used in the examples (the signature itself is not visible
* here): finished word, result array, index of the new character, start index
* of the finished word, new character. The finished word is recorded with an
* inclusive endIndex of (index - 1) and the returned state starts at index.
*
* Examples:
* addWordToResult('ປະ', words, 2, 0, 'ເ') =>
* returns { word: 'ເ', startIndex: 2 } and
* words = [{ word: 'ປະ', startIndex: 0, endIndex: 1 }]
*
* addWordToResult('ລາວ', words, 8, 5, 'ເ') =>
* returns { word: 'ເ', startIndex: 8 } and
* words = [..., { word: 'ລາວ', startIndex: 5, endIndex: 7 }]
*/
private addWordToResult;
/**
* Helper function to handle consonant + 'ວ' sequences
* Also tracks the position of words in the original text
*
* Argument order as used in the examples (signature not visible here):
* current word, incoming character, result array, index of the incoming
* character, start index of the current word.
*
* Examples:
* "ຈົນກວ່າຈະ" when encountering '່' after 'ກວ':
* handleConsonantVSequence('ຈົນກວ', '່', words, 5, 0) =>
* returns { word: 'ກວ່', startIndex: 3 } and
* words = [{ word: 'ຈົນ', startIndex: 0, endIndex: 2 }]
*
* "ຄວາມຮັກ" when encountering 'າ' after 'ຄວ':
* handleConsonantVSequence('ຄວ', 'າ', words, 2, 0) =>
* returns { word: 'ຄວາ', startIndex: 0 } and words = []
*
* "ຂວາງາມ" when encountering 'າ' after 'ຂວ':
* handleConsonantVSequence('ຂວ', 'າ', words, 2, 0) =>
* returns { word: 'ຂວາ', startIndex: 0 } and words = []
*/
private handleConsonantVSequence;
/**
* Helper function to handle consonant + 'ຣ' sequences
* Also tracks the position of words in the original text
*
* In every example below the cluster is the whole current word, so the
* incoming character is appended and nothing is pushed to the result array.
*
* Examples:
* "ທຣັມ" when encountering 'ັ' after 'ຣ':
* handleConsonantRSequence('ທຣ', 'ັ', words, 2, 0) =>
* returns { word: 'ທຣັ', startIndex: 0 } and words = []
*
* "ປຣິນເຕີ" when encountering 'ິ' after 'ຣ':
* handleConsonantRSequence('ປຣ', 'ິ', words, 2, 0) =>
* returns { word: 'ປຣິ', startIndex: 0 } and words = []
*
* "ກຣາມ" when encountering 'າ' after 'ຣ':
* handleConsonantRSequence('ກຣ', 'າ', words, 2, 0) =>
* returns { word: 'ກຣາ', startIndex: 0 } and words = []
*/
private handleConsonantRSequence;
/**
* Helper function to handle digraphs with 'ຫ'
* Also tracks the position of words in the original text
*
* In the examples below the digraph is the whole current word, so the
* incoming character is appended and nothing is pushed to the result array.
*
* Examples:
* "ຫວານຫລາຍ" when encountering 'າ' after 'ຫວ':
* handleDigraphSequence('ຫວ', 'າ', words, 2, 0) =>
* returns { word: 'ຫວາ', startIndex: 0 } and words = []
*
* "ຫລັງຈາກ" when encountering 'ັ' after 'ຫລ':
* handleDigraphSequence('ຫລ', 'ັ', words, 2, 0) =>
* returns { word: 'ຫລັ', startIndex: 0 } and words = []
*/
private handleDigraphSequence;
/**
* Helper function to handle regular middle character processing
* Also tracks the position of words in the original text
*
* In the examples the last consonant of the current word is split off to
* start a new word with the incoming character; everything before it is
* pushed as a finished word whose inclusive endIndex is (index - 2).
*
* Examples:
* "ເທດລາວ" when encountering 'າ' (index 4) after 'ລ':
* handleRegularMiddleChar('ເທດລ', 'າ', words, 4, 0) =>
* returns { word: 'ລາ', startIndex: 3 } and
* words = [{ word: 'ເທດ', startIndex: 0, endIndex: 2 }]
*
* "ພາສາ" when encountering 'າ' after 'ສ':
* handleRegularMiddleChar('ພາສ', 'າ', words, 3, 0) =>
* returns { word: 'ສາ', startIndex: 2 } and
* words = [{ word: 'ພາ', startIndex: 0, endIndex: 1 }]
*/
private handleRegularMiddleChar;
/**
* Helper function to handle 'ວ' or 'ອ' between consonants
* Also tracks the position of words in the original text
*
* Examples:
* "ສຶ່ງສວຍງາມ" when encountering "ວ" after "ສ" and followed by "ຍ":
* handleVaOrOSequence('ສຶ່ງສ', 'ວ', words, 5, 0) =>
* returns { word: 'ສວ', startIndex: 4 } and
* words = [{ word: 'ສຶ່ງ', startIndex: 0, endIndex: 3 }]
*
* "ຂອງກິນ" when encountering "ອ" after "ຂ" and followed by "ງ":
* handleVaOrOSequence('ຂ', 'ອ', words, 1, 0) =>
* returns { word: 'ຂອ', startIndex: 0 } and words = []
*/
private handleVaOrOSequence;
/**
* Remove Zero Width Spaces
*
* NOTE(review): the input in the example presumably contained invisible
* zero-width space characters (U+200B) that are not visible as rendered here,
* which is why both sides look identical -- confirm against the implementation
* which code points are actually stripped.
*
* Examples:
* removeZeroWidthSpaces("ສະບາຍດີ") => "ສະບາຍດີ"
*/
private removeZeroWidthSpaces;
/**
* Splits a Lao language sentence into individual words based on syllable structure rules.
* Also provides the start and end indices of each word in the original text.
*
* In the examples below `endIndex` is inclusive (the index of the word's last
* character). In the last example the space is not reported, while the
* non-Lao run "101" is returned as a word of its own.
*
* @param sentence The Lao sentence to be segmented.
* @returns An array of segmented words with their indices.
*
* Examples:
* split("ປະເທດລາວ") => [
* { word: "ປະ", startIndex: 0, endIndex: 1 },
* { word: "ເທດ", startIndex: 2, endIndex: 4 },
* { word: "ລາວ", startIndex: 5, endIndex: 7 }
* ]
*
* split("ຈົນກວ່າຈະ") => [
* { word: "ຈົນ", startIndex: 0, endIndex: 2 },
* { word: "ກວ່າ", startIndex: 3, endIndex: 6 },
* { word: "ຈະ", startIndex: 7, endIndex: 8 }
* ]
*
* split("ຫວຽດນາມ") => [
* { word: "ຫວຽດ", startIndex: 0, endIndex: 3 },
* { word: "ນາມ", startIndex: 4, endIndex: 6 }
* ]
*
* split("ພາສາລາວ 101") => [
* { word: "ພາ", startIndex: 0, endIndex: 1 },
* { word: "ສາ", startIndex: 2, endIndex: 3 },
* { word: "ລາວ", startIndex: 4, endIndex: 6 },
* { word: "101", startIndex: 8, endIndex: 10 }
* ]
*/
split(sentence: string): LaoWordInfo[];
}