lao-grammar-checker
Version:
A utility to check grammar and split Lao language sentences into individual words
278 lines (277 loc) • 12.8 kB
JavaScript
"use strict";
// src/laoGrammarCheck.ts
Object.defineProperty(exports, "__esModule", { value: true });
exports.laoGrammarChecker = laoGrammarChecker;
// Import the LaoWordSplitter and LaoWordInfo
const LaoWordSplitter_1 = require("./LaoWordSplitter");
// Character classes of the Lao script that the grammar rules test against.
// Where one class contains another, the smaller set is spread into the larger
// one so the shared members are written only once.

// Consonant letters.
const LAO_CONSONANTS = new Set([
    'ກ', 'ຂ', 'ຄ', 'ງ', 'ຈ', 'ສ', 'ຊ', 'ຍ', 'ດ', 'ຕ',
    'ຖ', 'ທ', 'ນ', 'ບ', 'ປ', 'ຜ', 'ຝ', 'ພ', 'ຟ', 'ມ',
    'ຢ', 'ຣ', 'ລ', 'ວ', 'ຫ', 'ອ', 'ຮ', 'ໜ', 'ໝ',
]);

// Leading vowels that Rule 3.4 singles out.
const LEADING_VOWELS_RULE3_4_ONLY = new Set(['ໂ', 'ໄ', 'ໃ']);

// Every leading vowel (Rule 1): 'ເ' and 'ແ' plus the Rule 3.4 subset.
const LEADING_VOWELS_RULE1_AND_3 = new Set(['ເ', 'ແ', ...LEADING_VOWELS_RULE3_4_ONLY]);

// Vowel signs written above the consonant (Rule 2).
const TOP_VOWELS_RULE2 = new Set(['ິ', 'ີ', 'ຶ', 'ື']);

// The two tone marks handled by Rule 3.
const TONE_MARKS_RULE3 = new Set(['່', '້']);

// Remaining diacritics and vowel signs (Rule 4); the top vowels are members too.
const OTHER_DIACRITICS_RULE4 = new Set([
    '໊', '໋', 'ໍ', '໌', 'ົ', 'ັ',
    ...TOP_VOWELS_RULE2,
    'ຸ', 'ູ', 'ຼ',
]);

// The vowel sign checked by Rule 5.
const VOWEL_IA_RULE5 = 'ຽ';

// The two vowels checked by Rule 6.
const VOWELS_A_AA_RULE6 = new Set(['ະ', 'າ']);

// --- Sets for rules 7-10 ---

// Rule 7, current-character side (no 'ຼ' and no 'ໍ').
const VOWELS_DIACRITICS_RULE7_CURRENT = new Set([
    ...VOWELS_A_AA_RULE6,
    ...TOP_VOWELS_RULE2,
    'ຸ', 'ູ', 'ັ', 'ົ',
]);

// Rule 7, next-character side (with 'ໍ' and 'ຼ').
const VOWELS_DIACRITICS_RULE7_NEXT = new Set([
    ...VOWELS_A_AA_RULE6,
    ...TOP_VOWELS_RULE2,
    'ຸ', 'ູ', 'ໍ', 'ຼ', 'ັ', 'ົ',
]);

// Rule 8: marks that may not appear twice in a row.
const VOWELS_DIACRITICS_RULE8 = new Set([
    ...VOWELS_A_AA_RULE6,
    ...TOP_VOWELS_RULE2,
    'ຸ', 'ູ', 'ໍ', 'ຼ', '໊', 'ັ', 'ົ',
    ...TONE_MARKS_RULE3,
    '໋', '໌',
    VOWEL_IA_RULE5,
]);

// Rule 9: tone marks plus '໋' and '໌'.
const TONES_CANCEL_RULE9 = new Set([...TONE_MARKS_RULE3, '໋', '໌']);

// Rule 10: the current characters the rule applies to.
const VOWELS_A_IA_RULE10 = new Set(['າ', VOWEL_IA_RULE5]);

// Rule 10.1: previous characters that invalidate the word ('ໍ' is deliberately absent).
const VOWELS_DIACRITICS_RULE10_1_PREV = new Set([
    ...VOWELS_A_AA_RULE6,
    ...TOP_VOWELS_RULE2,
    'ຸ', 'ູ', '໊', 'ັ', 'ົ', '໋', '໌',
    VOWEL_IA_RULE5,
]);

// Marks classed as countable vowels/diacritics; same members, in the same
// order, as VOWELS_DIACRITICS_RULE8. 'ໃ', 'ໄ' and 'ຳ' are not part of this set.
const COUNTABLE_VOWELS_DIACRITICS = new Set(VOWELS_DIACRITICS_RULE8);

// Rule 2.1: characters allowed directly before a top vowel.
const ALLOWED_PREV_FOR_TOP_VOWELS_RULE2 = new Set([...LAO_CONSONANTS, 'ເ', 'ແ', 'ຼ']);
/**
 * Tells whether a string starts with a Lao-script code unit.
 *
 * Only the first UTF-16 code unit of the argument is inspected; it must fall
 * inside the Lao Unicode block (U+0E80 through U+0EFF, both inclusive).
 * Falsy input (empty string, null, undefined) yields false.
 */
function isLaoCharacter(char) {
    const LAO_BLOCK_FIRST = 0x0E80;
    const LAO_BLOCK_LAST = 0x0EFF;
    if (char) {
        const unit = char.charCodeAt(0);
        return LAO_BLOCK_FIRST <= unit && unit <= LAO_BLOCK_LAST;
    }
    return false;
}
/**
 * Checks if a Lao word has correct grammatical structure.
 *
 * Cases decided before any per-character rule runs:
 *  - a single space (" ") is always valid;
 *  - an empty or missing word is invalid;
 *  - a word whose first character is outside the Lao Unicode block is valid;
 *  - a one-character Lao word is valid only when it is 'ໆ'.
 *
 * Every other word is scanned character by character against rules 1-10
 * below, and must additionally contain between one and four consonants.
 * Every rule can only reject the word, so the order in which the rules are
 * evaluated does not affect the result.
 *
 * @param word - The Lao word to check
 * @returns True if the word has correct grammar, false otherwise
 */
function checkLaoWordGrammar(word) {
    // Set.has(null) is false, so this is safe for the null "no neighbour" marker.
    const isConsonant = (ch) => LAO_CONSONANTS.has(ch);
    // A lone space token is always accepted.
    if (word === " ") {
        return true;
    }
    // Guard 0: empty or missing word.
    if (!word || word.length === 0) {
        return false;
    }
    // Guard 1: non-Lao words are considered valid and are not analysed.
    if (!isLaoCharacter(word[0])) {
        return true;
    }
    // Guard 2: the only valid one-character word is 'ໆ'. This also rejects a
    // lone 'ຳ', so no dedicated check for it is needed.
    if (word.length === 1) {
        return word === 'ໆ';
    }
    // Guard 3: two-character words.
    if (word.length === 2) {
        // 3.1 Two consonants in a row.
        if (isConsonant(word[0]) && isConsonant(word[1])) {
            return false;
        }
        // 3.2 The same character twice (excluding 'ເເ').
        if (word[0] === word[1] && word !== 'ເເ') {
            return false;
        }
    }
    // Guard 4: three-character words made of one repeated character.
    if (word.length === 3 && word[0] === word[1] && word[1] === word[2]) {
        return false;
    }
    let consonantCount = 0;
    for (let i = 0; i < word.length; i++) {
        const currentChar = word[i];
        // Neighbours; null marks a position outside the word.
        const previousChar = i > 0 ? word[i - 1] : null;
        const previousChar2 = i > 1 ? word[i - 2] : null;
        const previousChar3 = i > 2 ? word[i - 3] : null;
        const nextChar = i < word.length - 1 ? word[i + 1] : null;
        const nextChar2 = i < word.length - 2 ? word[i + 2] : null;
        // Consonant look-ups shared by several rules.
        const prevIsConsonant = isConsonant(previousChar);
        const nextIsConsonant = isConsonant(nextChar);
        const followedByTwoConsonants = nextIsConsonant && isConsonant(nextChar2);
        if (isConsonant(currentChar)) {
            consonantCount++;
        }
        // --- Rules 1-6: at most one applies, selected by the current character ---
        // Rule 1: leading vowels ['ເ', 'ແ', 'ໂ', 'ໄ', 'ໃ']
        if (LEADING_VOWELS_RULE1_AND_3.has(currentChar)) {
            if (i !== 0) {
                // 1.1 Must be at index 0; the only exception is the second
                // 'ເ' of a word-initial 'ເເ'.
                const isAllowedDoubleE = currentChar === 'ເ' && i === 1 && previousChar === 'ເ';
                if (!isAllowedDoubleE) {
                    return false;
                }
            }
            else {
                // 1.2 At index 0 the next char must be a consonant
                // (or 'ເ' when the current char is 'ເ').
                const startsDoubleE = currentChar === 'ເ' && nextChar === 'ເ';
                if (!nextIsConsonant && !startsDoubleE) {
                    return false;
                }
            }
        }
        // Rule 2: top vowels [ິ ີ ຶ ື]
        else if (TOP_VOWELS_RULE2.has(currentChar)) {
            // 2.1 Previous char must be a consonant or one of [ເ, ແ, ຼ].
            if (!ALLOWED_PREV_FOR_TOP_VOWELS_RULE2.has(previousChar)) {
                return false;
            }
        }
        // Rule 3: tone marks [່ ້]
        else if (TONE_MARKS_RULE3.has(currentChar)) {
            // 3.1 One of the two preceding chars must be a consonant.
            if (!prevIsConsonant && !isConsonant(previousChar2)) {
                return false;
            }
            // 3.2 A word of at most two characters may not end in a tone mark.
            if (word.length <= 2 && !nextChar) {
                return false;
            }
            // 3.3 [ໂ, ໄ, ໃ] + ... + tone mark + consonant + consonant.
            if (LEADING_VOWELS_RULE3_4_ONLY.has(word[0]) && followedByTwoConsonants) {
                return false;
            }
            // 3.4 [ເ, ແ] + ... + tone mark + consonant + consonant, unless
            // the tone mark sits directly on a top vowel.
            const isLeadingEorAE = word[0] === 'ເ' || word[0] === 'ແ';
            const prevIsTopVowel = TOP_VOWELS_RULE2.has(previousChar);
            if (isLeadingEorAE && followedByTwoConsonants && !prevIsTopVowel) {
                return false;
            }
            // 3.5 Three-character word: consonant + tone mark + consonant.
            if (word.length === 3 && prevIsConsonant && nextIsConsonant) {
                return false;
            }
        }
        // Rule 4: [໊ ໋ ໍ ໌ ົ ັ ຸ ູ ຼ]. The top vowels in this set never reach
        // this branch because Rule 2 has already claimed them.
        else if (OTHER_DIACRITICS_RULE4.has(currentChar)) {
            // 4.1 Previous char must be a consonant.
            if (!prevIsConsonant) {
                return false;
            }
        }
        // Rule 5: 'ຽ'
        else if (currentChar === VOWEL_IA_RULE5) {
            // 5.1 Next char must be a consonant.
            if (!nextIsConsonant) {
                return false;
            }
        }
        // Rule 6: [ະ າ]
        else if (VOWELS_A_AA_RULE6.has(currentChar)) {
            // 6.1 One of the three preceding chars must be a consonant.
            if (!prevIsConsonant && !isConsonant(previousChar2) && !isConsonant(previousChar3)) {
                return false;
            }
        }
        // --- Rules 7-10: evaluated for every character ---
        // Rule 7: a vowel/diacritic directly followed by another one
        // (the two sides use different sets).
        if (VOWELS_DIACRITICS_RULE7_CURRENT.has(currentChar) && VOWELS_DIACRITICS_RULE7_NEXT.has(nextChar)) {
            return false;
        }
        // Rule 8: the same vowel/diacritic twice in a row. 'ເ' is not a member
        // of the set, so a doubled 'ເ' is never rejected here.
        if (VOWELS_DIACRITICS_RULE8.has(currentChar) && currentChar === nextChar) {
            return false;
        }
        // Rule 9: two tone/cancellation marks in a row.
        if (TONES_CANCEL_RULE9.has(currentChar) && TONES_CANCEL_RULE9.has(nextChar)) {
            return false;
        }
        // Rule 10: [າ, ຽ]
        if (VOWELS_A_IA_RULE10.has(currentChar)) {
            // 10.1 Must not directly follow another vowel/diacritic ('ໍ' is allowed).
            if (VOWELS_DIACRITICS_RULE10_1_PREV.has(previousChar)) {
                return false;
            }
            // 10.2 Must not be followed by two consonants.
            if (followedByTwoConsonants) {
                return false;
            }
        }
    }
    // --- Post-loop check: the word needs between one and four consonants ---
    return consonantCount >= 1 && consonantCount <= 4;
}
/**
 * Runs the word-level grammar check over every piece of a Lao sentence.
 *
 * The sentence is first handed to a fresh LaoWordSplitter; each entry the
 * splitter returns is then copied and extended with a `grammarCorrect` flag
 * holding the result of checkLaoWordGrammar for that entry's `word`.
 *
 * @param sentence - The Lao sentence to check
 * @returns One result per splitter entry: the entry's own fields plus `grammarCorrect`
 */
function laoGrammarChecker(sentence) {
    // Copy a splitter entry and attach its grammar verdict.
    const annotate = (entry) => {
        const grammarCorrect = checkLaoWordGrammar(entry.word);
        return { ...entry, grammarCorrect };
    };
    return new LaoWordSplitter_1.LaoWordSplitter().split(sentence).map(annotate);
}