// lao-grammar-checker
// Version:
// A utility to check grammar and split Lao language sentences into individual words
// 264 lines (263 loc) • 15.6 kB
// JavaScript
;
// src/laoGrammarCheck.ts
Object.defineProperty(exports, "__esModule", { value: true });
exports.laoGrammarCheck = laoGrammarCheck;
// Character classes of the Lao script that the structural rules test against.
// Overlapping sets are composed from the smaller ones so each literal is
// written only once; membership and insertion order match the explicit lists.

// Consonant letters.
const LAO_CONSONANTS = new Set([
    ...'ກຂຄງຈສຊຍດຕຖທນ',
    ...'ບປຜຝພຟມຢຣລວຫອຮ',
    ...'ໜໝ',
]);

// Rule 1: leading vowels.
const LEADING_VOWELS_RULE1 = new Set([...'ເແໂໄ']);

// Rule 2: top vowels (these are also members of the Rule 4 set).
const TOP_VOWELS_RULE2 = new Set(['ິ', 'ີ', 'ຶ', 'ື']);

// Rule 3: tone marks.
const TONE_MARKS_RULE3 = new Set(['່', '້']);

// Rule 4: other diacritics and vowel signs, including the Rule 2 vowels.
const OTHER_DIACRITICS_RULE4 = new Set([
    '໊', '໋', 'ໍ', '໌', 'ົ', 'ັ',
    ...TOP_VOWELS_RULE2,
    'ຸ', 'ູ', 'ຼ',
]);

// Rule 5: vowel IA.
const VOWEL_IA_RULE5 = 'ຽ';

// Rule 6: vowels A / AA.
const VOWELS_A_AA_RULE6 = new Set(['ະ', 'າ']);

// Vowel signs and diacritics eligible for the per-word vowel tally.
// The leading vowels (Rule 1) are deliberately left out: they are validated by
// position instead of being counted.
// NOTE(review): 'ໃ' and 'ຳ' do not appear in any set in this section, so no
// rule inspects them — confirm whether that is intentional.
const COUNTABLE_VOWELS_DIACRITICS = new Set([
    ...VOWELS_A_AA_RULE6,
    ...TOP_VOWELS_RULE2,
    'ຸ', 'ູ', 'ໍ', 'ຼ', '໊', 'ັ', 'ົ',
    ...TONE_MARKS_RULE3,
    '໋', '໌',
    VOWEL_IA_RULE5,
]);
/**
 * Tells whether a string starts with a character from the Lao Unicode block.
 * Only the first UTF-16 code unit is inspected.
 *
 * @param {string} char - The character (or string) to test.
 * @returns {boolean} true when the first code unit lies in U+0E80..U+0EFF;
 *   false for that range miss and for any falsy argument (empty string,
 *   null, undefined).
 */
function isLaoCharacter(char) {
    if (!char) {
        return false;
    }
    const LAO_BLOCK_FIRST = 0x0E80;
    const LAO_BLOCK_LAST = 0x0EFF;
    const codeUnit = char.charCodeAt(0);
    if (codeUnit < LAO_BLOCK_FIRST) {
        return false;
    }
    return codeUnit <= LAO_BLOCK_LAST;
}
/**
 * Reports whether `word` has a character at `index` and that character is a
 * Lao consonant. Out-of-range indexes yield false.
 */
function isLaoConsonantAt(word, index) {
    return index >= 0 && index < word.length && LAO_CONSONANTS.has(word[index]);
}
/**
 * Applies the positional rule (Rules 1-6) that governs the character at
 * `index`, if any.
 *
 * @param {string} word - The word being checked.
 * @param {number} index - Position of the character to validate.
 * @returns {boolean} true when the character breaks its rule; false when it
 *   satisfies it or when no rule applies to the character.
 */
function breaksLaoCharacterRule(word, index) {
    const currentChar = word[index];
    // True when at least one of the `reach` characters right before `index`
    // is a consonant.
    const hasConsonantWithin = (reach) => {
        for (let distance = 1; distance <= reach; distance++) {
            if (isLaoConsonantAt(word, index - distance)) {
                return true;
            }
        }
        return false;
    };
    // Rule 1: leading vowels must open the word and be followed by a consonant.
    // A doubled 'ເ' at positions 0 and 1 is tolerated.
    if (LEADING_VOWELS_RULE1.has(currentChar)) {
        const isVowelE = currentChar === 'ເ';
        if (index === 0) {
            const opensDoubleE = isVowelE && word[1] === 'ເ';
            return !isLaoConsonantAt(word, 1) && !opensDoubleE;
        }
        const closesDoubleE = isVowelE && index === 1 && word[0] === 'ເ';
        return !closesDoubleE;
    }
    // Rule 2: a top vowel must sit directly on a consonant.
    if (TOP_VOWELS_RULE2.has(currentChar)) {
        return !hasConsonantWithin(1);
    }
    // Rule 3: a tone mark needs a consonant one or two positions before it
    // (the second slot allows for a vowel sign between consonant and tone).
    if (TONE_MARKS_RULE3.has(currentChar)) {
        return !hasConsonantWithin(2);
    }
    // Rule 4: the remaining diacritics must directly follow a consonant.
    if (OTHER_DIACRITICS_RULE4.has(currentChar)) {
        return !hasConsonantWithin(1);
    }
    // Rule 5: 'ຽ' must be followed by a consonant.
    if (currentChar === VOWEL_IA_RULE5) {
        return !isLaoConsonantAt(word, index + 1);
    }
    // Rule 6: 'ະ' / 'າ' need a consonant within the three preceding characters.
    if (VOWELS_A_AA_RULE6.has(currentChar)) {
        return !hasConsonantWithin(3);
    }
    return false;
}
/**
 * Validates a multi-character Lao word: every character must satisfy its
 * positional rule and the word must contain between 1 and 4 consonants.
 */
function hasValidLaoStructure(word) {
    const MAX_CONSONANTS = 4;
    let consonantCount = 0;
    for (let i = 0; i < word.length; i++) {
        if (breaksLaoCharacterRule(word, i)) {
            return false;
        }
        if (LAO_CONSONANTS.has(word[i])) {
            consonantCount++;
        }
    }
    return consonantCount >= 1 && consonantCount <= MAX_CONSONANTS;
}
/**
 * Checks Lao words for basic structural violations (character order and
 * consonant count). It does not check vocabulary.
 *
 * Handling of each entry, in order:
 *  - empty, null/undefined, or non-string entries are ignored;
 *  - entries whose first character is not Lao are ignored, whatever their
 *    length (this test runs before the single-character test so that tokens
 *    such as "a", "1" or " " are skipped rather than reported);
 *  - a single Lao character is reported, except 'ໆ', which is accepted;
 *  - longer words are reported when a character breaks one of Rules 1-6, when
 *    they contain no consonant, or when they contain more than 4 consonants.
 *
 * @param {string[]} words - Words to check; expected to be already segmented.
 * @returns {string[]} The offending words, in input order.
 */
function laoGrammarCheck(words) {
    if (!words || words.length === 0) {
        return [];
    }
    const wrongWords = [];
    for (const word of words) {
        if (typeof word !== 'string' || word.length === 0) {
            continue;
        }
        if (!isLaoCharacter(word[0])) {
            continue;
        }
        if (word.length === 1) {
            if (word !== 'ໆ') {
                wrongWords.push(word);
            }
            continue;
        }
        if (!hasValidLaoStructure(word)) {
            wrongWords.push(word);
        }
    }
    return wrongWords;
}
// Example usage (can be removed or placed in a test file).
// Runs laoGrammarCheck over sample inputs and prints each input with the words
// it reports. The checker validates character structure only, not vocabulary,
// and it expects pre-segmented words: a leading vowel in the middle of an
// unsplit phrase is reported by Rule 1.
const demoInputs = [
    // 1. Three well-formed words: nothing is reported.
    ["ປະ", "ເທດ", "ລາວ"],
    // 2. Mixed bag. By the rules as written the reported words are:
    //    "ປ", "ເ", "ນ"  -> single Lao character
    //    "ກິນເຂົ້າ"      -> unsplit phrase, 'ເ' is not at index 0 (Rule 1)
    //    "ົາ"            -> diacritic with no consonant before it (Rule 4)
    //    "ຂອງຂ້ອຍ"       -> unsplit phrase with six consonants (limit is 4)
    //    "ທ້ດ" and "ລວ" are not real words but pass the structural rules;
    //    "ເເດງ" passes through the doubled-'ເ' exception; "ສະບາຍດີ" has exactly
    //    four consonants and passes; "ເດືອນ" and "ຫລາຍ" pass.
    ["ປ", "ທ້ດ", "ລວ", "ເດືອນ", "ເເດງ", "ກິນເຂົ້າ", "ສະບາຍດີ", "ເ", "ນ", "ົາ", "ຂອງຂ້ອຍ", "ຫລາຍ"],
    // 3. Valid words plus 'ໆ', the one single character that is accepted:
    //    nothing is reported.
    ["ກິນ", "ເຂົ້າ", "ແລ້ວ", "ໆ", "ບໍ່"],
    // 4. Lao and non-Lao words: multi-character words that do not start with a
    //    Lao character are skipped, the Lao words are valid; nothing is reported.
    ["Hello", "ສະບາຍດີ", "World123", "ພາສາ", "Lao"],
    // 5. Rule violations. Reported: "່ກ" (tone mark with no consonant before
    //    it, Rule 3), "ເດືອນເ" ('ເ' not at index 0, Rule 1), "ພຽງຽ" (final 'ຽ'
    //    has no consonant after it, Rule 5), "ະ" (single character).
    //    "ຜູ້" is NOT reported: 'ູ' directly follows the consonant (Rule 4) and
    //    '້' has a consonant two positions back (Rule 3).
    ["່ກ", "ເດືອນເ", "ຜູ້", "ພຽງຽ", "ະ"],
    // 6. Empty array: returns an empty array.
    [],
    // 7. "ເເ" is reported (no consonant) and "ກກກກກ" is reported (five
    //    consonants > 4). Whether the single-character non-Lao entries
    //    ("a", "1", " ") are reported depends on whether laoGrammarCheck
    //    applies its single-character guard or its non-Lao guard first.
    ["a", "1", " ", "ເເ", "ກກກກກ"],
];
demoInputs.forEach((sampleWords, position) => {
    // The first two reports are printed back to back; every later report is
    // preceded by a blank line.
    const separator = position < 2 ? '' : '\n';
    console.log(`${separator}Input: ${sampleWords}`);
    console.log(`Wrong words: ${laoGrammarCheck(sampleWords)}`);
});