@autobe/agent

/**
 * Validation utilities for ensuring English-only content in analysis documents.
 *
 * These validators detect non-English characters (Chinese, Korean, Japanese,
 * etc.) that may be incorrectly generated by LLMs despite prompt instructions.
 */

/**
 * Regex pattern to detect non-English characters. Includes:
 *
 * - Chinese (CJK Unified Ideographs): \u4e00-\u9fff
 * - Korean (Hangul Syllables): \uac00-\ud7af
 * - Japanese Hiragana: \u3040-\u309f
 * - Japanese Katakana: \u30a0-\u30ff
 * - CJK Extension A: \u3400-\u4dbf
 * - CJK Compatibility Ideographs: \uf900-\ufaff
 *
 * Only the ranges listed above are matched; characters from any other script
 * are not reported by these validators.
 *
 * The pattern carries the `g` flag, so calling `test()` or `exec()` directly on
 * this shared instance mutates its `lastIndex`. Helpers in this module either
 * avoid those stateful methods or work on a fresh copy of the pattern.
 */
const NON_ENGLISH_PATTERN =
  /[\u4e00-\u9fff\uac00-\ud7af\u3040-\u309f\u30a0-\u30ff\u3400-\u4dbf\uf900-\ufaff]/g;

/**
 * Check if text contains non-English characters.
 *
 * @param text Text to inspect.
 * @returns `true` when at least one character matches
 *   {@link NON_ENGLISH_PATTERN}, otherwise `false`.
 */
export const containsNonEnglish = (text: string): boolean => {
  // `RegExp.prototype.test` on a global regex resumes from `lastIndex`, which
  // made consecutive calls alternate between true and false for the same
  // input. `String.prototype.search` always scans from index 0 and restores
  // `lastIndex` afterwards, so the result depends only on `text`.
  return text.search(NON_ENGLISH_PATTERN) !== -1;
};

/**
 * Find all non-English characters in text.
 *
 * @param text Text to scan.
 * @returns One entry per matched character, in order of appearance. `index` is
 *   the UTF-16 offset of the character in `text`, and `context` is the slice of
 *   `text` covering up to 20 code units before and after the character.
 */
export const findNonEnglishCharacters = (
  text: string,
): Array<{ char: string; index: number; context: string }> => {
  const contextRadius = 20;
  const results: Array<{ char: string; index: number; context: string }> = [];

  // Work on a private copy so the shared pattern's `lastIndex` is never touched.
  const regex = new RegExp(NON_ENGLISH_PATTERN.source, "g");

  let match: RegExpExecArray | null;
  while ((match = regex.exec(text)) !== null) {
    const start = Math.max(0, match.index - contextRadius);
    const end = Math.min(text.length, match.index + contextRadius + 1);
    results.push({
      char: match[0],
      index: match.index,
      context: text.slice(start, end),
    });
  }
  return results;
};

/**
 * Validate that content is English-only.
 *
 * @param content Text to validate.
 * @returns `{ valid: true, errors: [] }` when no non-English character is
 *   found. Otherwise `valid` is `false` and `errors` holds one message for each
 *   of the first five offending characters, followed by a summary line when
 *   more than five were found.
 */
export const validateEnglishOnly = (
  content: string,
): { valid: boolean; errors: string[] } => {
  const maxReported = 5;
  const nonEnglish = findNonEnglishCharacters(content);
  if (nonEnglish.length === 0) {
    return { valid: true, errors: [] };
  }

  const errors = nonEnglish
    .slice(0, maxReported)
    .map(
      (item) =>
        `Non-English character "${item.char}" found at index ${item.index}: "...${item.context}..."`,
    );
  if (nonEnglish.length > maxReported) {
    errors.push(
      `... and ${nonEnglish.length - maxReported} more non-English characters`,
    );
  }
  return { valid: false, errors };
};

/**
 * Validate a single field and, when it fails, append one message of the form
 * `"<label>: <error>; <error>; ..."` to `sink`.
 */
const appendFieldErrors = (
  sink: string[],
  label: string,
  value: string,
): void => {
  const result = validateEnglishOnly(value);
  if (!result.valid) {
    sink.push(`${label}: ${result.errors.join("; ")}`);
  }
};

/**
 * Validate module section content for English-only requirement.
 *
 * @param sections Module sections whose `title`, `purpose` and `content` are
 *   checked, in that order.
 * @returns `valid` is `true` when every field passes; `errors` holds one
 *   message per failing field, prefixed with the zero-based section index.
 */
export const validateModuleSectionContent = (
  sections: Array<{ title: string; purpose: string; content: string }>,
): { valid: boolean; errors: string[] } => {
  const allErrors: string[] = [];
  sections.forEach((section, index) => {
    appendFieldErrors(
      allErrors,
      `Module section ${index} title`,
      section.title,
    );
    appendFieldErrors(
      allErrors,
      `Module section ${index} purpose`,
      section.purpose,
    );
    appendFieldErrors(
      allErrors,
      `Module section ${index} content`,
      section.content,
    );
  });
  return {
    valid: allErrors.length === 0,
    errors: allErrors,
  };
};

/**
 * Validate unit section content for English-only requirement.
 *
 * @param sections Unit sections whose `title`, `purpose`, `content` and each
 *   entry of `keywords` are checked, in that order.
 * @returns `valid` is `true` when every field passes; `errors` holds one
 *   message per failing field or keyword, prefixed with the zero-based section
 *   index (and keyword index for keywords).
 */
export const validateUnitSectionContent = (
  sections: Array<{
    title: string;
    purpose: string;
    content: string;
    keywords: string[];
  }>,
): { valid: boolean; errors: string[] } => {
  const allErrors: string[] = [];
  sections.forEach((section, index) => {
    appendFieldErrors(allErrors, `Unit section ${index} title`, section.title);
    appendFieldErrors(
      allErrors,
      `Unit section ${index} purpose`,
      section.purpose,
    );
    appendFieldErrors(
      allErrors,
      `Unit section ${index} content`,
      section.content,
    );
    section.keywords.forEach((keyword, kwIndex) => {
      appendFieldErrors(
        allErrors,
        `Unit section ${index} keyword ${kwIndex}`,
        keyword,
      );
    });
  });
  return {
    valid: allErrors.length === 0,
    errors: allErrors,
  };
};

/**
 * Validate section section content for English-only requirement.
 *
 * @param sections Sections whose `title` and `content` are checked, in that
 *   order.
 * @returns `valid` is `true` when every field passes; `errors` holds one
 *   message per failing field, prefixed with the zero-based section index.
 */
export const validateSectionSectionContent = (
  sections: Array<{ title: string; content: string }>,
): { valid: boolean; errors: string[] } => {
  const allErrors: string[] = [];
  sections.forEach((section, index) => {
    appendFieldErrors(allErrors, `Section ${index} title`, section.title);
    appendFieldErrors(allErrors, `Section ${index} content`, section.content);
  });
  return {
    valid: allErrors.length === 0,
    errors: allErrors,
  };
};

/**
 * Validate scenario file names for correct format. Expected format: 00-toc.md,
 * 01-xxx.md, 02-xxx.md, ...
 *
 * Three checks run in order, and a single file may be reported by more than
 * one of them:
 *
 * 1. The first file must be exactly `00-toc.md`.
 * 2. Every filename must match `^\d{2}-[a-z][a-z0-9-]*\.md$`.
 * 3. Every filename must start with its zero-based position, zero-padded to two
 *    digits and followed by `-`.
 *
 * @param files Files in their intended order.
 * @returns `valid` is `true` when no check fails (including for an empty
 *   list); `errors` holds one message per failed check.
 */
export const validateScenarioFileNames = (
  files: Array<{ filename: string }>,
): { valid: boolean; errors: string[] } => {
  const allErrors: string[] = [];
  const filenamePattern = /^\d{2}-[a-z][a-z0-9-]*\.md$/;

  // Check first file is 00-toc.md
  const firstFile = files[0];
  if (firstFile !== undefined && firstFile.filename !== "00-toc.md") {
    allErrors.push(
      `First file must be "00-toc.md", got "${firstFile.filename}"`,
    );
  }

  // Check all filenames match pattern
  files.forEach((file, index) => {
    if (!filenamePattern.test(file.filename)) {
      allErrors.push(
        `File ${index}: Invalid filename format "${file.filename}". Expected format: XX-name.md`,
      );
    }
  });

  // Check sequential numbering
  files.forEach((file, index) => {
    const expectedPrefix = index.toString().padStart(2, "0");
    if (!file.filename.startsWith(expectedPrefix + "-")) {
      allErrors.push(
        `File ${index}: Expected prefix "${expectedPrefix}-", got "${file.filename}"`,
      );
    }
  });

  return {
    valid: allErrors.length === 0,
    errors: allErrors,
  };
};