@autobe/agent
Version:
AI backend server code generator
225 lines (196 loc) • 6.54 kB
text/typescript
/**
* Validation utilities for ensuring English-only content in analysis documents.
*
* These validators detect non-English characters (Chinese, Korean, Japanese,
* etc.) that may be incorrectly generated by LLMs despite prompt instructions.
*/
/**
 * Regex pattern to detect non-English characters. Includes:
 *
 * - Chinese (CJK Unified Ideographs): \u4e00-\u9fff
 * - Korean (Hangul Syllables): \uac00-\ud7af
 * - Japanese Hiragana: \u3040-\u309f
 * - Japanese Katakana: \u30a0-\u30ff
 * - CJK Extension A: \u3400-\u4dbf
 * - CJK Compatibility Ideographs: \uf900-\ufaff
 *
 * Only the ranges listed above are detected; any other script passes through.
 *
 * NOTE: the pattern carries the `g` flag, which makes this shared instance
 * stateful: `test()` and `exec()` resume from `lastIndex`. Never call those
 * methods on this instance directly; either build a private copy from
 * `.source` or use a `String` method that ignores `lastIndex`.
 */
const NON_ENGLISH_PATTERN =
  /[\u4e00-\u9fff\uac00-\ud7af\u3040-\u309f\u30a0-\u30ff\u3400-\u4dbf\uf900-\ufaff]/g;

/**
 * Check if text contains non-English characters.
 *
 * @param text Text to inspect.
 * @returns `true` when at least one character matched by
 *   {@link NON_ENGLISH_PATTERN} occurs anywhere in `text`, otherwise `false`.
 *   The answer depends only on `text`, not on previous calls.
 */
export const containsNonEnglish = (text: string): boolean => {
  // `RegExp.prototype.test` on a global regex starts at `lastIndex` and
  // advances it after a hit, so repeated calls on the shared pattern would
  // alternate between true and false. `String.prototype.search` always scans
  // from index 0 and leaves `lastIndex` as it found it.
  return text.search(NON_ENGLISH_PATTERN) !== -1;
};
/**
 * Locate every non-English character in `text`.
 *
 * @param text Text to scan.
 * @returns One entry per matched character, in order of appearance:
 *
 *   - `char`: the matched character
 *   - `index`: its offset within `text`
 *   - `context`: the slice of `text` spanning up to 20 characters before the
 *       match through up to 20 characters after it (clamped to the bounds of
 *       `text`)
 *
 *   An empty array means nothing matched.
 */
export const findNonEnglishCharacters = (
  text: string,
): Array<{ char: string; index: number; context: string }> => {
  const contextRadius = 20;
  // Private copy of the pattern: exec() mutates lastIndex, so the shared
  // module-level instance must not be iterated directly.
  const scanner = new RegExp(NON_ENGLISH_PATTERN.source, "g");
  const found: Array<{ char: string; index: number; context: string }> = [];

  for (let hit = scanner.exec(text); hit !== null; hit = scanner.exec(text)) {
    const at = hit.index;
    const from = at > contextRadius ? at - contextRadius : 0;
    const to = Math.min(text.length, at + contextRadius + 1);
    found.push({ char: hit[0], index: at, context: text.slice(from, to) });
  }

  return found;
};
/**
 * Check that `content` holds no non-English characters.
 *
 * @param content Text to validate.
 * @returns `{ valid: true, errors: [] }` when nothing is found. Otherwise
 *   `valid` is `false` and `errors` describes the first five offending
 *   characters (character, index and surrounding context), followed by a
 *   single summary line when more than five were found.
 */
export const validateEnglishOnly = (
  content: string,
): { valid: boolean; errors: string[] } => {
  const maxReported = 5;
  const offenders = findNonEnglishCharacters(content);
  const errors: string[] = [];

  for (const { char, index, context } of offenders.slice(0, maxReported)) {
    errors.push(
      `Non-English character "${char}" found at index ${index}: "...${context}..."`,
    );
  }

  // Summarise whatever was left out instead of listing every occurrence.
  const omitted = offenders.length - maxReported;
  if (omitted > 0) {
    errors.push(`... and ${omitted} more non-English characters`);
  }

  return { valid: offenders.length === 0, errors };
};
/**
 * Validate module sections against the English-only requirement.
 *
 * Each section's `title`, `purpose` and `content` are checked in that order.
 *
 * @param sections Module sections to validate.
 * @returns `valid` is `true` only when no field failed; `errors` holds one
 *   line per failing field, prefixed with the zero-based section index and
 *   the field name.
 */
export const validateModuleSectionContent = (
  sections: Array<{ title: string; purpose: string; content: string }>,
): { valid: boolean; errors: string[] } => {
  const checkedFields = ["title", "purpose", "content"] as const;
  const errors: string[] = [];

  sections.forEach((section, index) => {
    for (const field of checkedFields) {
      const outcome = validateEnglishOnly(section[field]);
      if (outcome.valid) continue;
      errors.push(
        `Module section ${index} ${field}: ${outcome.errors.join("; ")}`,
      );
    }
  });

  return { valid: errors.length === 0, errors };
};
/**
 * Validate unit sections against the English-only requirement.
 *
 * For every section the `title`, `purpose` and `content` fields are checked
 * in that order, followed by each entry of `keywords`.
 *
 * @param sections Unit sections to validate.
 * @returns `valid` is `true` only when nothing failed; `errors` holds one
 *   line per failing field or keyword, prefixed with the zero-based section
 *   index (and the zero-based keyword index for keywords).
 */
export const validateUnitSectionContent = (
  sections: Array<{
    title: string;
    purpose: string;
    content: string;
    keywords: string[];
  }>,
): { valid: boolean; errors: string[] } => {
  const checkedFields = ["title", "purpose", "content"] as const;
  const errors: string[] = [];

  sections.forEach((section, index) => {
    for (const field of checkedFields) {
      const outcome = validateEnglishOnly(section[field]);
      if (outcome.valid) continue;
      errors.push(
        `Unit section ${index} ${field}: ${outcome.errors.join("; ")}`,
      );
    }

    // Keywords are reported after the section's own text fields.
    section.keywords.forEach((keyword, kwIndex) => {
      const outcome = validateEnglishOnly(keyword);
      if (outcome.valid) return;
      errors.push(
        `Unit section ${index} keyword ${kwIndex}: ${outcome.errors.join("; ")}`,
      );
    });
  });

  return { valid: errors.length === 0, errors };
};
/**
 * Validate section-level sections against the English-only requirement.
 *
 * Each section's `title` and `content` are checked in that order.
 *
 * @param sections Sections to validate.
 * @returns `valid` is `true` only when no field failed; `errors` holds one
 *   line per failing field, prefixed with the zero-based section index and
 *   the field name.
 */
export const validateSectionSectionContent = (
  sections: Array<{ title: string; content: string }>,
): { valid: boolean; errors: string[] } => {
  const checkedFields = ["title", "content"] as const;
  const errors: string[] = [];

  sections.forEach((section, index) => {
    for (const field of checkedFields) {
      const outcome = validateEnglishOnly(section[field]);
      if (outcome.valid) continue;
      errors.push(`Section ${index} ${field}: ${outcome.errors.join("; ")}`);
    }
  });

  return { valid: errors.length === 0, errors };
};
/**
 * Validate scenario file names. Expected sequence: 00-toc.md, 01-xxx.md,
 * 02-xxx.md, ...
 *
 * Three rules are applied:
 *
 * 1. The first file must be named exactly `00-toc.md`.
 * 2. Every name must be two digits, a hyphen, a lowercase letter, then any mix
 *    of lowercase letters, digits and hyphens, ending in `.md`.
 * 3. Every name must start with its zero-based position, zero-padded to two
 *    digits, followed by a hyphen.
 *
 * @param files Files in their intended order.
 * @returns `valid` is `true` only when no rule was violated. `errors` lists
 *   the first-file violation (if any), then every format violation, then
 *   every numbering violation. An empty `files` array is valid.
 */
export const validateScenarioFileNames = (
  files: Array<{ filename: string }>,
): { valid: boolean; errors: string[] } => {
  const namePattern = /^\d{2}-[a-z][a-z0-9-]*\.md$/;
  const headErrors: string[] = [];
  const formatErrors: string[] = [];
  const sequenceErrors: string[] = [];

  // Rule 1: the table of contents comes first.
  if (files.length !== 0) {
    const leading = files[0].filename;
    if (leading !== "00-toc.md") {
      headErrors.push(`First file must be "00-toc.md", got "${leading}"`);
    }
  }

  // Rules 2 and 3 are evaluated together but collected separately so the
  // report still groups all format problems ahead of all numbering problems.
  files.forEach(({ filename }, index) => {
    if (!namePattern.test(filename)) {
      formatErrors.push(
        `File ${index}: Invalid filename format "${filename}". Expected format: XX-name.md`,
      );
    }

    const requiredPrefix = `${index.toString().padStart(2, "0")}-`;
    if (!filename.startsWith(requiredPrefix)) {
      sequenceErrors.push(
        `File ${index}: Expected prefix "${requiredPrefix}", got "${filename}"`,
      );
    }
  });

  const errors = [...headErrors, ...formatErrors, ...sequenceErrors];
  return { valid: errors.length === 0, errors };
};