usfm-grammar
Version:
Uses the tree-sitter-usfm3 parser to convert USFM files to other formats such as USJ, USX, and CSV, and converts them back to USFM
231 lines (208 loc) • 9.68 kB
JavaScript
const Parser = require('tree-sitter');
const USFM3 = require('tree-sitter-usfm3');
const fs = require('node:fs');
const Ajv = require('ajv');
const {USJ_SCHEMA} = require("./utils/usjSchema")
const { Query } = Parser;
class Validator {
constructor(usjSchemaPath = null) {
this.USFMParser = new Parser();
this.USFMParser.setLanguage(USFM3);
this.parserOptions = Parser.Options = {
bufferSize: 1024 * 1024,
};
this.USFMErrors = [];
// Load the schema for validation
this.USJValidator = null;
try {
const ajv = new Ajv();
if (usjSchemaPath === null) {
this.USJValidator = ajv.compile(USJ_SCHEMA);
} else {
const schemaStr = fs.readFileSync(usjSchemaPath, 'utf8');
const schema = JSON.parse(schemaStr);
this.USJValidator = ajv.compile(schema);
}
} catch (error) {
console.error("Error loading schema:", error);
}
this.message = "";
this.modifiedUSFM = "";
this.usfm = ""
}
isValidUSJ(usj) {
this.message = "";
if (this.USJValidator(usj) === true) {
return true;
} else {
for (let err of this.USJValidator.errors) {
this.message += `Error at ${err.instancePath}: ${err.message}\n`;
}
return false;
}
}
isValidUSFM(usfm) {
this.usfm = usfm;
this.USFMErrors = [];
let tree = null;
if (usfm.length > 25000) {
tree = this.USFMParser.parse(usfm, null, this.parserOptions);
}
else {
tree = this.USFMParser.parse(usfm);
}
const errorQuery = new Query(USFM3, "(ERROR) @errors");
const errors = errorQuery.captures(tree.rootNode);
for (let error of errors) {
// console.log(getAllProperties(error.node));
this.USFMErrors.push(error.node);
}
this.checkForMissing(tree.rootNode);
if (this.USFMErrors.length > 0) {
this.message = this.formatErrors();
return false;
}
return true;
}
checkForMissing(node) {
for (let n of node.children) {
if (n.isMissing){
this.USFMErrors.push(n);
} else {
this.checkForMissing(n);
}
}
}
formatErrors() {
const errLines = this.USFMErrors.map(err => {
if (err.isMissing) {
const start = Math.max(0, err.startIndex - 3);
const end = Math.min(this.usfm.length, err.startIndex + 10);
return `At ${err.startIndex}:Missing something here:${this.usfm.slice(start, end)}`;
} else {
return `At ${err.startPosition.row}:${err.startPosition.column}, Error: ${this.usfm.substring(err.startIndex, err.endIndex)}`;
}
});
return `Errors present:\n\t${errLines.join('\n\t')}`;
}
autoFixUSFM(usfm, fixed=false) {
if (this.isValidUSFM(usfm)) {
if (fixed) {
this.message = "Fixed Errors in USFM"
} else {
this.message = "No Errors in USFM";
}
return usfm;
}
let modifiedUSFM = usfm;
let changed = false;
for (let error of this.USFMErrors) {
const errorText = usfm.substring(error.startIndex, error.endIndex);
// No \P after \s5
if (error.isError && errorText.startsWith("\\s5") &&
!error.children.some(ch => ch.type === "paragraph")) {
// console.log("Match 1");
modifiedUSFM = modifiedUSFM.replace(/\\s5[\s\n\r]*/g, '\\s5 \n\\p\n');
changed = true;
}
// Missing space after \s5
else if (error.isMissing && error.parent.type === "sTag" && error.toString() === '(MISSING " ")') {
// console.log("Match 2");
modifiedUSFM = modifiedUSFM.replace(/\\s5\n/g, '\\s5 \n');
changed = true;
}
// Book code is missing (empty id marker)
else if (bookCodeMissingPattern.test(modifiedUSFM)) {
// console.log("Match 3");
modifiedUSFM = modifiedUSFM.replace(/\\id[\s\n\r]*\\/g, '\\id XXX xxx\n\\');
changed = true;
}
// \p not given after section heading
else if (error.isError && errorText.startsWith("\\v") && error.parent.type === "s" &&
!error.children.some(ch => ch.type === "paragraph")) {
// console.log("Match 4");
const start = error.parent.startIndex;
const end = error.startIndex;
const toReplace = modifiedUSFM.slice(start, end);
modifiedUSFM = modifiedUSFM.replace(toReplace, `${toReplace}\\p\n`);
changed = true;
}
// Space missing between \v and number
else if (vWithoutSpacePattern.test(errorText)) {
// console.log("Match 5");
modifiedUSFM = modifiedUSFM.replace(vWithoutSpacePattern, "$1 $2");
changed = true;
}
// Space missing between \c and number
else if (cWithoutSpacePattern.test(errorText)) {
// console.log("Match 6");
modifiedUSFM = modifiedUSFM.replace(cWithoutSpacePattern, "$1 $2");
changed = true;
}
// \p not given at chapter start
else if (error.isError && errorText.startsWith("\\v") && error.previousSibling.type === "chapter" &&
!error.children.some(ch => ch.type === "paragraph")) {
// console.log("Match 7");
const start = error.previousSibling.startIndex;
const end = error.startIndex;
const toReplace = modifiedUSFM.slice(start, end);
modifiedUSFM = modifiedUSFM.replace(toReplace, `${toReplace}\\p\n`);
changed = true;
}
else if (error.isError && !errorText.startsWith("\\") && error.previousSibling.type === "chapter" &&
!error.children.some(ch => ch.type === "paragraph")) {
// console.log("Match 7.1");
const start = error.previousSibling.startIndex;
const end = error.startIndex;
const toReplace = modifiedUSFM.slice(start, end);
modifiedUSFM = modifiedUSFM.replace(toReplace, `${toReplace}\\p\n`);
changed = true;
}
// Stray slash not with a valid marker
else if (errorText.startsWith("\\") && !validMarkersPattern.test(errorText)) {
// console.log("Match 8");
modifiedUSFM = modifiedUSFM.replace(errorText, errorText.slice(1));
changed = true;
}
// Just a single problematic marker (could be w/o text)
else if (errorText.startsWith("\\") && validMarkersPattern.test(errorText)) {
// console.log("Match 9");
const start = Math.max(0, error.startIndex - 5);
const end = Math.min(modifiedUSFM.length, error.endIndex + 5);
const toReplace = modifiedUSFM.slice(start, end);
const replacement = toReplace.replace(errorText, "");
modifiedUSFM = modifiedUSFM.replace(toReplace, replacement);
changed = true;
}
// Empty attribute
else if (errorText.trim() === "|") {
// console.log("Match 10");
// console.log(errorText);
const start = Math.max(0, error.startIndex - 5);
const end = Math.min(modifiedUSFM.length, error.endIndex + 5);
const toReplace = modifiedUSFM.slice(start, end);
const replacement = toReplace.replace(errorText, "");
modifiedUSFM = modifiedUSFM.replace(toReplace, replacement);
changed = true;
}
// Stray content in the chapter line
else if (error.parent.type === "chapter" && error.previousSibling.type === "c" && !errorText.includes("\\")) {
// console.log("Match 11");
modifiedUSFM = modifiedUSFM.replace(errorText, "");
changed = true;
}
}
if (!changed || modifiedUSFM===usfm) {
const errStr = this.formatErrors();
this.message = `Cannot fix these errors:\n\t${errStr}`;
return modifiedUSFM;
}
// return modifiedUSFM
return this.autoFixUSFM(modifiedUSFM, true);
}
}
// Patterns used by Validator.autoFixUSFM to recognise repairable mistakes.

// An \id marker followed only by whitespace before the next marker,
// i.e. the book code is absent.
const bookCodeMissingPattern = /\\id[\s\n\r]*\\/;
// \v or \c written directly against its number; group 1 is the marker and
// group 2 the number, so "$1 $2" re-inserts the missing space.
const vWithoutSpacePattern = /(\\v)(\d+)/;
const cWithoutSpacePattern = /(\\c)(\d+)/;
// A known marker at the START of the tested text, followed by a digit,
// whitespace or end of text. Anchored because the callers use it to classify
// the marker an error text begins with; without the anchor a valid marker
// anywhere later in the text would make an unknown leading marker look valid.
const validMarkersPattern = /^(\\id|\\usfm|\\ide|\\ref|\\h|\\toc|\\toca|\\sts|\\rem|\\restore|\\lit|\\iqt|\\imt|\\imte|\\is|\\io|\\ior|\\iot|\\ip|\\im|\\ipi|\\imi|\\ili|\\ipq|\\imq|\\ipr|\\ib|\\iq|\\ie|\\iex|\\v|\\va|\\vp|\\c|\\cl|\\ca|\\cp|\\cd|\\mt|\\mte|\\ms|\\mr|\\s|\\sr|\\r|\\sp|\\d|\\sd|\\p|\\m|\\po|\\pr|\\cls|\\pmo|\\pm|\\pmc|\\pmr|\\pi|\\mi|\\nb|\\pc|\\ph|\\phi|\\b|\\q|\\qr|\\qc|\\qs|\\qa|\\qac|\\qm|\\qd|\\lh|\\lf|\\li|\\lim|\\liv|\\lik|\\litl|\\tr|\\th|\\thr|\\tc|\\tcr|\\f|\\fe|\\ef|\\fr|\\fq|\\fqa|\\fk|\\fl|\\fw|\\fp|\\ft|\\fdc|\\fv|\\fm|\\x|\\xo|\\xk|\\xq|\\xt|\\xta|\\xop|\\xot|\\xnt|\\xdc|\\rq|\\add|\\bk|\\dc|\\k|\\nd|\\ord|\\pn|\\png|\\addpn|\\qt|\\sig|\\sls|\\tl|\\wj|\\em|\\bd|\\it|\\bdit|\\no|\\sc|\\sup|\\ndx|\\pro|\\rb|\\w|\\wg|\\wh|\\wa|\\fig|\\jmp|\\pb|\\z|\\esb|\\esbe|\\cat)(\d|\s|\n|\r|$)/;
exports.Validator = Validator;