usfm-grammar
Version:
Uses the tree-sitter-usfm3 parser to convert USFM files to other formats such as USJ, USX, and CSV, and converts them back to USFM
427 lines (389 loc) • 12.4 kB
JavaScript
const Parser = require('tree-sitter');
const assert = require('assert');
const { USFMGenerator } = require('./usfmGenerator');
const { USJGenerator } = require('./usjGenerator');
const { ListGenerator } = require('./listGenerator');
const { USXGenerator } = require('./usxGenerator');
const { Filter } = require('./filters.js');
const { ORIGINAL_VREF } = require('./utils/vrefs');
const USFM3 = require('tree-sitter-usfm3');
const { Query } = Parser;
/**
 * Throws when parse errors were recorded and the caller did not opt out.
 *
 * @param {string[]} errors - Error descriptions collected while parsing.
 * @param {boolean} ignoreErrors - When truthy, recorded errors are tolerated.
 * @param {string} hint - Sentence appended to the message telling the caller
 *   how to opt out.
 * @throws {Error} If `errors` is non-empty and `ignoreErrors` is falsy.
 */
function throwIfParseErrors(errors, ignoreErrors, hint) {
  if (ignoreErrors || errors.length === 0) {
    return;
  }
  throw new Error(`Errors present:\n\t${errors.join('\n\t')}\n${hint}`);
}

/**
 * Builds the error raised when a generator fails during a conversion.
 *
 * @param {string[]} errors - Error descriptions collected while parsing; when
 *   non-empty they are listed in the message as the likely reason.
 * @param {unknown} cause - The exception thrown by the generator.
 * @returns {Error} Error whose `cause` is the original exception.
 */
function wrapConversionError(errors, cause) {
  let message = 'Unable to do the conversion. ';
  if (errors.length > 0) {
    message += `Could be due to an error in the USFM\n\t${errors.join('\n\t')}`;
  }
  return new Error(message, { cause });
}

/**
 * Parses USFM text with the tree-sitter USFM3 grammar and converts the
 * resulting syntax tree to USJ, USX, list and BibleNLP representations.
 *
 * An instance can alternatively be created from USJ, USX or BibleNLP input;
 * that input is first converted to USFM and the USFM is then parsed.
 */
class USFMParser {
  /**
   * Exactly one of the four inputs must be non-null.
   *
   * @param {?string} usfmString - USFM text; after trimming it must start
   *   with a backslash.
   * @param {?object} fromUsj - USJ object to convert to USFM.
   * @param {?object} fromUsx - USX input; validated in `convertUSXToUSFM`.
   * @param {?object} fromBibleNlp - Object with `vref` and `text` arrays.
   * @param {?string} bookCode - Only used with `fromBibleNlp`; restricts the
   *   conversion to references starting with this code.
   * @throws {Error} If zero or several inputs are given, if `usfmString` is
   *   not a string starting with a backslash, or if a conversion to USFM fails.
   */
  constructor(
    usfmString = null,
    fromUsj = null,
    fromUsx = null,
    fromBibleNlp = null,
    bookCode = null,
  ) {
    this.syntaxTree = null;
    this.errors = [];
    this.warnings = [];

    const inputsGiven = [usfmString, fromUsj, fromUsx, fromBibleNlp].filter(
      (input) => input !== null,
    ).length;
    if (inputsGiven > 1) {
      throw new Error(
        'Found more than one input!\n' +
          'Only one of USFM, USJ, USX or BibleNLP is supported in one object.',
      );
    }
    if (inputsGiven === 0) {
      throw new Error(
        'Missing input! Either USFM, USJ, USX or BibleNLP is to be provided.',
      );
    }

    if (usfmString !== null) {
      if (
        typeof usfmString !== 'string' ||
        !usfmString.trim().startsWith('\\')
      ) {
        throw new Error(
          'Invalid input for USFM. Expected a string with \\ markups.',
        );
      }
      this.usfm = usfmString;
    } else if (fromUsj !== null) {
      this.usj = fromUsj;
      this.usfm = this.convertUSJToUSFM();
    } else if (fromUsx !== null) {
      this.usx = fromUsx;
      this.usfm = this.convertUSXToUSFM();
    } else if (fromBibleNlp !== null) {
      this.bibleNlp = fromBibleNlp;
      this.usfm = this.convertBibleNLPtoUSFM(bookCode);
    }

    this.parser = null;
    this.initializeParser();
    this.parseUSFM();
  }

  /**
   * Creates the tree-sitter parser for the USFM3 language and stores the
   * options passed to `parse()` for large inputs.
   */
  initializeParser() {
    this.parser = new Parser();
    this.parser.setLanguage(USFM3);
    // Kept on the instance only; the imported Parser class is not modified.
    this.parserOptions = {
      bufferSize: 1024 * 1024,
    };
  }

  /**
   * @returns {string} The parsed syntax tree rendered with its `toString()`.
   */
  toSyntaxTree() {
    return this.syntaxTree.toString();
  }

  /**
   * Converts the parsed USFM to USJ and caches the result on `this.usj`.
   *
   * @param {?string[]} excludeMarkers - Markers to remove from the output.
   * @param {?string[]} includeMarkers - Markers to keep in the output.
   * @param {boolean} ignoreErrors - Generate output even if parse errors exist.
   * @param {boolean} combineTexts - Forwarded to the `Filter` functions.
   * @returns {object} See `convertUSFMToUSJ`.
   * @throws {Error} If parse errors exist and `ignoreErrors` is falsy.
   */
  toUSJ(
    excludeMarkers = null,
    includeMarkers = null,
    ignoreErrors = false,
    combineTexts = true,
  ) {
    this.usj = this.convertUSFMToUSJ(
      excludeMarkers,
      includeMarkers,
      ignoreErrors,
      combineTexts,
    );
    return this.usj;
  }

  /**
   * Converts a USJ object to USFM and stores both on the instance.
   * The new USFM is not re-parsed here; `syntaxTree` and `errors` are left
   * as they were.
   *
   * @param {object} usjObject - Non-null object with an own `type` property.
   * @returns {string} The generated USFM.
   * @throws {Error} If `usjObject` is not such an object.
   */
  usjToUsfm(usjObject) {
    const looksLikeUsj =
      typeof usjObject === 'object' &&
      usjObject !== null &&
      // Called through the prototype so null-prototype objects work too.
      Object.prototype.hasOwnProperty.call(usjObject, 'type');
    if (!looksLikeUsj) {
      throw new Error('Invalid input for USJ. Expected USJ json object.');
    }
    if (!this.parser) {
      this.initializeParser();
    }
    this.usj = usjObject;
    this.usfm = this.convertUSJToUSFM();
    return this.usfm;
  }

  /**
   * Validates `this.usx` and converts it to USFM.
   * If the given node is not itself a `<usx>` element, it must contain exactly
   * one, and `this.usx` is replaced by that element.
   *
   * @returns {string} The generated USFM.
   * @throws {Error} If the input fails validation or the generator throws;
   *   the original exception is available as `cause`.
   */
  convertUSXToUSFM() {
    try {
      assert(
        1 <= this.usx.nodeType && this.usx.nodeType <= 12,
        'Input must be an instance of xmldom Document or Element',
      );
      if (this.usx.tagName !== 'usx') {
        const usxNodes = this.usx.getElementsByTagName('usx');
        assert(
          usxNodes.length === 1,
          'Expects a <usx> node.\n' +
            'Refer docs: https://docs.usfm.bible/usfm/3.1/syntax.html#_usx_usfm_xml',
        );
        this.usx = usxNodes[0];
      }
      // NOTE(review): a check that the first child of <usx> is <book> was
      // commented out in the original; confirm whether it should be enforced.
    } catch (err) {
      throw new Error(`USX not in expected format. ${ err.message}`, {
        cause: err,
      });
    }

    try {
      const usfmGen = new USFMGenerator();
      usfmGen.usxToUsfm(this.usx);
      return usfmGen.usfmString;
    } catch (err) {
      const message = 'Unable to do the conversion from USX to USFM. ';
      throw new Error(message, { cause: err });
    }
  }

  /**
   * Parses `this.usfm`, records syntax problems in `this.errors` and stores
   * the root node in `this.syntaxTree`. Syntax problems do not throw here;
   * the conversion methods check `this.errors`.
   */
  parseUSFM() {
    // Inputs longer than 25000 characters are parsed with the explicit
    // bufferSize option; shorter ones use the parser's default.
    const tree =
      this.usfm.length > 25000
        ? this.parser.parse(this.usfm, null, this.parserOptions)
        : this.parser.parse(this.usfm);
    this.checkForErrors(tree);
    this.checkforMissing(tree.rootNode);
    this.syntaxTree = tree.rootNode;
  }

  /**
   * Collects every ERROR node of the tree into `this.errors`, replacing any
   * previous content when at least one is found.
   *
   * @param {object} tree - Tree returned by the parser.
   * @returns {Error|undefined} An Error summarising the problems (returned,
   *   not thrown; `parseUSFM` ignores it), or undefined when there are none.
   */
  checkForErrors(tree) {
    const errorQuery = new Query(USFM3, '(ERROR) @errors');
    const captures = errorQuery.captures(tree.rootNode);
    if (captures.length === 0) {
      return undefined;
    }
    // NOTE(review): the row is reported as-is here, while checkforMissing
    // adds 1 to it; confirm which numbering is intended.
    this.errors = captures.map(({ node }) => {
      const { row, column } = node.startPosition;
      const text = this.usfm.substring(node.startIndex, node.endIndex);
      return `At ${row}:${column}, Error: ${text}`;
    });
    return new Error(`Errors found in USFM: ${this.errors.join(', ')}`);
  }

  /**
   * Walks the tree depth-first and appends an entry to `this.errors` for
   * every node whose `isMissing` is truthy.
   *
   * @param {object} node - Node whose descendants are inspected.
   */
  checkforMissing(node) {
    for (const child of node.children) {
      if (child.isMissing) {
        const { row, column } = child.startPosition;
        this.errors.push(`At ${row + 1}:${column}, Error: Missing ${child.type}`);
      }
      this.checkforMissing(child);
    }
  }

  /**
   * @returns {string} USFM generated from `this.usj`.
   */
  convertUSJToUSFM() {
    return new USFMGenerator().usjToUsfm(this.usj);
  }

  /**
   * Validates `this.bibleNlp`, optionally narrows it to one book, and
   * converts it to USFM. The object originally supplied by the caller is not
   * modified; `this.bibleNlp` is replaced by a copy holding the `vref` and
   * `text` arrays that were actually converted. Warnings reported by the
   * generator are stored in `this.warnings`.
   *
   * @param {?string} bookCode - When given, only references starting with
   *   this code (case-insensitive, trimmed) are converted.
   * @returns {string} The generated USFM.
   * @throws {Error} If the object fails validation, if the `vref` and `text`
   *   lengths cannot be reconciled, or if the generator throws; the original
   *   exception is available as `cause`.
   */
  convertBibleNLPtoUSFM(bookCode = null) {
    let prepared;
    try {
      assert(this.bibleNlp.vref, "Should have 'vref' key");
      assert(this.bibleNlp.text, "Should have 'text' key");
      assert(
        Array.isArray(this.bibleNlp.vref),
        "'vref' should contain an array of references.",
      );
      assert(
        Array.isArray(this.bibleNlp.text),
        "'text' should contain an array of strings.",
      );

      let allVrefs = this.bibleNlp.vref;
      let texts = this.bibleNlp.text;
      // A 41899-entry reference list paired with 31170 or 23213 texts is
      // truncated to the number of texts.
      // NOTE(review): presumably full-list vs. partial-canon sizes; confirm.
      if ([31170, 23213].includes(texts.length) && allVrefs.length === 41899) {
        allVrefs = allVrefs.slice(0, texts.length);
      }

      const code = bookCode === null ? null : bookCode.trim().toUpperCase();
      const isInBook = (ref) => ref.trim().toUpperCase().startsWith(code);
      const vrefs = code === null ? allVrefs : allVrefs.filter(isInBook);

      if (vrefs.length !== texts.length) {
        // Texts line up one-to-one with the unfiltered references, so they
        // can be narrowed to the requested book by position.
        if (code !== null && allVrefs.length === texts.length) {
          texts = texts.filter((_, index) => isInBook(allVrefs[index]));
        }
        if (vrefs.length !== texts.length) {
          throw new Error(
            'Mismatch in lengths of vref and text lists. ' +
              'Specify a bookCode or check for versification differences. ' +
              `${vrefs.length} != ${texts.length}`,
          );
        }
      }
      prepared = { ...this.bibleNlp, vref: vrefs, text: texts };
    } catch (err) {
      throw new Error(
        `BibleNLP object not in expected format. ${ err.message}`,
        { cause: err },
      );
    }
    this.bibleNlp = prepared;

    try {
      const usfmGen = new USFMGenerator();
      usfmGen.bibleNlptoUsfm(this.bibleNlp);
      this.warnings = usfmGen.warnings;
      return usfmGen.usfmString;
    } catch (err) {
      const message = 'Unable to do the conversion from BibleNLP to USFM. ';
      throw new Error(message, { cause: err });
    }
  }

  /**
   * Builds USJ from the syntax tree and applies the marker filters.
   *
   * @param {?string[]} excludeMarkers - Markers to remove from the output.
   * @param {?string[]} includeMarkers - Markers to keep ('USJ' is always added).
   * @param {boolean} ignoreErrors - Generate output even if parse errors exist.
   * @param {boolean} combineTexts - Forwarded to the `Filter` functions.
   * @returns {object} The USJ object, or `{ error: message }` when the
   *   generator throws.
   * @throws {Error} If parse errors exist and `ignoreErrors` is falsy.
   */
  convertUSFMToUSJ(
    excludeMarkers = null,
    includeMarkers = null,
    ignoreErrors = false,
    combineTexts = true,
  ) {
    throwIfParseErrors(
      this.errors,
      ignoreErrors,
      'Use ignoreErrors = true, as third parameter of toUSJ(), to generate output despite errors.',
    );

    let outputUSJ;
    try {
      const usjGenerator = new USJGenerator(USFM3, this.usfm);
      usjGenerator.nodeToUSJ(this.syntaxTree, usjGenerator.jsonRootObj);
      outputUSJ = usjGenerator.jsonRootObj;
    } catch (err) {
      // Generator failures are reported as a value, not thrown. With known
      // parse errors those are the likely reason; otherwise pass on the
      // generator's own message.
      if (this.errors.length > 0) {
        const errorString = this.errors.join('\n\t');
        return {
          error:
            'Unable to do the conversion. ' +
            `Could be due to an error in the USFM\n\t${errorString}`,
        };
      }
      return { error: err.message };
    }

    if (includeMarkers) {
      outputUSJ = Filter.keepOnly(
        outputUSJ,
        [...includeMarkers, 'USJ'],
        combineTexts,
      );
    }
    if (excludeMarkers) {
      outputUSJ = Filter.remove(outputUSJ, excludeMarkers, combineTexts);
    }
    return outputUSJ;
  }

  /**
   * Converts the parsed USFM to the list produced by `ListGenerator`, going
   * through USJ. The markers in `Filter.BCV` are always kept in the
   * intermediate USJ, whatever the include/exclude arguments say.
   *
   * @param {?string[]} excludeMarkers - Markers to leave out.
   * @param {?string[]} includeMarkers - Markers to keep.
   * @param {boolean} ignoreErrors - Generate output even if parse errors exist.
   * @param {boolean} combineTexts - Forwarded to `toUSJ`.
   * @returns {*} The generator's `list`.
   * @throws {Error} If parse errors exist and `ignoreErrors` is falsy, or if
   *   the conversion fails (original exception available as `cause`).
   */
  toList(
    excludeMarkers = null,
    includeMarkers = null,
    ignoreErrors = false,
    combineTexts = true,
  ) {
    throwIfParseErrors(
      this.errors,
      ignoreErrors,
      'Use ignoreErrors=true to generate output despite errors',
    );
    try {
      const includeList = includeMarkers
        ? [...includeMarkers, ...Filter.BCV]
        : null;
      const excludeList = excludeMarkers
        ? excludeMarkers.filter((item) => !Filter.BCV.includes(item))
        : null;
      const usjDict = this.toUSJ(
        excludeList,
        includeList,
        ignoreErrors,
        combineTexts,
      );
      const listGenerator = new ListGenerator();
      // The caller's original marker lists are what the list generator sees.
      listGenerator.usjToList(usjDict, excludeMarkers, includeMarkers);
      return listGenerator.list;
    } catch (exe) {
      throw wrapConversionError(this.errors, exe);
    }
  }

  /**
   * Converts the parsed USFM to BibleNLP format, going through a USJ that
   * keeps only the `Filter.BCV` and `Filter.TEXT` markers.
   *
   * @param {boolean} ignoreErrors - Generate output even if parse errors exist.
   * @returns {*} The generator's `bibleNlpFormat`.
   * @throws {Error} If parse errors exist and `ignoreErrors` is falsy, or if
   *   the conversion fails (original exception available as `cause`).
   */
  toBibleNlpFormat(ignoreErrors = false) {
    throwIfParseErrors(
      this.errors,
      ignoreErrors,
      'Use ignoreErrors=true to generate output despite errors',
    );
    try {
      const usjDict = this.toUSJ(
        null,
        [...Filter.BCV, ...Filter.TEXT],
        ignoreErrors,
        true,
      );
      const listGenerator = new ListGenerator();
      listGenerator.usjToBibleNlpFormat(usjDict);
      return listGenerator.bibleNlpFormat;
    } catch (exe) {
      throw wrapConversionError(this.errors, exe);
    }
  }

  /**
   * Converts the syntax tree to USX.
   *
   * @param {boolean} ignoreErrors - Generate output even if parse errors exist.
   * @returns {*} The generator's `xmlRootNode`; it is not serialized to a
   *   string here.
   * @throws {Error} If parse errors exist and `ignoreErrors` is falsy, or if
   *   the conversion fails (original exception available as `cause`).
   */
  toUSX(ignoreErrors = false) {
    throwIfParseErrors(
      this.errors,
      ignoreErrors,
      'Use ignoreErrors=true to generate output despite errors',
    );
    try {
      const usxGenerator = new USXGenerator(USFM3, this.usfm);
      usxGenerator.node2Usx(this.syntaxTree, usxGenerator.xmlRootNode);
      return usxGenerator.xmlRootNode;
    } catch (exe) {
      throw wrapConversionError(this.errors, exe);
    }
  }
}
// Public CommonJS surface of this module: the parser class, plus Filter and
// ORIGINAL_VREF re-exported from the modules they were required from.
Object.assign(exports, { USFMParser, Filter, ORIGINAL_VREF });
// exports.Format = Format;