usfm-grammar
Version:
Uses the tree-sitter-usfm3 parser to convert USFM files to other formats such as USJ, USX, and CSV, and converts them back to USFM
267 lines (235 loc) • 8.04 kB
JavaScript
const { NO_USFM_USJ_TYPES,
CLOSING_USJ_TYPES, NON_ATTRIB_USJ_KEYS, NO_NEWLINE_USJ_TYPES } = require('./utils/types');
const { NON_ATTRIB_USX_KEYS, NO_NEWLINE_USX_TYPES } = require('./utils/types');
/**
 * Builds a USFM string from USJ objects, USX DOM elements, or BibleNLP
 * verse lists.
 *
 * Every conversion method appends to `usfmString`; non-fatal problems are
 * collected in `warnings`.
 */
class USFMGenerator {
  constructor() {
    this.usfmString = '';
    this.warnings = [];
  }

  /**
   * Appends the USFM for a USJ object (and, recursively, its content) to
   * `usfmString`.
   *
   * The input object is not modified.
   *
   * @param {object} usjObj - USJ node; reads `type`, `marker`, `code`,
   *   `number`, `caller`, `category`, `altnumber`, `pubnumber`, `content`,
   *   `sid` and any remaining keys as attributes.
   * @param {boolean} [nested=false] - When true and the node is a `char`,
   *   the marker is written with a `+` prefix.
   * @returns {string} The USFM accumulated so far.
   */
  usjToUsfm(usjObj, nested = false) {
    if (usjObj.type === 'optbreak') {
      if (this.usfmString !== '' && !['\n', '\r', ' ', '\t'].includes(this.usfmString.slice(-1))) {
        this.usfmString += ' ';
      }
      this.usfmString += '// ';
      return this.usfmString;
    }

    // `ref` nodes are always written with the `ref` marker. Keep it in a
    // local so the caller's object is left untouched.
    const marker = usjObj.type === 'ref' ? 'ref' : usjObj.marker;
    const isNestedChar = nested && usjObj.type === 'char';

    if (!NO_USFM_USJ_TYPES.includes(usjObj.type)) {
      this.usfmString += '\\';
      if (isNestedChar) {
        this.usfmString += '+';
      }
      this.usfmString += `${marker} `;
    }

    ['code', 'number', 'caller'].forEach((key) => {
      if (usjObj[key]) {
        this.usfmString += `${usjObj[key]} `;
      }
    });

    if (usjObj.category) {
      this.usfmString += `\\cat ${usjObj.category}\\cat*\n`;
    }

    if (usjObj.altnumber) {
      if (marker === 'c') {
        this.usfmString += `\\ca ${usjObj.altnumber} \\ca*\n`;
      } else if (marker === 'v') {
        this.usfmString += `\\va ${usjObj.altnumber} \\va* `;
      }
    }

    if (usjObj.pubnumber) {
      if (marker === 'c') {
        this.usfmString += `\\cp ${usjObj.pubnumber}\n`;
      } else if (marker === 'v') {
        this.usfmString += `\\vp ${usjObj.pubnumber} \\vp* `;
      }
    }

    if (Array.isArray(usjObj.content)) {
      usjObj.content.forEach((item) => {
        if (typeof item === 'string') {
          this.usfmString += item;
        } else {
          // Children of a char are nested, except `fv`, which never takes `+`.
          this.usjToUsfm(item, usjObj.type === 'char' && item.marker !== 'fv');
        }
      });
    }

    const attributes = [];
    Object.keys(usjObj).forEach((key) => {
      if (!NON_ATTRIB_USJ_KEYS.includes(key)) {
        const lhs = key === 'file' ? 'src' : key;
        attributes.push(`${lhs}="${usjObj[key]}"`);
      }
    });
    if (attributes.length > 0) {
      this.usfmString += `|${attributes.join(' ')}`;
    }

    if (CLOSING_USJ_TYPES.includes(usjObj.type)) {
      this.usfmString += '\\';
      if (isNestedChar) {
        this.usfmString += '+';
      }
      this.usfmString += `${marker}* `;
    }

    if (usjObj.type === 'ms') {
      if ('sid' in usjObj) {
        // Open the attribute list, or separate `sid` from the attributes
        // already written so they do not run together.
        this.usfmString += attributes.length === 0 ? '|' : ' ';
        this.usfmString += `sid="${usjObj.sid}" `;
      }
      this.usfmString = `${this.usfmString.trim()}\\*`;
    }

    if (usjObj.type === 'sidebar') {
      this.usfmString += '\\esbe';
    }

    if (
      !NO_NEWLINE_USJ_TYPES.includes(usjObj.type) &&
      this.usfmString[this.usfmString.length - 1] !== '\n'
    ) {
      this.usfmString += '\n';
    }
    return this.usfmString;
  }

  /**
   * Appends the USFM for a USX element (and, recursively, its child
   * elements and text nodes) to `usfmString`.
   *
   * `verse` and `chapter` elements that carry an `eid` attribute produce no
   * output.
   *
   * @param {object} xmlObj - DOM-style element exposing `tagName`,
   *   `hasAttribute`, `getAttribute`, `attributes` and `childNodes`.
   * @param {boolean} [nested=false] - When true and the element is a `char`
   *   whose style is not `xt`, `fv` or `ref`, the marker gets a `+` prefix.
   * @returns {void}
   */
  usxToUsfm(xmlObj, nested = false) {
    const objType = xmlObj.tagName;
    let marker = null;
    const usfmAttributes = [];

    if (['verse', 'chapter'].includes(objType) && xmlObj.hasAttribute('eid')) {
      return;
    }

    if (!NO_NEWLINE_USX_TYPES.includes(objType)) {
      this.usfmString += '\n';
    }

    if (objType === 'optbreak') {
      if (this.usfmString !== '' && !['\n', '\r', ' ', '\t'].includes(this.usfmString.slice(-1))) {
        this.usfmString += ' ';
      }
      this.usfmString += '// ';
    }

    if (xmlObj.hasAttribute('style')) {
      marker = xmlObj.getAttribute('style');
      if (nested && objType === 'char' && !['xt', 'fv', 'ref'].includes(marker)) {
        marker = `+${marker}`;
      }
      this.usfmString += `\\${marker} `;
    } else if (objType === 'ref') {
      marker = 'ref';
      this.usfmString += `\\${marker} `;
    }

    if (xmlObj.hasAttribute('code')) {
      this.usfmString += xmlObj.getAttribute('code');
    }
    if (xmlObj.hasAttribute('number')) {
      this.usfmString += `${xmlObj.getAttribute('number')} `;
    }
    if (xmlObj.hasAttribute('caller')) {
      this.usfmString += `${xmlObj.getAttribute('caller')} `;
    }

    if (xmlObj.hasAttribute('altnumber')) {
      if (objType === 'verse') {
        this.usfmString += `\\va ${xmlObj.getAttribute('altnumber')}\\va*`;
      } else if (objType === 'chapter') {
        this.usfmString += `\n\\ca ${xmlObj.getAttribute('altnumber')}\\ca*`;
      }
    }
    if (xmlObj.hasAttribute('pubnumber')) {
      if (objType === 'verse') {
        this.usfmString += `\\vp ${xmlObj.getAttribute('pubnumber')}\\vp*`;
      } else if (objType === 'chapter') {
        this.usfmString += `\n\\cp ${xmlObj.getAttribute('pubnumber')}`;
      }
    }
    if (xmlObj.hasAttribute('category')) {
      this.usfmString += `\n\\cat ${xmlObj.getAttribute('category')} \\cat*`;
    }

    for (const child of Array.from(xmlObj.childNodes)) {
      if (child.nodeType === 1) {
        // Element node: only children of a char are treated as nested.
        this.usxToUsfm(child, objType === 'char');
      }
      if (child.nodeType === 3 && child.nodeValue.trim()) {
        // Text node with content: keep it separated from what precedes it.
        if (this.usfmString !== '' &&
          !['\n', '\r', ' ', '\t'].includes(this.usfmString.slice(-1))) {
          this.usfmString += ' ';
        }
        this.usfmString += child.nodeValue.trim();
      }
    }

    for (const attrNode of Array.from(xmlObj.attributes)) {
      const key = attrNode.name;
      const val = attrNode.value.replace(/"/g, '');
      if (key === 'file' && objType === 'figure') {
        usfmAttributes.push(`src="${val}"`);
      } else if (['sid', 'eid'].includes(key) && objType === 'ms') {
        // Milestones keep their sid/eid. Handled before the generic branch so
        // the attribute is written exactly once whether or not the key is in
        // the exclusion list.
        usfmAttributes.push(`${key}="${val}"`);
      } else if (!NON_ATTRIB_USX_KEYS.includes(key)) {
        usfmAttributes.push(`${key}="${val}"`);
      }
    }
    if (usfmAttributes.length > 0) {
      this.usfmString += '|';
      this.usfmString += usfmAttributes.join(' ');
    }

    // NOTE(review): the USJ closing-type list is applied to USX tag names
    // here; presumably the names coincide -- confirm against ./utils/types.
    const isClosed = xmlObj.hasAttribute('closed') && xmlObj.getAttribute('closed') === 'true';
    if (isClosed || CLOSING_USJ_TYPES.includes(objType) || usfmAttributes.length > 0) {
      if (objType === 'ms') {
        this.usfmString += '\\*';
      } else if (marker !== null) {
        // Without a marker there is nothing to close; never write `\null*`.
        this.usfmString += `\\${marker}*`;
      }
    }

    if (objType === 'sidebar') {
      this.usfmString += '\n\\esbe\n';
    }
  }

  /**
   * Appends USFM for a BibleNLP-style verse list to `usfmString`.
   *
   * Only the first book encountered is written; when a second book shows up
   * a warning is recorded and processing stops. A reference with no matching
   * entry in `text` is written as an empty verse and recorded as a warning.
   *
   * @param {{vref: string[], text: string[]}} bibleNlpObj - Parallel arrays
   *   of references (e.g. `GEN 1:1`) and verse texts.
   * @returns {void}
   * @throws {Error} If a reference does not match the expected
   *   `BOOK chapter:verse` pattern.
   */
  bibleNlptoUsfm(bibleNlpObj) {
    const vrefPattern = /([a-zA-Z0-9]{3}) (\d+):(.*)/;
    let currBook = null;
    let currChapter = null;

    for (let i = 0; i < bibleNlpObj.vref.length; i++) {
      const vref = bibleNlpObj.vref[i];
      const refMatch = vref.match(vrefPattern);
      if (!refMatch) {
        throw new Error(`Incorrect format: ${vref}.\nIn BibleNlp, vref should have ` +
          'three-letter book code, chapter, and verse in the following format: GEN 1:1');
      }
      const book = refMatch[1].toUpperCase();
      const chap = refMatch[2];
      const verse = refMatch[3];

      if (book !== currBook) {
        if (currBook !== null) {
          this.warnings.push('USFM can contain only one book per file. ' +
            `Only ${currBook} is processed. Specify bookCode for other books.`);
          break;
        }
        this.usfmString += `\\id ${book}`;
        currBook = book;
      }

      if (chap !== currChapter) {
        this.usfmString += `\n\\c ${chap}\n\\p\n`;
        currChapter = chap;
      }

      let verseText = bibleNlpObj.text[i];
      if (verseText === undefined || verseText === null) {
        // Avoid writing the literal string "undefined"/"null" into the output.
        this.warnings.push(`No text found for ${vref}. An empty verse is written.`);
        verseText = '';
      }

      if (!this.usfmString.endsWith('\n')) {
        this.usfmString += ' ';
      }
      this.usfmString += `\\v ${verse} ${verseText}`;
    }
  }
}
exports.USFMGenerator = USFMGenerator;