/*
 * usfm-grammar
 * Version:
 * Uses the tree-sitter-usfm3 parser to convert USFM files to other formats
 * such as USJ, USX, and CSV, and to convert them back to USFM.
 * 591 lines (528 loc) • 20.5 kB
 * JavaScript
 */
// Logic for converting the syntax tree to XML (USX)
const { DOMImplementation } = require('@xmldom/xmldom');
const {
PARA_STYLE_MARKERS,
NOTE_MARKERS,
CHAR_STYLE_MARKERS,
NESTED_CHAR_STYLE_MARKERS,
DEFAULT_ATTRIB_MAP,
TABLE_CELL_MARKERS,
MARKER_SETS,
} = require('./utils/markers.js');
const { createQueriesAsNeeded } = require('./queries.js');
/**
 * Walks a tree-sitter syntax tree of a USFM document and builds the
 * equivalent USX (XML) DOM tree.
 *
 * Every syntax-node type is routed through `node2Usx`, which looks the type up
 * in `dispatchMap` and calls the matching `node2Usx*` handler. Handlers append
 * the XML they create to the `parentXmlNode` they are given.
 */
class USXGenerator {
  /**
   * A binding for all methods used in generating USX from the syntax tree.
   * @param {object} treeSitterLanguageObj - The Tree-sitter language object
   * @param {string} usfmString - The USFM source text; it is indexed with the
   *   `startIndex`/`endIndex` of syntax nodes (presumably a JS string, since
   *   `substring` is used on it — TODO confirm against callers)
   * @param {Element} [usxRootElement] - The root element of the USX (optional).
   *   When omitted, a fresh `<usx version="3.1">` root is created.
   */
  constructor(treeSitterLanguageObj, usfmString, usxRootElement = null) {
    this.usfmLanguage = treeSitterLanguageObj;
    this.usfm = usfmString;

    if (usxRootElement === null) {
      // Only build a document when the caller did not hand us a root.
      const doc = new DOMImplementation().createDocument(null, 'usx', null);
      this.xmlRootNode = doc.documentElement;
      this.xmlRootNode.setAttribute('version', '3.1');
    } else {
      this.xmlRootNode = usxRootElement;
    }

    // Cache for the query objects, keyed by query name.
    this.queries = {};
    this.createQuery = (name) => createQueriesAsNeeded(name, this.usfmLanguage);
    this.getQuery = (name) => {
      if (!this.queries[name]) {
        this.queries[name] = this.createQuery(name);
      }
      return this.queries[name];
    };

    this.markerSets = MARKER_SETS;

    // Running state shared between handlers while the tree is walked.
    this.parseState = {
      bookSlug: null, // book code read from \id
      currentChapter: null, // number of the chapter being processed
      prevVerseSid: null, // sid of the verse that is still open
      prevChapterSid: null, // sid of the chapter that is still open
      prevVerse: null, // XML node of the verse that is still open
      prevVerseParent: null, // XML node that receives the open verse's eid
    };

    // Maps a syntax-node type to the (bound) handler that converts it.
    this.dispatchMap = this.populateDispatchMap();
  }

  /**
   * Builds the node-type -> handler lookup used by `node2Usx`.
   *
   * Registration order matters: a later registration for the same marker
   * replaces an earlier one.
   * @returns {Map<string, Function>} handlers bound to this instance, each
   *   taking `(node, parentXmlNode)`
   */
  populateDispatchMap() {
    const dispatch = new Map();
    const addHandlers = (markers, handler) => {
      const boundHandler = handler.bind(this);
      markers.forEach((marker) => dispatch.set(marker, boundHandler));
    };

    addHandlers(['text'], this.pushTextNode);
    addHandlers(['verseText'], this.handleVerseText);
    addHandlers(['v'], this.node2UsxVerse);
    addHandlers(['id'], this.node2UsxId);
    addHandlers(['chapter'], this.node2UsxChapter);
    // \usfm produces no USX output.
    dispatch.set('usfm', () => {});

    // NOTE(review): 'q' and 'w' are registered for node2UsxPara although that
    // handler only acts on 'paragraph', 'pi' and 'ph' nodes (or Block
    // wrappers); later registrations below may override them — confirm intent.
    addHandlers(['paragraph', 'q', 'w'], this.node2UsxPara);
    // 'cd' replaces a duplicated 'cl': node2UsxC forwards both 'cl' and 'cd'
    // children to node2Usx, so both need a handler.
    addHandlers(['cl', 'cd', 'cp', 'vp'], this.node2UsxGeneric);
    addHandlers(['ca', 'va'], this.node2UsxCaVa);
    addHandlers(['table', 'tr'], this.node2UsxTable);
    addHandlers(['milestone', 'zNameSpace'], this.node2UsxMilestone);
    addHandlers(['esb', 'cat', 'fig', 'ref'], this.node2UsxSpecial);
    addHandlers(NOTE_MARKERS, this.node2UsxNotes);
    addHandlers(
      [CHAR_STYLE_MARKERS, NESTED_CHAR_STYLE_MARKERS, 'xt_standalone'].flat(),
      this.node2UsxChar,
    );
    addHandlers(TABLE_CELL_MARKERS, this.node2UsxTable);
    addHandlers(
      PARA_STYLE_MARKERS.filter((marker) => marker !== 'usfm'),
      this.node2UsxGeneric,
    );
    return dispatch;
  }

  /**
   * Returns the USFM source text covered by a syntax node.
   * @param {SyntaxNode} node - Any node exposing `startIndex` and `endIndex`
   * @returns {string} the untrimmed source slice
   */
  getNodeText(node) {
    return this.usfm.slice(node.startIndex, node.endIndex);
  }

  /**
   * Builds the ID node in USX and remembers the book code for later sids.
   * @param {SyntaxNode} node - The syntax node
   * @param {Element} parentXmlNode - The parent XML node to append the ID to
   */
  node2UsxId(node, parentXmlNode) {
    let code = null;
    let desc = null;
    this.getQuery('id')
      .captures(node)
      .forEach((capture) => {
        if (capture.name === 'book-code') {
          code = this.getNodeText(capture.node);
        } else if (capture.name === 'desc') {
          desc = this.getNodeText(capture.node);
        }
      });

    const xmlDoc = parentXmlNode.ownerDocument;
    const bookXmlNode = xmlDoc.createElement('book');
    bookXmlNode.setAttribute('code', code);
    bookXmlNode.setAttribute('style', 'id');
    this.parseState.bookSlug = code;

    // The description is optional; only non-blank text is emitted.
    const description = desc ? desc.trim() : '';
    if (description !== '') {
      bookXmlNode.appendChild(xmlDoc.createTextNode(description));
    }
    parentXmlNode.appendChild(bookXmlNode);
  }

  /**
   * Builds the opening `<chapter>` milestone for a `c` node and then converts
   * its `cl` / `cd` children as siblings of that milestone.
   * @param {SyntaxNode} node - The `c` syntax node
   * @param {Element} parentXmlNode - The XML node to append to
   */
  node2UsxC(node, parentXmlNode) {
    const chapCaptures = this.getQuery('chapter').captures(node);
    // The first capture is taken to be the chapter number.
    const chapNum = this.getNodeText(chapCaptures[0].node);
    const chapRef = `${this.parseState.bookSlug} ${chapNum}`;
    this.parseState.prevChapterSid = chapRef;
    this.parseState.currentChapter = chapNum;

    const chapXmlNode = parentXmlNode.ownerDocument.createElement('chapter');
    chapXmlNode.setAttribute('number', chapNum);
    chapXmlNode.setAttribute('style', 'c');
    chapXmlNode.setAttribute('sid', chapRef);
    chapCaptures.forEach((capture) => {
      if (capture.name === 'alt-num') {
        chapXmlNode.setAttribute(
          'altnumber',
          this.getNodeText(capture.node).trim(),
        );
      }
      if (capture.name === 'pub-num') {
        chapXmlNode.setAttribute(
          'pubnumber',
          this.getNodeText(capture.node).trim(),
        );
      }
    });
    parentXmlNode.appendChild(chapXmlNode);

    node.children.forEach((child) => {
      if (['cl', 'cd'].includes(child.type)) {
        this.node2Usx(child, parentXmlNode);
      }
    });
  }

  /**
   * Converts the children of a `verseText` node and records the XML parent
   * they were written to, so the verse's closing `eid` lands after its text.
   * @param {SyntaxNode} node - The `verseText` syntax node
   * @param {Element} parentXmlNode - The XML node to append to
   */
  handleVerseText(node, parentXmlNode) {
    node.children.forEach((child) => this.node2Usx(child, parentXmlNode));
    this.parseState.prevVerseParent = parentXmlNode;
  }

  /**
   * Converts a `chapter` node: all of its children first, then the closing
   * `eid` milestones for the still-open verse (if any) and for the chapter.
   * @param {SyntaxNode} node - The `chapter` syntax node
   * @param {Element} parentXmlNode - The XML node to append to
   */
  node2UsxChapter(node, parentXmlNode) {
    node.children.forEach((child) => {
      if (child.type === 'c') {
        this.node2UsxC(child, parentXmlNode);
      } else {
        this.node2Usx(child, parentXmlNode);
      }
    });

    const xmlDoc = parentXmlNode.ownerDocument;

    // End of chapter: close the last verse if it is still open.
    const lastVerse = this.parseState.prevVerse;
    if (lastVerse && !lastVerse.getAttribute('eid')) {
      const vEndXmlNode = xmlDoc.createElement('verse');
      vEndXmlNode.setAttribute('eid', this.parseState.prevVerseSid);
      this.parseState.prevVerseSid = null;
      this.parseState.prevVerse = null;
      this.parseState.prevVerseParent = null;

      // Put the verse end inside the last para (or last table row) when the
      // chapter ends with one; otherwise directly under the parent.
      const siblings = parentXmlNode.childNodes;
      const lastSibling = siblings[siblings.length - 1];
      let verseEndParent = parentXmlNode;
      if (lastSibling?.tagName === 'para') {
        verseEndParent = lastSibling;
      } else if (lastSibling?.tagName === 'table') {
        const rows = lastSibling.getElementsByTagName('row');
        // A table without rows falls back to the parent instead of throwing.
        verseEndParent = rows[rows.length - 1] ?? parentXmlNode;
      }
      verseEndParent.appendChild(vEndXmlNode);
    }

    const cEndXmlNode = xmlDoc.createElement('chapter');
    cEndXmlNode.setAttribute('eid', this.parseState.prevChapterSid);
    this.parseState.prevChapterSid = null;
    parentXmlNode.appendChild(cEndXmlNode);
  }

  /**
   * Scans the children of `parentXmlNode`'s parent backwards, starting at the
   * second-to-last child, and returns the first one that is not a `sidebar`
   * or `ms` element and whose style is neither `ca` nor `cp`.
   * @param {Element} parentXmlNode - The XML node whose "uncle" is wanted
   * @returns {Element|null} the matching node, or null when none qualifies
   */
  findPrevUncle(parentXmlNode) {
    const grandParent = parentXmlNode.parentNode;
    for (let i = grandParent.childNodes.length - 2; i >= 0; i--) {
      const uncle = grandParent.childNodes[i];
      const isSkipped =
        uncle.tagName === 'sidebar' ||
        uncle.tagName === 'ms' ||
        ['ca', 'cp'].includes(uncle.getAttribute('style'));
      if (!isSkipped) {
        return uncle;
      }
    }
    return null;
  }

  /**
   * Builds the opening `<verse>` milestone for a `v` node, first closing the
   * previously opened verse (if any) with an `eid` milestone.
   * @param {SyntaxNode} node - The `v` syntax node
   * @param {Element} parentXmlNode - The XML node to append to
   */
  node2UsxVerse(node, parentXmlNode) {
    if (this.parseState.prevVerseSid) {
      // Close the previous verse where its content was last written; if that
      // was never recorded, fall back to the current parent rather than throw.
      const prevPara = this.parseState.prevVerseParent ?? parentXmlNode;
      const vEndXmlNode = prevPara.ownerDocument.createElement('verse');
      vEndXmlNode.setAttribute('eid', this.parseState.prevVerseSid);
      prevPara.appendChild(vEndXmlNode);
    }

    const verseCaptures = this.getQuery('verseNumCap').captures(node);
    // The first capture is taken to be the verse number.
    const verseNum = this.getNodeText(verseCaptures[0].node).trim();

    const vXmlNode = parentXmlNode.ownerDocument.createElement('verse');
    this.parseState.prevVerse = vXmlNode;
    parentXmlNode.appendChild(vXmlNode);
    // Until verse text is seen, the verse's own parent is where its eid goes.
    // This keeps a text-less verse from being closed in a stale paragraph.
    this.parseState.prevVerseParent = parentXmlNode;

    verseCaptures.forEach((capture) => {
      if (capture.name === 'alt') {
        vXmlNode.setAttribute('altnumber', this.getNodeText(capture.node));
      } else if (capture.name === 'vp') {
        vXmlNode.setAttribute(
          'pubnumber',
          this.getNodeText(capture.node).trim(),
        );
      }
    });

    const { bookSlug, currentChapter } = this.parseState;
    const ref = `${bookSlug} ${currentChapter}:${verseNum}`.trim();
    vXmlNode.setAttribute('number', verseNum);
    vXmlNode.setAttribute('style', 'v');
    vXmlNode.setAttribute('sid', ref);
    this.parseState.prevVerseSid = ref;
  }

  /**
   * Builds a `<char>` element for a `ca` or `va` node that stands apart from
   * its `c` / `v` marker.
   * @param {SyntaxNode} node - The `ca` or `va` syntax node
   * @param {Element} parentXmlNode - The XML node to append to
   */
  node2UsxCaVa(node, parentXmlNode) {
    const charXmlNode = parentXmlNode.ownerDocument.createElement('char');
    charXmlNode.setAttribute('style', node.type);

    // The first capture holds the alternate chapter/verse number.
    const altNumCaptures = this.getQuery('usjCaVa').captures(node);
    const altNum = this.getNodeText(altNumCaptures[0].node).trim();
    charXmlNode.setAttribute('altnumber', altNum);
    charXmlNode.setAttribute('closed', 'true');

    parentXmlNode.appendChild(charXmlNode);
  }

  /**
   * Builds `<para>` elements in USX.
   *
   * A node whose first child is a `*Block` is unwrapped and each block child
   * is processed by this method again. `paragraph` nodes take their style
   * from the node type captured by the `para` query; `pi` / `ph` nodes take it
   * from the text of their first child.
   * @param {SyntaxNode} node - The syntax node
   * @param {Element} parentXmlNode - The XML node to append to
   */
  node2UsxPara(node, parentXmlNode) {
    const firstChild = node.children[0];
    if (firstChild.type.endsWith('Block')) {
      for (const child of firstChild.children) {
        this.node2UsxPara(child, parentXmlNode);
      }
      return;
    }

    if (node.type === 'paragraph') {
      const paraTagCap = this.getQuery('para').captures(node)[0];
      const paraMarker = paraTagCap.node.type;
      if (paraMarker.endsWith('Block')) {
        return;
      }
      const paraXmlNode = parentXmlNode.ownerDocument.createElement('para');
      paraXmlNode.setAttribute('style', paraMarker);
      parentXmlNode.appendChild(paraXmlNode);
      // Child 0 is the marker itself; the rest is paragraph content.
      for (const child of paraTagCap.node.children.slice(1)) {
        this.node2Usx(child, paraXmlNode);
      }
      return;
    }

    if (['pi', 'ph'].includes(node.type)) {
      const paraMarker = this.getNodeText(firstChild).replace('\\', '').trim();
      const paraXmlNode = parentXmlNode.ownerDocument.createElement('para');
      paraXmlNode.setAttribute('style', paraMarker);
      parentXmlNode.appendChild(paraXmlNode);
      for (const child of node.children.slice(1)) {
        this.node2Usx(child, paraXmlNode);
      }
    }
  }

  /**
   * Builds a `<note>` element for footnotes and cross-references.
   * Child 0 is read as the marker, child 1 as the caller; the last child is
   * skipped (presumably the closing marker — TODO confirm against grammar).
   * @param {SyntaxNode} node - The note syntax node
   * @param {Element} parentXmlNode - The XML node to append to
   */
  node2UsxNotes(node, parentXmlNode) {
    const [tagNode, callerNode] = node.children;
    const style = this.getNodeText(tagNode).replace('\\', '').trim();
    const caller = this.getNodeText(callerNode).trim();

    const noteXmlNode = parentXmlNode.ownerDocument.createElement('note');
    noteXmlNode.setAttribute('style', style);
    noteXmlNode.setAttribute('caller', caller);
    parentXmlNode.appendChild(noteXmlNode);

    for (let i = 2; i < node.children.length - 1; i++) {
      this.node2Usx(node.children[i], noteXmlNode);
    }
  }

  /**
   * Builds a `<char>` element for character markup, both regular and nested.
   * The style is the marker text without its backslash and `+` prefix.
   * @param {SyntaxNode} node - The character-style syntax node
   * @param {Element} parentXmlNode - The XML node to append to
   */
  node2UsxChar(node, parentXmlNode) {
    const { children } = node;
    const tagNode = children[0];

    // Shrink the content range by one for every non-leading child that is a
    // marker token (starts with a backslash, is '*', or ends with 'Tag').
    let childrenRange = children.length;
    for (let i = children.length - 1; i > 0; i--) {
      const { type } = children[i];
      if (type.startsWith('\\') || type === '*' || type.endsWith('Tag')) {
        childrenRange -= 1;
      }
    }

    const style = this.getNodeText(tagNode)
      .replace('\\', '')
      .replace('+', '')
      .trim();
    const charXmlNode = parentXmlNode.ownerDocument.createElement('char');
    charXmlNode.setAttribute('style', style);
    parentXmlNode.appendChild(charXmlNode);

    for (let i = 1; i < childrenRange; i++) {
      this.node2Usx(children[i], charXmlNode);
    }
  }

  /**
   * Copies a USFM attribute onto the given USX element.
   *
   * A bare `|` name is resolved through DEFAULT_ATTRIB_MAP using the parent
   * node type (with 'Nested' removed); `src` is renamed to `file`.
   * @param {SyntaxNode} node - The attribute syntax node
   * @param {Element} parentXmlNode - The XML element that receives it
   */
  node2UsxAttrib(node, parentXmlNode) {
    let attribName = this.getNodeText(node.children[0]).trim();

    if (attribName === '|') {
      const parentType = node.parent.type.replace('Nested', '');
      // NOTE(review): no guard for a parent type missing from the map.
      attribName = DEFAULT_ATTRIB_MAP[parentType];
    }
    if (attribName === 'src') {
      // for \fig
      attribName = 'file';
    }

    const attribValCap = this.getQuery('attribVal').captures(node);
    const attribValue =
      attribValCap.length > 0
        ? this.getNodeText(attribValCap[0].node).trim()
        : '';
    parentXmlNode.setAttribute(attribName, attribValue);
  }

  /**
   * Converts table components: `table` -> `<table>`, `tr` -> `<row>`, and
   * table cell markers -> `<cell>` with an `align` derived from the style.
   * @param {SyntaxNode} node - The table, row or cell syntax node
   * @param {Element} parentXmlNode - The XML node to append to
   */
  node2UsxTable(node, parentXmlNode) {
    const xmlDoc = parentXmlNode.ownerDocument;

    if (node.type === 'table') {
      const tableXmlNode = xmlDoc.createElement('table');
      parentXmlNode.appendChild(tableXmlNode);
      node.children.forEach((child) => this.node2Usx(child, tableXmlNode));
      return;
    }

    if (node.type === 'tr') {
      const rowXmlNode = xmlDoc.createElement('row');
      rowXmlNode.setAttribute('style', 'tr');
      parentXmlNode.appendChild(rowXmlNode);
      node.children
        .slice(1)
        .forEach((child) => this.node2Usx(child, rowXmlNode));
      return;
    }

    if (this.markerSets.TABLE_CELL_MARKERS.has(node.type)) {
      const style = this.getNodeText(node.children[0])
        .replace('\\', '')
        .trim();

      // 'tcc'/'thc' styles are centred, any other style containing 'r' is
      // end-aligned, everything else is start-aligned.
      let align = 'start';
      if (style.includes('tcc') || style.includes('thc')) {
        align = 'center';
      } else if (style.includes('r')) {
        align = 'end';
      }

      const cellXmlNode = xmlDoc.createElement('cell');
      cellXmlNode.setAttribute('style', style);
      cellXmlNode.setAttribute('align', align);
      parentXmlNode.appendChild(cellXmlNode);
      node.children
        .slice(1)
        .forEach((child) => this.node2Usx(child, cellXmlNode));
    }
  }

  /**
   * Builds an `<ms>` element and applies the node's attribute children to it.
   * @param {SyntaxNode} node - The milestone syntax node
   * @param {Element} parentXmlNode - The XML node to append to
   */
  node2UsxMilestone(node, parentXmlNode) {
    const msNameCap = this.getQuery('milestone').captures(node)[0];
    const style = this.getNodeText(msNameCap.node).replace('\\', '').trim();

    const msXmlNode = parentXmlNode.ownerDocument.createElement('ms');
    msXmlNode.setAttribute('style', style);
    parentXmlNode.appendChild(msXmlNode);

    node.children
      .filter((child) => child.type.endsWith('Attribute'))
      .forEach((child) => this.node2Usx(child, msXmlNode));
  }

  /**
   * Builds the USX for `esb` (sidebar), `cat` (category attribute on the
   * parent), `fig` (figure) and `ref` nodes. For the element-producing types
   * the first and last child of the node are skipped.
   * @param {SyntaxNode} node - The syntax node
   * @param {Element} parentXmlNode - The XML node to append to / annotate
   */
  node2UsxSpecial(node, parentXmlNode) {
    const xmlDoc = parentXmlNode.ownerDocument;
    const convertInnerChildren = (targetXmlNode) => {
      node.children
        .slice(1, -1)
        .forEach((child) => this.node2Usx(child, targetXmlNode));
    };

    switch (node.type) {
      case 'esb': {
        const sidebarXmlNode = xmlDoc.createElement('sidebar');
        sidebarXmlNode.setAttribute('style', 'esb');
        parentXmlNode.appendChild(sidebarXmlNode);
        convertInnerChildren(sidebarXmlNode);
        break;
      }
      case 'cat': {
        const catCap = this.getQuery('category').captures(node)[0];
        parentXmlNode.setAttribute(
          'category',
          this.getNodeText(catCap.node).trim(),
        );
        break;
      }
      case 'fig': {
        const figXmlNode = xmlDoc.createElement('figure');
        figXmlNode.setAttribute('style', 'fig');
        parentXmlNode.appendChild(figXmlNode);
        convertInnerChildren(figXmlNode);
        break;
      }
      case 'ref': {
        const refXmlNode = xmlDoc.createElement('ref');
        parentXmlNode.appendChild(refXmlNode);
        convertInnerChildren(refXmlNode);
        break;
      }
      default:
        break;
    }
  }

  /**
   * Builds a `<para>` for a generic paragraph-style marker.
   *
   * The style is the marker text of the first child when it starts with a
   * backslash, otherwise the node type. Character-style and other nestable
   * children go inside the para; every other child is converted as a sibling
   * under `parentXmlNode`.
   * @param {SyntaxNode} node - The syntax node
   * @param {Element} parentXmlNode - The XML node to append to
   */
  node2UsxGeneric(node, parentXmlNode) {
    const tagText = this.getNodeText(node.children[0]).trim();
    const style = tagText.startsWith('\\')
      ? tagText.replace('\\', '')
      : node.type;
    if (style === 'usfm') {
      return;
    }

    const paraXmlNode = parentXmlNode.ownerDocument.createElement('para');
    paraXmlNode.setAttribute('style', style);
    parentXmlNode.appendChild(paraXmlNode);

    const nestableSets = [
      this.markerSets.CHAR_STYLE_MARKERS,
      this.markerSets.NESTED_CHAR_STYLE_MARKERS,
      this.markerSets.OTHER_PARA_NESTABLES,
    ];
    // Child 0 is the marker; the remaining children are content.
    for (const child of node.children.slice(1)) {
      const nestsInPara = nestableSets.some((markerSet) =>
        markerSet.has(child.type),
      );
      this.node2Usx(child, nestsInPara ? paraXmlNode : parentXmlNode);
    }
  }

  /**
   * Appends the node's source text as an XML text node, with every `~`
   * replaced by a space. Empty text is not emitted.
   * @param {SyntaxNode} node - The `text` syntax node
   * @param {Element} parentXmlNode - The XML node to append to
   */
  pushTextNode(node, parentXmlNode) {
    // A string pattern in replace() would only touch the first '~'.
    const textVal = this.getNodeText(node).replace(/~/g, ' ');
    if (textVal !== '') {
      parentXmlNode.appendChild(
        parentXmlNode.ownerDocument.createTextNode(textVal),
      );
    }
  }

  /**
   * Converts one syntax node by dispatching on its type (leading backslash
   * removed). Types without a handler are treated as attributes when the type
   * ends with 'Attribute', skipped when blank or `|`, and otherwise unwrapped
   * by converting their children into the same parent.
   * @param {SyntaxNode} node - The syntax node to convert
   * @param {Element} parentXmlNode - The XML node to append to
   */
  node2Usx(node, parentXmlNode) {
    const nodeType = node.type?.replace('\\', '');
    const handler = this.dispatchMap.get(nodeType);
    if (handler) {
      handler(node, parentXmlNode);
      return;
    }
    if (!nodeType) {
      return;
    }
    if (node.type.endsWith('Attribute')) {
      this.node2UsxAttrib(node, parentXmlNode);
      return;
    }
    if (['', '|'].includes(node.type.trim())) {
      // Skip whitespace and bare separator nodes.
      return;
    }
    node.children.forEach((child) => this.node2Usx(child, parentXmlNode));
  }
}
// Expose the generator class on this module's CommonJS exports.
exports.USXGenerator = USXGenerator;