usfm-grammar
Version:
Uses the tree-sitter-usfm3 parser to convert USFM files to other formats such as USJ, USX, and CSV, and converts them back to USFM
510 lines (462 loc) • 16.8 kB
JavaScript
// Logic for syntax-tree to dict (USJ) conversions
const {
PARA_STYLE_MARKERS,
NOTE_MARKERS,
CHAR_STYLE_MARKERS,
NESTED_CHAR_STYLE_MARKERS,
DEFAULT_ATTRIB_MAP,
TABLE_CELL_MARKERS,
MARKER_SETS,
} = require('./utils/markers.js');
const { createQueriesAsNeeded } = require('./queries.js');
/**
 * Walks a tree-sitter syntax tree of a USFM document and builds the
 * corresponding USJ (JSON) object tree.
 *
 * Every `nodeToUSJ*` method takes a syntax node and the USJ object that is
 * currently being filled, and mutates that object (usually by pushing onto
 * its `content` array). Text is always read from `this.usfm` using the
 * node's `startIndex` / `endIndex`.
 */
class USJGenerator {
  /**
   * @param {object} treeSitterLanguageObj - Language object handed to
   *   `createQueriesAsNeeded` when a query is first requested.
   * @param {string} usfmString - The USFM source the syntax tree was built from.
   * @param {object|null} [usjRootObj=null] - Existing USJ root to use; when
   *   falsy a fresh `{ type: 'USJ', version: '3.1', content: [] }` is created.
   */
  constructor(treeSitterLanguageObj, usfmString, usjRootObj = null) {
    this.usfmLanguage = treeSitterLanguageObj;
    this.usfm = usfmString;
    this.jsonRootObj = usjRootObj || {
      type: 'USJ',
      version: '3.1',
      content: [],
    };

    // Cache for the query objects, keyed by query name.
    this.queries = {};
    // Queries are created lazily: creating tree-sitter queries is nearly all
    // of the overhead (more than the single tree traversal or the node
    // generation and allocation), so a query is only built when it is
    // actually needed. The names are stringly typed.
    this.getQuery = (name) => {
      if (!this.queries[name]) {
        this.queries[name] = this.createQuery(name);
      }
      return this.queries[name];
    };
    this.createQuery = (name) => createQueriesAsNeeded(name, this.usfmLanguage);

    // Sets of markers for O(1) membership checks.
    this.markerSets = MARKER_SETS;

    // State carried across nodes so chapter/verse `sid` values can be built.
    this.parseState = {
      bookSlug: null,
      currentChapter: null,
    };

    // Maps a node type to its (bound) handler function.
    this.dispatchMap = this.populateDispatchMap();
  }

  /**
   * Builds the `book` object for an `id` node and records the book code in
   * `parseState.bookSlug`.
   *
   * @param {object} node - Syntax node the 'id' query is run against.
   * @param {object} parentJsonObj - USJ object whose `content` receives the book.
   */
  nodeToUSJId(node, parentJsonObj) {
    const idCaptures = this.getQuery('id').captures(node);
    let code = null;
    let desc = null;
    idCaptures.forEach((capture) => {
      if (capture.name === 'book-code') {
        code = this.usfm.slice(capture.node.startIndex, capture.node.endIndex);
      } else if (capture.name === 'desc') {
        desc = this.usfm.slice(capture.node.startIndex, capture.node.endIndex);
      }
    });

    const bookJsonObj = {
      type: 'book',
      marker: 'id',
      code: code,
      content: [],
    };
    this.parseState.bookSlug = code;
    // The description is optional; whitespace-only descriptions are dropped.
    if (desc && desc.trim() !== '') {
      bookJsonObj.content.push(desc.trim());
    }
    parentJsonObj.content.push(bookJsonObj);
  }

  /**
   * Builds the `chapter` milestone object for a `c` node, records the chapter
   * number in `parseState.currentChapter`, then converts any `cl` / `cd`
   * children into the same parent.
   *
   * @param {object} node - The `c` syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the chapter.
   */
  nodeToUSJC(node, parentJsonObj) {
    const chapCap = this.getQuery('chapter').captures(node);
    // The first capture is taken as the chapter number.
    const chapNum = this.usfm.slice(
      chapCap[0].node.startIndex,
      chapCap[0].node.endIndex,
    );
    const chapRef = `${this.parseState.bookSlug} ${chapNum}`;
    const chapJsonObj = {
      type: 'chapter',
      marker: 'c',
      number: chapNum,
      sid: chapRef,
    };
    this.parseState.currentChapter = chapNum;

    chapCap.forEach((cap) => {
      if (cap.name === 'alt-num') {
        chapJsonObj.altnumber = this.usfm
          .substring(cap.node.startIndex, cap.node.endIndex)
          .trim();
      }
      if (cap.name === 'pub-num') {
        chapJsonObj.pubnumber = this.usfm
          .substring(cap.node.startIndex, cap.node.endIndex)
          .trim();
      }
    });
    parentJsonObj.content.push(chapJsonObj);

    node.children.forEach((child) => {
      if (['cl', 'cd'].includes(child.type)) {
        this.nodeToUSJ(child, parentJsonObj);
      }
    });
  }

  /**
   * Converts a `chapter` node: `c` children go through `nodeToUSJC`, every
   * other child goes through the generic dispatcher.
   *
   * @param {object} node - The `chapter` syntax node.
   * @param {object} parentJsonObj - USJ object receiving the converted children.
   */
  nodeToUSJChapter(node, parentJsonObj) {
    node.children.forEach((child) => {
      if (child.type === 'c') {
        this.nodeToUSJC(child, parentJsonObj);
      } else {
        this.nodeToUSJ(child, parentJsonObj);
      }
    });
  }

  /**
   * Builds the `verse` object for a `v` node. The `sid` is composed from the
   * book and chapter stored in `parseState` plus the verse number.
   *
   * @param {object} node - The `v` syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the verse.
   */
  nodeToUSJVerse(node, parentJsonObj) {
    const verseNumCap = this.getQuery('verseNumCap').captures(node);
    // The first capture is taken as the verse number.
    const verseNum = this.usfm.substring(
      verseNumCap[0].node.startIndex,
      verseNumCap[0].node.endIndex,
    );
    const vJsonObj = {
      type: 'verse',
      marker: 'v',
      number: verseNum.trim(),
    };

    verseNumCap.forEach((capture) => {
      if (capture.name === 'alt') {
        vJsonObj.altnumber = this.usfm.slice(
          capture.node.startIndex,
          capture.node.endIndex,
        );
      } else if (capture.name === 'vp') {
        vJsonObj.pubnumber = this.usfm.substring(
          capture.node.startIndex,
          capture.node.endIndex,
        );
      }
    });

    const ref = `${this.parseState.bookSlug} ${this.parseState.currentChapter}:${verseNum}`;
    vJsonObj.sid = ref.trim();
    parentJsonObj.content.push(vJsonObj);
  }

  /**
   * Builds a `char` object for a `ca` / `va` node that appears on its own,
   * away from `c` and `v`. The node type is used as the marker.
   *
   * @param {object} node - The `ca` or `va` syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the element.
   */
  nodeToUSJCaVa(node, parentJsonObj) {
    const charJsonObj = {
      type: 'char',
      marker: node.type,
    };
    const altNumMatch = this.getQuery('usjCaVa').captures(node);
    charJsonObj.altnumber = this.usfm
      .slice(altNumMatch[0].node.startIndex, altNumMatch[0].node.endIndex)
      .trim();
    parentJsonObj.content.push(charJsonObj);
  }

  /**
   * Builds `para` objects.
   *
   * - If the first child's type ends with `Block`, each child of that block is
   *   converted recursively into the same parent.
   * - For a `paragraph` node the marker is the type of the first 'para' query
   *   capture; `b` produces a para without `content`.
   * - For `pi` / `ph` nodes the marker is read from the text of the first child.
   *
   * @param {object} node - Syntax node to convert.
   * @param {object} parentJsonObj - USJ object whose `content` receives the para(s).
   */
  nodeToUSJPara(node, parentJsonObj) {
    if (node.children[0].type.endsWith('Block')) {
      node.children[0].children.forEach((child) => {
        this.nodeToUSJPara(child, parentJsonObj);
      });
    } else if (node.type === 'paragraph') {
      const paraTagCap = this.getQuery('para').captures(node)[0];
      const paraMarker = paraTagCap.node.type;
      if (paraMarker === 'b') {
        parentJsonObj.content.push({ type: 'para', marker: paraMarker });
      } else if (!paraMarker.endsWith('Block')) {
        const paraJsonObj = { type: 'para', marker: paraMarker, content: [] };
        paraTagCap.node.children.forEach((child) => {
          this.nodeToUSJ(child, paraJsonObj);
        });
        parentJsonObj.content.push(paraJsonObj);
      }
    } else if (['pi', 'ph'].includes(node.type)) {
      const paraMarker = this.usfm
        .substring(node.children[0].startIndex, node.children[0].endIndex)
        .replace('\\', '')
        .trim();
      const paraJsonObj = { type: 'para', marker: paraMarker, content: [] };
      // The first child is the marker itself; the rest is content.
      node.children.slice(1).forEach((child) => {
        this.nodeToUSJ(child, paraJsonObj);
      });
      parentJsonObj.content.push(paraJsonObj);
    }
  }

  /**
   * Builds a `note` object (footnotes and cross-references). Child 0 is read
   * as the marker, child 1 as the caller, and the last child is skipped.
   *
   * @param {object} node - The note syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the note.
   */
  nodeToUSJNotes(node, parentJsonObj) {
    const tagNode = node.children[0];
    const callerNode = node.children[1];
    const style = this.usfm
      .substring(tagNode.startIndex, tagNode.endIndex)
      .replace('\\', '')
      .trim();
    const noteJsonObj = {
      type: 'note',
      marker: style,
      content: [],
    };
    noteJsonObj.caller = this.usfm
      .substring(callerNode.startIndex, callerNode.endIndex)
      .trim();
    // Children between the caller and the final child are the note content.
    for (let i = 2; i < node.children.length - 1; i++) {
      this.nodeToUSJ(node.children[i], noteJsonObj);
    }
    parentJsonObj.content.push(noteJsonObj);
  }

  /**
   * Builds a `char` object for character markup, both regular and nested.
   * The marker is the text of the first child with the backslash and the
   * first `+` removed.
   *
   * @param {object} node - The character-style syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the element.
   */
  nodeToUSJChar(node, parentJsonObj) {
    const tagNode = node.children[0];
    // Shrink the processed range by one for every non-first child that looks
    // like a tag: type starts with a backslash, is '*', or ends with 'Tag'.
    let childrenRange = node.children.length;
    for (let i = node.children.length - 1; i > 0; i--) {
      const childType = node.children[i].type;
      if (
        childType.startsWith('\\') ||
        childType === '*' ||
        childType.endsWith('Tag')
      ) {
        childrenRange -= 1;
      }
    }
    const style = this.usfm
      .substring(tagNode.startIndex, tagNode.endIndex)
      .replace('\\', '')
      .replace('+', '')
      .trim();
    const charJsonObj = {
      type: 'char',
      marker: style,
      content: [],
    };
    for (let i = 1; i < childrenRange; i++) {
      this.nodeToUSJ(node.children[i], charJsonObj);
    }
    parentJsonObj.content.push(charJsonObj);
  }

  /**
   * Builds table objects: `table`, `table:row` (for `tr`) and `table:cell`
   * (for node types in `markerSets.TABLE_CELL_MARKERS`). Other node types are
   * ignored.
   *
   * @param {object} node - The table, row or cell syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the element.
   */
  nodeToUSJTable(node, parentJsonObj) {
    if (node.type === 'table') {
      const tableJsonObj = { type: 'table', content: [] };
      node.children.forEach((child) => {
        this.nodeToUSJ(child, tableJsonObj);
      });
      parentJsonObj.content.push(tableJsonObj);
    } else if (node.type === 'tr') {
      const rowJsonObj = { type: 'table:row', marker: 'tr', content: [] };
      // The first child is the row marker itself.
      node.children.slice(1).forEach((child) => {
        this.nodeToUSJ(child, rowJsonObj);
      });
      parentJsonObj.content.push(rowJsonObj);
    } else if (this.markerSets.TABLE_CELL_MARKERS.has(node.type)) {
      const tagNode = node.children[0];
      const style = this.usfm
        .substring(tagNode.startIndex, tagNode.endIndex)
        .replace('\\', '')
        .trim();
      // Alignment is derived from the marker text: 'tcc'/'thc' -> center,
      // any other marker containing 'r' -> end, everything else -> start.
      let align = 'start';
      if (style.includes('tcc') || style.includes('thc')) {
        align = 'center';
      } else if (style.includes('r')) {
        align = 'end';
      }
      const cellJsonObj = {
        type: 'table:cell',
        marker: style,
        content: [],
        align,
      };
      node.children.slice(1).forEach((child) => {
        this.nodeToUSJ(child, cellJsonObj);
      });
      parentJsonObj.content.push(cellJsonObj);
    }
  }

  /**
   * Sets an attribute on the parent USJ object as a plain property
   * (`parentJsonObj[name] = value`).
   *
   * @param {object} node - The attribute syntax node; child 0 holds the name.
   * @param {object} parentJsonObj - USJ object that receives the property.
   */
  nodeToUSJAttrib(node, parentJsonObj) {
    const attribNameNode = node.children[0];
    let attribName = this.usfm
      .slice(attribNameNode.startIndex, attribNameNode.endIndex)
      .trim();

    // A bare '|' means no explicit name was given: look up the default
    // attribute for the parent node type (with 'Nested' stripped).
    if (attribName === '|') {
      let parentType = node.parent.type;
      if (parentType.includes('Nested')) {
        parentType = parentType.replace('Nested', '');
      }
      attribName = DEFAULT_ATTRIB_MAP[parentType];
    }
    if (attribName === 'src') {
      // for \fig
      attribName = 'file';
    }

    const attribValCap = this.getQuery('attribVal').captures(node);
    let attribValue = '';
    if (attribValCap.length > 0) {
      attribValue = this.usfm
        .substring(
          attribValCap[0].node.startIndex,
          attribValCap[0].node.endIndex,
        )
        .trim();
    }
    parentJsonObj[attribName] = attribValue;
  }

  /**
   * Builds an `ms` (milestone) object. Only children whose type ends with
   * `Attribute` are converted.
   *
   * @param {object} node - The milestone syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the milestone.
   */
  nodeToUSJMilestone(node, parentJsonObj) {
    const msNameCap = this.getQuery('milestone').captures(node)[0];
    // slice, not substring. Hence not using util fxn extractAndCleanMarker
    const style = this.usfm
      .slice(msNameCap.node.startIndex, msNameCap.node.endIndex)
      .replace('\\', '')
      .trim();
    const msJsonObj = { type: 'ms', marker: style, content: [] };
    node.children.forEach((child) => {
      if (child.type.endsWith('Attribute')) {
        this.nodeToUSJ(child, msJsonObj);
      }
    });
    // Though normally milestones don't have contents, custom z-namespaces
    // could have them; drop the array only when it stayed empty.
    if (!msJsonObj.content.length) {
      delete msJsonObj.content;
    }
    parentJsonObj.content.push(msJsonObj);
  }

  /**
   * Builds objects for `esb` (sidebar), `fig` (figure) and `ref`, and sets
   * `category` on the parent for `cat`. For esb/fig/ref the first and last
   * children are skipped. Other node types are ignored.
   *
   * @param {object} node - The esb, cat, fig or ref syntax node.
   * @param {object} parentJsonObj - USJ object that is updated.
   */
  nodeToUSJSpecial(node, parentJsonObj) {
    if (node.type === 'esb') {
      const sidebarJsonObj = { type: 'sidebar', marker: 'esb', content: [] };
      node.children.slice(1, -1).forEach((child) => {
        this.nodeToUSJ(child, sidebarJsonObj);
      });
      parentJsonObj.content.push(sidebarJsonObj);
    } else if (node.type === 'cat') {
      const catCap = this.getQuery('category').captures(node)[0];
      const category = this.usfm
        .substring(catCap.node.startIndex, catCap.node.endIndex)
        .trim();
      parentJsonObj.category = category;
    } else if (node.type === 'fig') {
      const figJsonObj = { type: 'figure', marker: 'fig', content: [] };
      node.children.slice(1, -1).forEach((child) => {
        this.nodeToUSJ(child, figJsonObj);
      });
      parentJsonObj.content.push(figJsonObj);
    } else if (node.type === 'ref') {
      const refJsonObj = { type: 'ref', content: [] };
      node.children.slice(1, -1).forEach((child) => {
        this.nodeToUSJ(child, refJsonObj);
      });
      parentJsonObj.content.push(refJsonObj);
    }
  }

  /**
   * Builds a `para` object for paragraph-style markers. The para is pushed to
   * the parent first; children that are character-style, nested
   * character-style or `OTHER_PARA_NESTABLES` markers go inside the para, all
   * other children are converted into the parent (after the para).
   *
   * @param {object} node - The paragraph-style syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the para.
   */
  nodeToUSJGeneric(node, parentJsonObj) {
    const tagNode = node.children[0];
    let style = this.usfm.substring(tagNode.startIndex, tagNode.endIndex);
    if (style.startsWith('\\')) {
      style = style.replace('\\', '').trim();
    } else {
      // First child is not a marker token; fall back to the node type.
      style = node.type;
    }

    let childrenRangeStart = 1;
    if (
      node.children.length > 1 &&
      node.children[1].type.startsWith('numbered')
    ) {
      // The number is a separate child; append its text to the marker.
      const numNode = node.children[1];
      style += this.usfm.substring(numNode.startIndex, numNode.endIndex);
      childrenRangeStart = 2;
    }

    const paraJsonObj = { type: 'para', marker: style, content: [] };
    parentJsonObj.content.push(paraJsonObj);

    const nestableSets = [
      this.markerSets.CHAR_STYLE_MARKERS,
      this.markerSets.NESTED_CHAR_STYLE_MARKERS,
      this.markerSets.OTHER_PARA_NESTABLES,
    ];
    for (let i = childrenRangeStart; i < node.children.length; i++) {
      const child = node.children[i];
      // Only nest these types inside the upper para style node.
      const isNestable = nestableSets.some((markerSet) =>
        markerSet.has(child.type),
      );
      this.nodeToUSJ(child, isNestable ? paraJsonObj : parentJsonObj);
    }
  }

  /**
   * Pushes the node's text onto the parent's `content`, with every `~`
   * replaced by a space. Empty text is not pushed.
   *
   * @param {object} node - Syntax node providing `startIndex` / `endIndex`.
   * @param {object} parentJsonObj - USJ object whose `content` receives the text.
   */
  pushTextNode(node, parentJsonObj) {
    // A global regex is required here: String.prototype.replace with a string
    // pattern would only replace the first '~' in the text.
    const textVal = this.usfm
      .substring(node.startIndex, node.endIndex)
      .replace(/~/g, ' ');
    if (textVal !== '') {
      parentJsonObj.content.push(textVal);
    }
  }

  /**
   * Converts every child of a `verseText` node into the given parent.
   *
   * @param {object} node - The `verseText` syntax node.
   * @param {object} parentJsonObj - USJ object receiving the converted children.
   */
  handleVerseText(node, parentJsonObj) {
    node.children.forEach((child) => this.nodeToUSJ(child, parentJsonObj));
  }

  /**
   * Builds the node-type -> handler map used by `nodeToUSJ`. Instead of an
   * at-worst O(n) walk through a switch statement, the handler is found with
   * a single Map lookup; `nodeToUSJ` keeps a fallback for types that cannot
   * be mapped cleanly (e.g. types ending with 'Attribute').
   *
   * Registration order matters: a later registration for the same type
   * replaces an earlier one.
   *
   * @returns {Map<string, function(object, object): void>} Handlers bound to
   *   this instance, each taking `(node, parentJsonObj)`.
   */
  populateDispatchMap() {
    const thisMap = new Map();
    const bindToClass = (method) => method.bind(this);
    const addHandlers = (markers, handler) => {
      markers.forEach((marker) => thisMap.set(marker, bindToClass(handler)));
    };

    thisMap.set('text', bindToClass(this.pushTextNode));
    thisMap.set('verseText', bindToClass(this.handleVerseText));
    thisMap.set('v', bindToClass(this.nodeToUSJVerse));
    thisMap.set('id', bindToClass(this.nodeToUSJId));
    thisMap.set('chapter', bindToClass(this.nodeToUSJChapter));
    // Known no-op.
    thisMap.set('usfm', () => {});

    // NOTE(review): 'q' and 'w' may be overwritten by the paragraph-style /
    // character-style registrations below, depending on what those marker
    // lists contain; nodeToUSJPara also has a 'pi'/'ph' branch that is not
    // registered here — confirm the intended list.
    addHandlers(['paragraph', 'q', 'w'], this.nodeToUSJPara);
    addHandlers(['cl', 'cp', 'vp'], this.nodeToUSJGeneric);
    addHandlers(['ca', 'va'], this.nodeToUSJCaVa);
    addHandlers(['table', 'tr'], this.nodeToUSJTable);
    addHandlers(['milestone', 'zNameSpace'], this.nodeToUSJMilestone);
    addHandlers(['esb', 'cat', 'fig', 'ref'], this.nodeToUSJSpecial);
    addHandlers(NOTE_MARKERS, this.nodeToUSJNotes);
    addHandlers(
      [CHAR_STYLE_MARKERS, NESTED_CHAR_STYLE_MARKERS, 'xt_standalone'].flat(),
      this.nodeToUSJChar,
    );
    addHandlers(TABLE_CELL_MARKERS, this.nodeToUSJTable);
    addHandlers(
      PARA_STYLE_MARKERS.filter((m) => m !== 'usfm'),
      this.nodeToUSJGeneric,
    );
    return thisMap;
  }

  /**
   * Generic dispatcher: converts `node` with the handler registered for its
   * type (first backslash removed). Without a handler, attribute nodes go to
   * `nodeToUSJAttrib`, blank and `|` nodes are skipped, and for anything else
   * the children are converted into the same parent.
   *
   * @param {object} node - Syntax node to convert.
   * @param {object} parentJsonObj - USJ object that is being filled.
   */
  nodeToUSJ(node, parentJsonObj) {
    const nodeType = node.type?.replace('\\', '');
    const handler = this.dispatchMap.get(nodeType);
    if (handler) {
      handler(node, parentJsonObj);
      return;
    }
    if (!nodeType) {
      return;
    }
    // Some edge cases where we can't cleanly map to a marker:
    if (nodeType.endsWith('Attribute')) {
      this.nodeToUSJAttrib(node, parentJsonObj);
      return;
    }
    if (['', '|'].includes(node.type.trim())) {
      // Known no-op.
      return;
    }
    // Process children while discarding nodes that don't go into USJ.
    if (node.children.length > 0) {
      node.children.forEach((child) => this.nodeToUSJ(child, parentJsonObj));
    }
  }
}
// Expose the generator class as a named CommonJS export.
exports.USJGenerator = USJGenerator;