usfm-grammar
Version:
Uses the tree-sitter-usfm3 parser to convert USFM files to other formats such as USJ, USX, and CSV, and converts them back to USFM
561 lines (509 loc) • 16.7 kB
JavaScript
// Logic for converting the tree-sitter syntax tree into a USJ (dict/JSON) structure
const Parser = require("tree-sitter");
const {Query} = Parser;
const { PARA_STYLE_MARKERS, NOTE_MARKERS, CHAR_STYLE_MARKERS, NESTED_CHAR_STYLE_MARKERS, DEFAULT_ATTRIB_MAP, TABLE_CELL_MARKERS, MISC_MARKERS } = require("./utils/markers");
/**
 * Walks a tree-sitter-usfm3 syntax tree and builds the equivalent USJ
 * (JSON) structure.
 *
 * Every `nodeToUSJ*` method takes a syntax-tree node plus the USJ object that
 * is currently being filled, and appends to (or sets properties on) that
 * object. The text of a node is always recovered by slicing the original USFM
 * string with the node's `startIndex` / `endIndex`.
 */
class USJGenerator {
  /**
   * @param {object} treeSitterLanguageObj - Language object handed to every
   *   tree-sitter `Query` created by this class.
   * @param {string} usfmString - The USFM source text the tree was parsed from.
   * @param {object|null} [usjRootObj=null] - Existing USJ root to append to.
   *   When falsy, a new `{ type: "USJ", version: "3.1", content: [] }` root
   *   is created.
   */
  constructor(treeSitterLanguageObj, usfmString, usjRootObj = null) {
    this.usfmLanguage = treeSitterLanguageObj;
    this.usfm = usfmString;
    this.jsonRootObj = usjRootObj || {
      type: "USJ",
      version: "3.1",
      content: [],
    };
  }

  /**
   * Return the slice of the USFM source covered by a syntax-tree node.
   *
   * @param {{startIndex: number, endIndex: number}} node
   * @returns {string} Untrimmed source text of the node.
   */
  nodeText(node) {
    return this.usfm.substring(node.startIndex, node.endIndex);
  }

  /**
   * Depth-first search for the LAST object (in document order) whose `type`
   * or `marker` equals `typeValue`.
   *
   * @param {object} jsonObj - USJ object to search (searched recursively
   *   through its `content` array; string entries are skipped).
   * @param {string} typeValue - Value to compare with `type` / `marker`.
   * @returns {object|null} The last matching object, or `null` if none.
   */
  findLastFromJson(jsonObj, typeValue) {
    let output = null;
    if (
      typeValue === jsonObj.type ||
      (jsonObj.marker && typeValue === jsonObj.marker)
    ) {
      output = jsonObj;
    }
    if (jsonObj.content) {
      for (const child of jsonObj.content) {
        if (typeof child === "string") {
          continue;
        }
        const childOutput = this.findLastFromJson(child, typeValue);
        if (childOutput !== null) {
          // Later matches overwrite earlier ones so the last one wins.
          output = childOutput;
        }
      }
    }
    return output;
  }

  /**
   * Build the `book` object from an `id` node and append it to the parent.
   * The optional description becomes the single content string when it is
   * not blank.
   *
   * @param {object} node - `id` syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the book.
   */
  nodeToUSJId(node, parentJsonObj) {
    const idCaptures = new Query(
      this.usfmLanguage,
      "(id (bookcode) @book-code (description)? @desc)",
    ).captures(node);
    let code = null;
    let desc = null;
    idCaptures.forEach((capture) => {
      if (capture.name === "book-code") {
        code = this.nodeText(capture.node);
      } else if (capture.name === "desc") {
        desc = this.nodeText(capture.node);
      }
    });
    const bookJsonObj = {
      type: "book",
      marker: "id",
      code: code,
      content: [],
    };
    if (desc && desc.trim() !== "") {
      bookJsonObj.content.push(desc.trim());
    }
    parentJsonObj.content.push(bookJsonObj);
  }

  /**
   * Build the chapter milestone object from a `c` node and append it to the
   * parent. `sid` is "<book code> <chapter number>" using the book found in
   * the root content (or `null` when the root holds no book). Alternate and
   * published numbers are added when captured. `cl` / `cd` children are then
   * converted as siblings of the chapter object.
   *
   * @param {object} node - `c` syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the chapter.
   */
  nodeToUSJC(node, parentJsonObj) {
    // The first capture (by position) is always the chapter number itself.
    const chapCap = new Query(
      this.usfmLanguage,
      `(c (chapterNumber) @chap-num
          (ca (chapterNumber) @alt-num)?
          (cp (text) @pub-num)?)`,
    ).captures(node);
    const chapNum = this.nodeText(chapCap[0].node);
    let chapRef = null;
    // If several book objects exist, the last one determines the reference.
    for (const child of this.jsonRootObj.content) {
      if (child.type === "book") {
        chapRef = `${child.code} ${chapNum}`;
      }
    }
    const chapJsonObj = {
      type: "chapter",
      marker: "c",
      number: chapNum,
      sid: chapRef,
    };
    chapCap.forEach((cap) => {
      if (cap.name === "alt-num") {
        chapJsonObj.altnumber = this.nodeText(cap.node).trim();
      }
      if (cap.name === "pub-num") {
        chapJsonObj.pubnumber = this.nodeText(cap.node).trim();
      }
    });
    parentJsonObj.content.push(chapJsonObj);
    node.children.forEach((child) => {
      if (["cl", "cd"].includes(child.type)) {
        this.nodeToUSJ(child, parentJsonObj);
      }
    });
  }

  /**
   * Convert every child of a `chapter` node; `c` children go through
   * `nodeToUSJC`, everything else through the generic dispatcher.
   *
   * @param {object} node - `chapter` syntax node.
   * @param {object} parentJsonObj - USJ object being filled.
   */
  nodeToUSJChapter(node, parentJsonObj) {
    node.children.forEach((child) => {
      if (child.type === "c") {
        this.nodeToUSJC(child, parentJsonObj);
      } else {
        this.nodeToUSJ(child, parentJsonObj);
      }
    });
  }

  /**
   * Build the verse object from a `v` node and append it to the parent.
   * `sid` is "<sid of the last chapter in the root>:<verse number>".
   *
   * @param {object} node - `v` syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the verse.
   */
  nodeToUSJVerse(node, parentJsonObj) {
    // The first capture (by position) is always the verse number itself.
    const verseNumCap = new Query(
      this.usfmLanguage,
      `(v
          (verseNumber) @vnum
          (va (verseNumber) @alt)?
          (vp (text) @vp)?
      )`,
    ).captures(node);
    const verseNum = this.nodeText(verseNumCap[0].node);
    const vJsonObj = {
      type: "verse",
      marker: "v",
      number: verseNum.trim(),
    };
    verseNumCap.forEach((capture) => {
      if (capture.name === "alt") {
        vJsonObj.altnumber = this.nodeText(capture.node);
      } else if (capture.name === "vp") {
        vJsonObj.pubnumber = this.nodeText(capture.node);
      }
    });
    const lastChapter = this.findLastFromJson(this.jsonRootObj, "chapter");
    const ref = `${lastChapter.sid}:${verseNum}`;
    vJsonObj.sid = ref.trim();
    parentJsonObj.content.push(vJsonObj);
  }

  /**
   * Build a `char` object for a `ca` / `va` that appears on its own (not
   * inside `c` / `v`). The marker is the node type and the captured number
   * is stored as `altnumber`.
   *
   * @param {object} node - `ca` or `va` syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the char.
   */
  nodeToUSJCaVa(node, parentJsonObj) {
    const charJsonObj = {
      type: "char",
      marker: node.type,
    };
    const altNumMatch = new Query(
      this.usfmLanguage,
      `([
          (chapterNumber)
          (verseNumber)
        ] @alt-num)`,
    ).captures(node);
    charJsonObj.altnumber = this.nodeText(altNumMatch[0].node).trim();
    parentJsonObj.content.push(charJsonObj);
  }

  /**
   * Build `para` objects.
   *
   * - If the first child's type ends with "Block", each child of that block
   *   is converted recursively.
   * - A `paragraph` node becomes a para named after its first captured child
   *   type; `b` gets no `content` array.
   * - `pi` / `ph` nodes take the marker from the text of their first child.
   *
   * @param {object} node - `paragraph`, `pi`, `ph` (or block wrapper) node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the para(s).
   */
  nodeToUSJPara(node, parentJsonObj) {
    if (node.children[0].type.endsWith("Block")) {
      node.children[0].children.forEach((child) => {
        this.nodeToUSJPara(child, parentJsonObj);
      });
    } else if (node.type === "paragraph") {
      const paraTagCap = new Query(
        this.usfmLanguage,
        "(paragraph (_) @para-marker)",
      ).captures(node)[0];
      const paraMarker = paraTagCap.node.type;
      if (paraMarker === "b") {
        parentJsonObj.content.push({ type: "para", marker: paraMarker });
      } else if (!paraMarker.endsWith("Block")) {
        const paraJsonObj = { type: "para", marker: paraMarker, content: [] };
        paraTagCap.node.children.forEach((child) => {
          this.nodeToUSJ(child, paraJsonObj);
        });
        parentJsonObj.content.push(paraJsonObj);
      }
    } else if (["pi", "ph"].includes(node.type)) {
      const paraMarker = this.nodeText(node.children[0])
        .replace("\\", "")
        .trim();
      const paraJsonObj = { type: "para", marker: paraMarker, content: [] };
      node.children.slice(1).forEach((child) => {
        this.nodeToUSJ(child, paraJsonObj);
      });
      parentJsonObj.content.push(paraJsonObj);
    }
  }

  /**
   * Build a `note` object (footnote / cross-reference). Child 0 is the
   * opening tag, child 1 the caller, and the last child is skipped (treated
   * as the closing tag); everything in between becomes note content.
   *
   * @param {object} node - Note syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the note.
   */
  nodeToUSJNotes(node, parentJsonObj) {
    const tagNode = node.children[0];
    const callerNode = node.children[1];
    const style = this.nodeText(tagNode).replace("\\", "").trim();
    const noteJsonObj = {
      type: "note",
      marker: style,
      content: [],
    };
    noteJsonObj.caller = this.nodeText(callerNode).trim();
    node.children.slice(2, -1).forEach((child) => {
      this.nodeToUSJ(child, noteJsonObj);
    });
    parentJsonObj.content.push(noteJsonObj);
  }

  /**
   * Build a `char` object for regular and nested character markers. The
   * marker name is the opening tag text without the first backslash and the
   * first "+".
   *
   * @param {object} node - Character-style syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the char.
   */
  nodeToUSJChar(node, parentJsonObj) {
    const tagNode = node.children[0];
    let childrenRange = node.children.length;
    // A last child whose type starts with a backslash is the closing tag.
    if (node.children[node.children.length - 1].type.startsWith("\\")) {
      childrenRange -= 1;
    }
    const style = this.nodeText(tagNode)
      .replace("\\", "")
      .replace("+", "")
      .trim();
    const charJsonObj = {
      type: "char",
      marker: style,
      content: [],
    };
    node.children.slice(1, childrenRange).forEach((child) => {
      this.nodeToUSJ(child, charJsonObj);
    });
    parentJsonObj.content.push(charJsonObj);
  }

  /**
   * Build `table`, `table:row` and `table:cell` objects.
   *
   * Cell alignment is derived from the marker: centred cells (`tcc`, `thc`)
   * give "center", markers containing "r" give "end", all others "start".
   *
   * @param {object} node - `table`, `tr` or table-cell syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the result.
   */
  nodeToUSJTable(node, parentJsonObj) {
    if (node.type === "table") {
      const tableJsonObj = { type: "table", content: [] };
      node.children.forEach((child) => {
        this.nodeToUSJ(child, tableJsonObj);
      });
      parentJsonObj.content.push(tableJsonObj);
    } else if (node.type === "tr") {
      const rowJsonObj = { type: "table:row", marker: "tr", content: [] };
      node.children.slice(1).forEach((child) => {
        this.nodeToUSJ(child, rowJsonObj);
      });
      parentJsonObj.content.push(rowJsonObj);
    } else if (TABLE_CELL_MARKERS.includes(node.type)) {
      const style = this.nodeText(node.children[0]).replace("\\", "").trim();
      let align = "start";
      if (style.includes("tcc") || style.includes("thc")) {
        align = "center";
      } else if (style.includes("r")) {
        align = "end";
      }
      const cellJsonObj = {
        type: "table:cell",
        marker: style,
        content: [],
        align: align,
      };
      node.children.slice(1).forEach((child) => {
        this.nodeToUSJ(child, cellJsonObj);
      });
      parentJsonObj.content.push(cellJsonObj);
    }
  }

  /**
   * Set an attribute (name → value) directly on the parent USJ object.
   *
   * A bare "|" name means the default attribute, looked up in
   * `DEFAULT_ATTRIB_MAP` by the parent node type (with "Nested" removed).
   * `src` is renamed to `file`. The value is "" when no attribute value is
   * captured.
   *
   * @param {object} node - Attribute syntax node.
   * @param {object} parentJsonObj - USJ object that receives the property.
   */
  nodeToUSJAttrib(node, parentJsonObj) {
    let attribName = this.nodeText(node.children[0]).trim();
    if (attribName === "|") {
      let parentType = node.parent.type;
      if (parentType.includes("Nested")) {
        parentType = parentType.replace("Nested", "");
      }
      attribName = DEFAULT_ATTRIB_MAP[parentType];
    }
    if (attribName === "src") {
      // for \fig
      attribName = "file";
    }
    const attribValCap = new Query(
      this.usfmLanguage,
      "((attributeValue) @attrib-val)",
    ).captures(node);
    let attribValue = "";
    if (attribValCap.length > 0) {
      attribValue = this.nodeText(attribValCap[0].node).trim();
    }
    parentJsonObj[attribName] = attribValue;
  }

  /**
   * Build an `ms` (milestone) object. Only children whose type ends with
   * "Attribute" are converted; the `content` array is dropped when it stays
   * empty.
   *
   * @param {object} node - `milestone` or `zNameSpace` syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the ms.
   */
  nodeToUSJMilestone(node, parentJsonObj) {
    const msNameCap = new Query(
      this.usfmLanguage,
      `([
          (milestoneTag)
          (milestoneStartTag)
          (milestoneEndTag)
          (zSpaceTag)
        ] @ms-name)`,
    ).captures(node)[0];
    const style = this.nodeText(msNameCap.node).replace("\\", "").trim();
    const msJsonObj = { type: "ms", marker: style, content: [] };
    node.children.forEach((child) => {
      if (child.type.endsWith("Attribute")) {
        this.nodeToUSJ(child, msJsonObj);
      }
    });
    // Though normally milestones don't have contents, custom z-namespaces could have them
    if (!msJsonObj.content.length) {
      delete msJsonObj.content;
    }
    parentJsonObj.content.push(msJsonObj);
  }

  /**
   * Build objects for `esb`, `fig` and `ref`, and set `category` on the
   * parent for `cat`. For the container types the first and last children
   * are skipped and the rest become content.
   *
   * @param {object} node - `esb`, `cat`, `fig` or `ref` syntax node.
   * @param {object} parentJsonObj - USJ object being filled.
   */
  nodeToUSJSpecial(node, parentJsonObj) {
    const fillFromInnerChildren = (jsonObj) => {
      node.children.slice(1, -1).forEach((child) => {
        this.nodeToUSJ(child, jsonObj);
      });
      parentJsonObj.content.push(jsonObj);
    };
    if (node.type === "esb") {
      fillFromInnerChildren({ type: "sidebar", marker: "esb", content: [] });
    } else if (node.type === "cat") {
      const catCap = new Query(
        this.usfmLanguage,
        "((category) @category)",
      ).captures(node)[0];
      parentJsonObj.category = this.nodeText(catCap.node).trim();
    } else if (node.type === "fig") {
      fillFromInnerChildren({ type: "figure", marker: "fig", content: [] });
    } else if (node.type === "ref") {
      fillFromInnerChildren({ type: "ref", content: [] });
    }
  }

  /**
   * Build a `para` object for generic paragraph-style markers. The para is
   * pushed to the parent first; inline children (character styles, text,
   * notes, verses, milestones, ...) are nested inside it, while any other
   * child is converted as a sibling on the parent.
   *
   * @param {object} node - Paragraph-style syntax node.
   * @param {object} parentJsonObj - USJ object whose `content` receives the para.
   */
  nodeToUSJGeneric(node, parentJsonObj) {
    let style = this.nodeText(node.children[0]);
    if (style.startsWith("\\")) {
      style = style.replace("\\", "").trim();
    }
    const inlineTypes = [
      "text",
      "footnote",
      "crossref",
      "verseText",
      "v",
      "b",
      "milestone",
      "zNameSpace",
    ];
    const paraJsonObj = { type: "para", marker: style, content: [] };
    parentJsonObj.content.push(paraJsonObj);
    node.children.slice(1).forEach((child) => {
      const isInline =
        CHAR_STYLE_MARKERS.includes(child.type) ||
        NESTED_CHAR_STYLE_MARKERS.includes(child.type) ||
        inlineTypes.includes(child.type);
      // Only inline types are nested inside the para; the rest stay siblings.
      this.nodeToUSJ(child, isInline ? paraJsonObj : parentJsonObj);
    });
  }

  /**
   * Dispatch a syntax node to the matching conversion method based on its
   * type. Unknown node types with children are traversed; unknown leaf
   * nodes are ignored.
   *
   * @param {object} node - Any syntax node.
   * @param {object} parentJsonObj - USJ object being filled.
   */
  nodeToUSJ(node, parentJsonObj) {
    switch (node.type) {
      case "id":
        this.nodeToUSJId(node, parentJsonObj);
        break;
      case "chapter":
        this.nodeToUSJChapter(node, parentJsonObj);
        break;
      case "cl":
      case "cp":
      case "cd":
      case "vp":
        this.nodeToUSJGeneric(node, parentJsonObj);
        break;
      case "ca":
      case "va":
        this.nodeToUSJCaVa(node, parentJsonObj);
        break;
      case "v":
        this.nodeToUSJVerse(node, parentJsonObj);
        break;
      case "verseText":
        node.children.forEach((child) => this.nodeToUSJ(child, parentJsonObj));
        break;
      case "paragraph":
      case "pi":
      case "ph":
        this.nodeToUSJPara(node, parentJsonObj);
        break;
      case "text": {
        // Replace EVERY "~" with a space (a string pattern would only
        // replace the first one).
        const textVal = this.nodeText(node).replace(/~/g, " ");
        if (textVal !== "") {
          parentJsonObj.content.push(textVal);
        }
        break;
      }
      case "table":
      case "tr":
        this.nodeToUSJTable(node, parentJsonObj);
        break;
      case "milestone":
      case "zNameSpace":
        this.nodeToUSJMilestone(node, parentJsonObj);
        break;
      case "esb":
      case "cat":
      case "fig":
      case "ref":
        this.nodeToUSJSpecial(node, parentJsonObj);
        break;
      case "usfm":
        break;
      default:
        if (NOTE_MARKERS.includes(node.type)) {
          this.nodeToUSJNotes(node, parentJsonObj);
        } else if (
          CHAR_STYLE_MARKERS.includes(node.type) ||
          NESTED_CHAR_STYLE_MARKERS.includes(node.type) ||
          ["xt_standalone"].includes(node.type)
        ) {
          this.nodeToUSJChar(node, parentJsonObj);
        } else if (TABLE_CELL_MARKERS.includes(node.type)) {
          this.nodeToUSJTable(node, parentJsonObj);
        } else if (node.type.endsWith("Attribute")) {
          this.nodeToUSJAttrib(node, parentJsonObj);
        } else if (
          PARA_STYLE_MARKERS.includes(node.type) ||
          PARA_STYLE_MARKERS.includes(node.type.replace("\\", "").trim())
        ) {
          this.nodeToUSJGeneric(node, parentJsonObj);
        } else if (["", "|"].includes(node.type.trim())) {
          // Skip white space nodes
        } else if (node.children.length > 0) {
          node.children.forEach((child) =>
            this.nodeToUSJ(child, parentJsonObj),
          );
        }
        break;
    }
  }
}
exports.USJGenerator = USJGenerator;