UNPKG

usfm-grammar

Version:

Uses the tree-sitter-usfm3 parser to convert USFM files to other formats such as USJ, USX, and CSV, and converts them back to USFM

543 lines (474 loc) 20.2 kB
// Logic for syntax-tree to XML (USX) conversions
const { DOMImplementation, XMLSerializer } = require('xmldom');
const xpath = require('xpath');
const Parser = require("tree-sitter");
const {Query} = Parser;

const { PARA_STYLE_MARKERS, NOTE_MARKERS, CHAR_STYLE_MARKERS, NESTED_CHAR_STYLE_MARKERS, DEFAULT_ATTRIB_MAP, TABLE_CELL_MARKERS, MISC_MARKERS } = require("./utils/markers");

// Child node types that node2UsxGeneric nests inside the <para> it creates
// (character-style markers are nested as well; everything else is emitted
// as a sibling of the para).
const GENERIC_PARA_CONTENT_TYPES = [
  "text",
  "footnote",
  "crossref",
  "verseText",
  "v",
  "b",
  "milestone",
  "zNameSpace",
];

// Markers that node2Usx routes to node2UsxGeneric before any other para check.
const CHAPTER_VERSE_PARA_TYPES = ["cl", "cp", "cd", "vp"];

// Node types handled by node2UsxSpecial.
const SPECIAL_TYPES = ["esb", "cat", "fig", "ref"];

/**
 * Returns the source text covered by a syntax node.
 * @param {string} usfm - The full USFM source
 * @param {SyntaxNode} node - Any node of the syntax tree built from `usfm`
 * @returns {string} The untrimmed text between the node's start and end index
 */
function nodeText(usfm, node) {
  return usfm.slice(node.startIndex, node.endIndex);
}

/**
 * Turns the text of a marker tag node (e.g. "\\p ") into a style name ("p").
 * Only the first backslash is removed, then surrounding whitespace is trimmed.
 * @param {string} usfm - The full USFM source
 * @param {SyntaxNode} tagNode - The node holding the marker tag
 * @returns {string} The marker name
 */
function markerName(usfm, tagNode) {
  return nodeText(usfm, tagNode).replace("\\", "").trim();
}

class USXGenerator {
  /**
   * A binding for all methods used in generating USX from a syntax tree.
   * @param {object} treeSitterLanguageObj - The Tree-sitter language object
   * @param {string} usfmString - The USFM source; it is sliced with the
   *   start/end indices of the syntax nodes
   * @param {Element} [usxRootElement] - The root element of the USX (optional).
   *   When omitted, a new <usx version="3.1"> root is created.
   */
  constructor(treeSitterLanguageObj, usfmString, usxRootElement = null) {
    this.usfmLanguage = treeSitterLanguageObj;
    this.usfm = usfmString;

    const domImpl = new DOMImplementation();
    const doc = domImpl.createDocument(null, 'usx', null);

    if (usxRootElement === null) {
      this.xmlRootNode = doc.documentElement;
      this.xmlRootNode.setAttribute('version', '3.1');
    } else {
      this.xmlRootNode = usxRootElement;
    }

    // Tracks the verse that is currently "open" (has a sid but no eid yet)
    // and the XML element its start milestone was appended to.
    this.parseState = {
      prevVerseSid: null,
      prevVerseParent: null,
    };
  }

  /**
   * Builds the <book> (ID) node in USX.
   * @param {SyntaxNode} node - The `id` syntax node
   * @param {Element} parentXmlNode - The parent XML node to append the book to
   */
  node2UsxId(node, parentXmlNode) {
    const idCaptures = new Query(
      this.usfmLanguage,
      "(id (bookcode) @book-code (description)? @desc)",
    ).captures(node);

    let code = null;
    let desc = null;
    idCaptures.forEach((capture) => {
      if (capture.name === 'book-code') {
        code = nodeText(this.usfm, capture.node);
      } else if (capture.name === 'desc') {
        desc = nodeText(this.usfm, capture.node);
      }
    });

    const bookXmlNode = parentXmlNode.ownerDocument.createElement('book');
    bookXmlNode.setAttribute('code', code);
    bookXmlNode.setAttribute('style', 'id');

    // The description is optional; skip it when missing or blank.
    if (desc && desc.trim() !== '') {
      const textNode = parentXmlNode.ownerDocument.createTextNode(desc.trim());
      bookXmlNode.appendChild(textNode);
    }

    parentXmlNode.appendChild(bookXmlNode);
  }

  /**
   * Builds the <chapter> start milestone for a `c` node, then converts any
   * `cl` / `cd` children as siblings of that milestone.
   * @param {SyntaxNode} node - The `c` syntax node
   * @param {Element} parentXmlNode - The XML node the chapter is appended to;
   *   a <book> child is looked up under it to build the chapter sid
   */
  node2UsxC(node, parentXmlNode) {
    const chapCap = new Query(
      this.usfmLanguage,
      `(c (chapterNumber) @chap-num
         (ca (chapterNumber) @alt-num)?
         (cp (text) @pub-num)?)`,
    ).captures(node);

    const chapNum = nodeText(this.usfm, chapCap[0].node);
    const bookNode = xpath.select1("book", parentXmlNode);
    const bookCode = bookNode.getAttribute("code");
    const chapRef = `${bookCode} ${chapNum}`;

    const chapXmlNode = parentXmlNode.ownerDocument.createElement('chapter');
    chapXmlNode.setAttribute("number", chapNum);
    chapXmlNode.setAttribute("style", "c");
    chapXmlNode.setAttribute("sid", chapRef);

    chapCap.forEach((cap) => {
      if (cap.name === "alt-num") {
        chapXmlNode.setAttribute('altnumber', nodeText(this.usfm, cap.node).trim());
      }
      if (cap.name === "pub-num") {
        chapXmlNode.setAttribute('pubnumber', nodeText(this.usfm, cap.node).trim());
      }
    });

    parentXmlNode.appendChild(chapXmlNode);

    node.children.forEach((child) => {
      if (["cl", "cd"].includes(child.type)) {
        this.node2Usx(child, parentXmlNode);
      }
    });
  }

  /**
   * Converts a `chapter` node: every child is converted in order, and the
   * verse that is still open afterwards is closed with a <verse eid="...">.
   * The end milestone goes into the last <para>, or the last <row> of a
   * trailing <table>, or directly under the parent otherwise.
   * @param {SyntaxNode} node - The `chapter` syntax node
   * @param {Element} parentXmlNode - The XML node receiving the chapter content
   */
  node2UsxChapter(node, parentXmlNode) {
    node.children.forEach((child) => {
      if (child.type === "c") {
        this.node2UsxC(child, parentXmlNode);
      } else {
        this.node2Usx(child, parentXmlNode);
      }
    });

    const openVerseSid = this.parseState.prevVerseSid;
    if (openVerseSid === null) {
      return;
    }

    const vEndXmlNode = parentXmlNode.ownerDocument.createElement('verse');
    vEndXmlNode.setAttribute('eid', openVerseSid);

    const lastSibling = parentXmlNode.lastChild;
    let target = parentXmlNode;
    if (lastSibling && lastSibling.tagName === "para") {
      target = lastSibling;
    } else if (lastSibling && lastSibling.tagName === "table") {
      const rows = lastSibling.getElementsByTagName('row');
      // A table without rows has nowhere to host the milestone; fall back to
      // the chapter parent instead of dereferencing a missing row.
      if (rows.length > 0) {
        target = rows.item(rows.length - 1);
      }
    }
    target.appendChild(vEndXmlNode);

    // The verse is closed now; reset so that the first verse of the next
    // chapter does not emit a second eid for it.
    this.parseState.prevVerseSid = null;
    this.parseState.prevVerseParent = null;
  }

  /**
   * Builds the <verse> start milestone for a `v` node. If a verse is still
   * open, its end milestone is first appended to the element that received
   * that verse's start milestone.
   * @param {SyntaxNode} node - The `v` syntax node
   * @param {Element} parentXmlNode - The XML node the verse is appended to
   */
  node2UsxVerse(node, parentXmlNode) {
    // Close the previous verse, if any.
    if (this.parseState.prevVerseSid !== null) {
      const prevPara = this.parseState.prevVerseParent;
      const vEndXmlNode = prevPara.ownerDocument.createElement('verse');
      vEndXmlNode.setAttribute("eid", this.parseState.prevVerseSid);
      prevPara.appendChild(vEndXmlNode);
    }

    const verseNumCap = new Query(
      this.usfmLanguage,
      `(v
         (verseNumber) @vnum
         (va (verseNumber) @alt)?
         (vp (text) @vp)?
       )`,
    ).captures(node);

    const verseNum = nodeText(this.usfm, verseNumCap[0].node);
    const vXmlNode = parentXmlNode.ownerDocument.createElement('verse');
    parentXmlNode.appendChild(vXmlNode);

    verseNumCap.forEach((capture) => {
      if (capture.name === 'alt') {
        vXmlNode.setAttribute('altnumber', nodeText(this.usfm, capture.node));
      } else if (capture.name === 'vp') {
        vXmlNode.setAttribute('pubnumber', nodeText(this.usfm, capture.node).trim());
      }
    });

    // The verse reference is built from the sid of the most recent chapter.
    const chapterSid = xpath
      .select("//chapter", this.xmlRootNode)
      .pop()
      .getAttribute('sid');
    const ref = `${chapterSid}:${verseNum}`.trim();

    vXmlNode.setAttribute('number', verseNum.trim());
    vXmlNode.setAttribute('style', 'v');
    vXmlNode.setAttribute('sid', ref);

    // Remember this verse so the next verse (or the chapter end) can close it.
    // Without this the "close previous verse" branch above never runs.
    this.parseState.prevVerseSid = ref;
    this.parseState.prevVerseParent = parentXmlNode;
  }

  /**
   * Builds a <char> element for a `ca` / `va` that occurs on its own, away
   * from its `c` / `v`.
   * @param {SyntaxNode} node - The `ca` or `va` syntax node
   * @param {Element} parentXmlNode - The XML node the char is appended to
   */
  node2UsxCaVa(node, parentXmlNode) {
    const style = node.type;
    const charXmlNode = parentXmlNode.ownerDocument.createElement('char');
    charXmlNode.setAttribute('style', style);

    const altNumMatch = new Query(
      this.usfmLanguage,
      `([
         (chapterNumber)
         (verseNumber)
       ] @alt-num)`,
    ).captures(node);

    const altNum = nodeText(this.usfm, altNumMatch[0].node).trim();
    charXmlNode.setAttribute('altnumber', altNum);
    charXmlNode.setAttribute('closed', 'true');

    parentXmlNode.appendChild(charXmlNode);
  }

  /**
   * Builds <para> nodes in USX. A node whose first child is a "...Block" is
   * flattened by converting each of the block's children; `paragraph` nodes
   * take their style from the type of the marker child; `pi` / `ph` nodes
   * take it from the text of their tag.
   * @param {SyntaxNode} node - A `paragraph`, `pi`, `ph` or block-wrapping node
   * @param {Element} parentXmlNode - The XML node the para is appended to
   */
  node2UsxPara(node, parentXmlNode) {
    if (node.children[0].type.endsWith('Block')) {
      for (const child of node.children[0].children) {
        this.node2UsxPara(child, parentXmlNode);
      }
    } else if (node.type === 'paragraph') {
      const paraTagCap = new Query(
        this.usfmLanguage,
        "(paragraph (_) @para-marker)",
      ).captures(node)[0];
      const paraMarker = paraTagCap.node.type;

      if (!paraMarker.endsWith("Block")) {
        const paraXmlNode = parentXmlNode.ownerDocument.createElement("para");
        paraXmlNode.setAttribute("style", paraMarker);
        parentXmlNode.appendChild(paraXmlNode);

        // The first child is the marker tag itself.
        for (const child of paraTagCap.node.children.slice(1)) {
          this.node2Usx(child, paraXmlNode);
        }
      }
    } else if (['pi', 'ph'].includes(node.type)) {
      const paraMarker = markerName(this.usfm, node.children[0]);
      const paraXmlNode = parentXmlNode.ownerDocument.createElement("para");
      paraXmlNode.setAttribute("style", paraMarker);
      parentXmlNode.appendChild(paraXmlNode);

      for (const child of node.children.slice(1)) {
        this.node2Usx(child, paraXmlNode);
      }
    }
  }

  /**
   * Builds <note> nodes for footnotes and cross-references. The first child
   * supplies the style, the second the caller; the last child is skipped.
   * @param {SyntaxNode} node - The note syntax node
   * @param {Element} parentXmlNode - The XML node the note is appended to
   */
  node2UsxNotes(node, parentXmlNode) {
    const tagNode = node.children[0];
    const callerNode = node.children[1];

    const noteXmlNode = parentXmlNode.ownerDocument.createElement('note');
    noteXmlNode.setAttribute('style', markerName(this.usfm, tagNode));
    noteXmlNode.setAttribute('caller', nodeText(this.usfm, callerNode).trim());
    parentXmlNode.appendChild(noteXmlNode);

    // Children 0 and 1 are the tag and caller; the last child is not converted
    // (presumably the closing tag).
    for (let i = 2; i < node.children.length - 1; i++) {
      this.node2Usx(node.children[i], noteXmlNode);
    }
  }

  /**
   * Builds <char> nodes for character markup, both regular and nested.
   * @param {SyntaxNode} node - The character-marker syntax node
   * @param {Element} parentXmlNode - The XML node the char is appended to
   */
  node2UsxChar(node, parentXmlNode) {
    const tagNode = node.children[0];
    const lastChild = node.children[node.children.length - 1];

    // A last child whose type starts with '\' is treated as the closing tag
    // and is not converted.
    const childrenRange = lastChild.type.startsWith("\\")
      ? node.children.length - 1
      : node.children.length;

    const charXmlNode = parentXmlNode.ownerDocument.createElement('char');
    // Nested markers carry a '+' prefix that is dropped from the style.
    const style = nodeText(this.usfm, tagNode)
      .replace("\\", "")
      .replace("+", "")
      .trim();
    charXmlNode.setAttribute('style', style);
    parentXmlNode.appendChild(charXmlNode);

    for (let i = 1; i < childrenRange; i++) {
      this.node2Usx(node.children[i], charXmlNode);
    }
  }

  /**
   * Sets an attribute on the parent XML element from an attribute node.
   * A bare "|" name is resolved through DEFAULT_ATTRIB_MAP using the parent
   * syntax node's type (with "Nested" removed); "src" is renamed to "file".
   * @param {SyntaxNode} node - The attribute syntax node
   * @param {Element} parentXmlNode - The XML element receiving the attribute
   */
  node2UsxAttrib(node, parentXmlNode) {
    const attribNameNode = node.children[0];
    let attribName = nodeText(this.usfm, attribNameNode).trim();

    if (attribName === "|") {
      let parentType = node.parent.type;
      if (parentType.includes("Nested")) {
        parentType = parentType.replace("Nested", "");
      }
      attribName = DEFAULT_ATTRIB_MAP[parentType];
    }
    if (attribName === "src") {
      // for \fig
      attribName = "file";
    }

    const attribValCap = new Query(
      this.usfmLanguage,
      "((attributeValue) @attrib-val)",
    ).captures(node);

    let attribValue = "";
    if (attribValCap.length > 0) {
      attribValue = nodeText(this.usfm, attribValCap[0].node).trim();
    }

    parentXmlNode.setAttribute(attribName, attribValue);
  }

  /**
   * Converts table components: `table` -> <table>, `tr` -> <row>, and any
   * TABLE_CELL_MARKERS type -> <cell> with an `align` attribute.
   * @param {SyntaxNode} node - The table, row or cell syntax node
   * @param {Element} parentXmlNode - The XML node the element is appended to
   */
  node2UsxTable(node, parentXmlNode) {
    if (node.type === "table") {
      const tableXmlNode = parentXmlNode.ownerDocument.createElement('table');
      parentXmlNode.appendChild(tableXmlNode);
      node.children.forEach((child) => {
        this.node2Usx(child, tableXmlNode);
      });
    } else if (node.type === "tr") {
      const rowXmlNode = parentXmlNode.ownerDocument.createElement('row');
      rowXmlNode.setAttribute("style", "tr");
      parentXmlNode.appendChild(rowXmlNode);
      node.children.slice(1).forEach((child) => {
        this.node2Usx(child, rowXmlNode);
      });
    } else if (TABLE_CELL_MARKERS.includes(node.type)) {
      const style = markerName(this.usfm, node.children[0]);

      // Centred cells are \tcc# and \thc#; the header variant used to fall
      // through to "start".
      let align = "start";
      if (style.includes("tcc") || style.includes("thc")) {
        align = "center";
      } else if (style.includes("r")) {
        align = "end";
      }

      const cellXmlNode = parentXmlNode.ownerDocument.createElement("cell");
      cellXmlNode.setAttribute("style", style);
      cellXmlNode.setAttribute("align", align);
      parentXmlNode.appendChild(cellXmlNode);
      node.children.slice(1).forEach((child) => {
        this.node2Usx(child, cellXmlNode);
      });
    }
  }

  /**
   * Creates an <ms> node for a milestone or z-namespace node and applies its
   * attribute children to it.
   * @param {SyntaxNode} node - The `milestone` or `zNameSpace` syntax node
   * @param {Element} parentXmlNode - The XML node the ms is appended to
   */
  node2UsxMilestone(node, parentXmlNode) {
    const msNameCap = new Query(
      this.usfmLanguage,
      `(
         [(milestoneTag)
          (milestoneStartTag)
          (milestoneEndTag)
          (zSpaceTag)
         ] @ms-name)`,
    ).captures(node)[0];

    const msXmlNode = parentXmlNode.ownerDocument.createElement("ms");
    msXmlNode.setAttribute("style", markerName(this.usfm, msNameCap.node));
    parentXmlNode.appendChild(msXmlNode);

    node.children.forEach((child) => {
      if (child.type.endsWith("Attribute")) {
        this.node2Usx(child, msXmlNode);
      }
    });
  }

  /**
   * Builds nodes for `esb` (<sidebar>), `cat` (a `category` attribute on the
   * parent), `fig` (<figure>) and `ref` (<ref>).
   * @param {SyntaxNode} node - The special syntax node
   * @param {Element} parentXmlNode - The XML node to append to or annotate
   */
  node2UsxSpecial(node, parentXmlNode) {
    const doc = parentXmlNode.ownerDocument;

    // Converts the children between the first and last child into `target`.
    const convertInner = (target) => {
      node.children.slice(1, -1).forEach((child) => {
        this.node2Usx(child, target);
      });
    };

    if (node.type === "esb") {
      const sidebarXmlNode = doc.createElement('sidebar');
      sidebarXmlNode.setAttribute('style', "esb");
      parentXmlNode.appendChild(sidebarXmlNode);
      convertInner(sidebarXmlNode);
    } else if (node.type === "cat") {
      const catCap = new Query(
        this.usfmLanguage,
        "((category) @category)",
      ).captures(node)[0];
      const category = nodeText(this.usfm, catCap.node).trim();
      parentXmlNode.setAttribute("category", category);
    } else if (node.type === "fig") {
      const figXmlNode = doc.createElement('figure');
      figXmlNode.setAttribute("style", "fig");
      parentXmlNode.appendChild(figXmlNode);
      convertInner(figXmlNode);
    } else if (node.type === "ref") {
      const refXmlNode = doc.createElement('ref');
      parentXmlNode.appendChild(refXmlNode);
      convertInner(refXmlNode);
    }
  }

  /**
   * Builds a <para> for a paragraph-style marker without a dedicated handler.
   * Inline children (text, notes, verses, char markers, milestones) are nested
   * in the para; any other child is converted as a sibling of the para.
   * A `usfm` marker produces no output.
   * @param {SyntaxNode} node - The marker syntax node
   * @param {Element} parentXmlNode - The XML node the para is appended to
   */
  node2UsxGeneric(node, parentXmlNode) {
    const tagNode = node.children[0];
    let style = nodeText(this.usfm, tagNode).trim();

    if (style.startsWith('\\')) {
      style = style.replace('\\', '');
    }
    if (style === "usfm") {
      return;
    }

    const paraXmlNode = parentXmlNode.ownerDocument.createElement('para');
    paraXmlNode.setAttribute('style', style);
    parentXmlNode.appendChild(paraXmlNode);

    // Child 0 is the marker tag itself.
    for (const child of node.children.slice(1)) {
      const isInline =
        CHAR_STYLE_MARKERS.includes(child.type) ||
        NESTED_CHAR_STYLE_MARKERS.includes(child.type) ||
        GENERIC_PARA_CONTENT_TYPES.includes(child.type);

      this.node2Usx(child, isInline ? paraXmlNode : parentXmlNode);
    }
  }

  /**
   * Dispatches a syntax node to the handler for its type. The order of the
   * checks matters: earlier branches take precedence when a type could match
   * more than one of them.
   * @param {SyntaxNode} node - The syntax node to convert
   * @param {Element} parentXmlNode - The XML node receiving the output
   */
  node2Usx(node, parentXmlNode) {
    const type = node.type;

    if (type === "id") {
      this.node2UsxId(node, parentXmlNode);
    } else if (type === "chapter") {
      this.node2UsxChapter(node, parentXmlNode);
    } else if (CHAPTER_VERSE_PARA_TYPES.includes(type)) {
      this.node2UsxGeneric(node, parentXmlNode);
    } else if (type === "ca" || type === "va") {
      this.node2UsxCaVa(node, parentXmlNode);
    } else if (type === "v") {
      this.node2UsxVerse(node, parentXmlNode);
    } else if (type === "verseText") {
      node.children.forEach((child) => {
        this.node2Usx(child, parentXmlNode);
      });
    } else if (type === "paragraph" || type === "pi" || type === "ph") {
      this.node2UsxPara(node, parentXmlNode);
    } else if (NOTE_MARKERS.includes(type)) {
      this.node2UsxNotes(node, parentXmlNode);
    } else if (
      CHAR_STYLE_MARKERS.includes(type) ||
      NESTED_CHAR_STYLE_MARKERS.includes(type) ||
      type === "xt_standalone"
    ) {
      this.node2UsxChar(node, parentXmlNode);
    } else if (type.endsWith("Attribute")) {
      this.node2UsxAttrib(node, parentXmlNode);
    } else if (type === "text") {
      // Every '~' becomes a space; a string pattern passed to replace() would
      // only touch the first occurrence, hence the global regex.
      const textVal = nodeText(this.usfm, node).replace(/~/g, " ");
      if (textVal !== "") {
        const textNode = parentXmlNode.ownerDocument.createTextNode(textVal);
        parentXmlNode.appendChild(textNode);
      }
    } else if (
      type === "table" ||
      type === "tr" ||
      TABLE_CELL_MARKERS.includes(type)
    ) {
      this.node2UsxTable(node, parentXmlNode);
    } else if (type === "milestone" || type === "zNameSpace") {
      this.node2UsxMilestone(node, parentXmlNode);
    } else if (SPECIAL_TYPES.includes(type)) {
      this.node2UsxSpecial(node, parentXmlNode);
    } else if (
      PARA_STYLE_MARKERS.includes(type) ||
      PARA_STYLE_MARKERS.includes(type.replace("\\", "").trim())
    ) {
      this.node2UsxGeneric(node, parentXmlNode);
    } else if (["", "|"].includes(type.trim())) {
      // Skip whitespace nodes
    } else if (node.children.length > 0) {
      node.children.forEach((child) => {
        this.node2Usx(child, parentXmlNode);
      });
    }
    // Unknown leaf nodes are ignored on purpose rather than raising an error.
  }
}

exports.USXGenerator = USXGenerator;