UNPKG

usfm-grammar

Version:

Uses the tree-sitter-usfm3 parser to convert USFM files to other formats such as USJ, USX, and CSV, and converts them back to USFM

251 lines (227 loc) 9.15 kB
const Parser = require('tree-sitter');
const USFM3 = require('tree-sitter-usfm3');
const fs = require('node:fs');
const Ajv = require('ajv');
const { USJ_SCHEMA } = require('./utils/usjSchema');

const { Query } = Parser;

// An `\id` marker followed only by whitespace and then another backslash,
// i.e. an id line with no book code.
const bookCodeMissingPattern = /\\id[\s\n\r]*\\/;
// `\v` / `\c` immediately followed by digits (no separating space).
const vWithoutSpacePattern = /(\\v)(\d+)/;
const cWithoutSpacePattern = /(\\c)(\d+)/;
// Any of the listed markers followed by a digit, whitespace or end of input.
const validMarkersPattern = new RegExp(
  '(\\\\id|\\\\usfm|\\\\ide|\\\\ref|\\\\h|\\\\toc|\\\\toca|\\\\sts|\\\\rem|\\\\restore|' +
  '\\\\lit|\\\\iqt|\\\\imt|\\\\imte|\\\\is|\\\\io|\\\\ior|\\\\iot|\\\\ip|\\\\im|\\\\ipi|' +
  '\\\\imi|\\\\ili|\\\\ipq|\\\\imq|\\\\ipr|\\\\ib|\\\\iq|\\\\ie|\\\\iex|\\\\v|\\\\va|\\\\vp|' +
  '\\\\c|\\\\cl|\\\\ca|\\\\cp|\\\\cd|\\\\mt|\\\\mte|\\\\ms|\\\\mr|\\\\s|\\\\sr|\\\\r|\\\\sp|' +
  '\\\\d|\\\\sd|\\\\p|\\\\m|\\\\po|\\\\pr|\\\\cls|\\\\pmo|\\\\pm|\\\\pmc|\\\\pmr|\\\\pi|\\\\mi|' +
  '\\\\nb|\\\\pc|\\\\ph|\\\\phi|\\\\b|\\\\q|\\\\qr|\\\\qc|\\\\qs|\\\\qa|\\\\qac|\\\\qm|\\\\qd|' +
  '\\\\lh|\\\\lf|\\\\li|\\\\lim|\\\\liv|\\\\lik|\\\\litl|\\\\tr|\\\\th|\\\\thr|\\\\tc|\\\\tcr|' +
  '\\\\f|\\\\fe|\\\\ef|\\\\fr|\\\\fq|\\\\fqa|\\\\fk|\\\\fl|\\\\fw|\\\\fp|\\\\ft|\\\\fdc|\\\\fv|' +
  '\\\\fm|\\\\x|\\\\xo|\\\\xk|\\\\xq|\\\\xt|\\\\xta|\\\\xop|\\\\xot|\\\\xnt|\\\\xdc|\\\\rq|' +
  '\\\\add|\\\\bk|\\\\dc|\\\\k|\\\\nd|\\\\ord|\\\\pn|\\\\png|\\\\addpn|\\\\qt|\\\\sig|\\\\sls|' +
  '\\\\tl|\\\\wj|\\\\em|\\\\bd|\\\\it|\\\\bdit|\\\\no|\\\\sc|\\\\sup|\\\\ndx|\\\\pro|\\\\rb|' +
  '\\\\w|\\\\wg|\\\\wh|\\\\wa|\\\\fig|\\\\jmp|\\\\pb|\\\\z|\\\\esb|\\\\esbe|\\\\cat)' +
  '(\\d|\\s|\\n|\\r|$)',
);

/**
 * Returns the grammar type of a syntax node, or null when the node is absent
 * (for example the previous sibling of a first child).
 */
function nodeType(node) {
  return node ? node.type : null;
}

/**
 * Replaces the first literal occurrence of `search` in `text` with
 * `replacement`. A replacer function is used so that `$` sequences inside
 * `replacement` are inserted verbatim instead of being read as patterns.
 */
function replaceLiteral(text, search, replacement) {
  return text.replace(search, () => replacement);
}

/** True when none of the node's direct children has type `paragraph`. */
function lacksParagraphChild(node) {
  return !node.children.some(ch => ch.type === 'paragraph');
}

/**
 * Validates USJ objects against a JSON schema and USFM text against the
 * tree-sitter USFM3 grammar, and attempts rule-based repairs of USFM text.
 * The outcome of the most recent operation is described in `this.message`.
 */
class Validator {
  /**
   * @param {string|null} [usjSchemaPath=null] Path of a JSON schema file to
   *   validate USJ against; when null the bundled `USJ_SCHEMA` is compiled.
   * @throws {Error} When the schema cannot be read, parsed or compiled.
   */
  constructor(usjSchemaPath = null) {
    this.USFMParser = new Parser();
    this.USFMParser.setLanguage(USFM3);
    // Kept on the instance only; the imported Parser module is not modified.
    this.parserOptions = {
      bufferSize: 1024 * 1024,
    };
    this.USFMErrors = [];

    // Load the schema for validation
    this.USJValidator = null;
    try {
      const ajv = new Ajv();
      if (usjSchemaPath === null) {
        this.USJValidator = ajv.compile(USJ_SCHEMA);
      } else {
        const schemaStr = fs.readFileSync(usjSchemaPath, 'utf8');
        const schema = JSON.parse(schemaStr);
        this.USJValidator = ajv.compile(schema);
      }
    } catch (error) {
      // Keep the original failure reachable through `cause`.
      throw new Error(`Error loading schema: ${error}`, { cause: error });
    }

    this.message = '';
    this.modifiedUSFM = '';
    this.usfm = '';
  }

  /**
   * Checks a USJ object against the compiled schema.
   * On failure `this.message` holds one line per schema violation.
   *
   * @param {*} usj The USJ value to validate.
   * @returns {boolean} True when the value satisfies the schema.
   */
  isValidUSJ(usj) {
    this.message = '';
    if (this.USJValidator(usj) === true) {
      return true;
    }
    for (const err of this.USJValidator.errors) {
      this.message += `Error at ${err.instancePath}: ${err.message}\n`;
    }
    return false;
  }

  /**
   * Parses USFM text and collects ERROR and MISSING nodes into
   * `this.USFMErrors`. On failure `this.message` holds the formatted errors;
   * on success it is cleared.
   *
   * @param {string} usfm The USFM text to check.
   * @returns {boolean} True when the parse tree has no error or missing nodes.
   */
  isValidUSFM(usfm) {
    this.usfm = usfm;
    this.USFMErrors = [];
    // Reset so a message from an earlier failed check does not linger.
    this.message = '';

    // Longer inputs are parsed with the explicit buffer size
    // (presumably the parser's default buffer is too small for them).
    const tree = usfm.length > 25000
      ? this.USFMParser.parse(usfm, null, this.parserOptions)
      : this.USFMParser.parse(usfm);

    const errorQuery = new Query(USFM3, '(ERROR) @errors');
    for (const capture of errorQuery.captures(tree.rootNode)) {
      this.USFMErrors.push(capture.node);
    }
    this.checkForMissing(tree.rootNode);

    if (this.USFMErrors.length > 0) {
      this.message = this.formatErrors();
      return false;
    }
    return true;
  }

  /**
   * Walks the tree below `node` and appends every MISSING node to
   * `this.USFMErrors`. Children of a MISSING node are not visited.
   *
   * @param {object} node Syntax node whose descendants are inspected.
   */
  checkForMissing(node) {
    for (const n of node.children) {
      if (n.isMissing) {
        this.USFMErrors.push(n);
      } else {
        this.checkForMissing(n);
      }
    }
  }

  /**
   * Builds a report from `this.USFMErrors` and `this.usfm`.
   * MISSING nodes are reported by offset with a short excerpt around them;
   * other nodes by row and column with the offending text.
   *
   * @returns {string} The multi-line report.
   */
  formatErrors() {
    const errLines = this.USFMErrors.map(err => {
      if (err.isMissing) {
        const start = Math.max(0, err.startIndex - 3);
        const end = Math.min(this.usfm.length, err.startIndex + 10);
        return `At ${err.startIndex}:Missing something here:${this.usfm.slice(start, end)}`;
      }
      return `At ${err.startPosition.row}:${err.startPosition.column}, Error: ${this.usfm.substring(err.startIndex, err.endIndex)}`;
    });
    return `Errors present:\n\t${errLines.join('\n\t')}`;
  }

  /**
   * Attempts to repair USFM text. Each collected error is matched against an
   * ordered list of rules (first match wins) and the text is patched; the
   * result is validated again and the process repeats until the text is
   * valid or a pass makes no change. `this.message` describes the outcome.
   *
   * @param {string} usfm The USFM text to repair.
   * @param {boolean} [fixed=false] True on recursive calls, i.e. when at
   *   least one change has already been applied; only affects the message.
   * @returns {string} The repaired text, or the best-effort text when some
   *   errors could not be fixed.
   */
  autoFixUSFM(usfm, fixed = false) {
    if (this.isValidUSFM(usfm)) {
      this.message = fixed ? 'Fixed Errors in USFM' : 'No Errors in USFM';
      return usfm;
    }

    let modifiedUSFM = usfm;
    let changed = false;
    for (const error of this.USFMErrors) {
      // Node offsets refer to `usfm` (the text that was parsed), so every
      // excerpt is taken from `usfm`, never from the partially patched text.
      const errorText = usfm.substring(error.startIndex, error.endIndex);
      const parentType = nodeType(error.parent);
      const prevType = nodeType(error.previousSibling);

      if (error.isError
        && errorText.startsWith('\\s5')
        && lacksParagraphChild(error)) {
        // No \p after \s5
        modifiedUSFM = modifiedUSFM.replace(/\\s5[\s\n\r]*/g, '\\s5 \n\\p\n');
        changed = true;
      } else if (error.isMissing
        && parentType === 'sTag'
        && error.toString() === '(MISSING " ")') {
        // Missing space after \s5
        modifiedUSFM = modifiedUSFM.replace(/\\s5\n/g, '\\s5 \n');
        changed = true;
      } else if (bookCodeMissingPattern.test(modifiedUSFM)) {
        // Book code is missing (empty id marker)
        modifiedUSFM = modifiedUSFM.replace(/\\id[\s\n\r]*\\/g, '\\id XXX xxx\n\\');
        changed = true;
      } else if (error.isError
        && errorText.startsWith('\\v')
        && parentType === 's'
        && lacksParagraphChild(error)) {
        // \p not given after section heading
        const toReplace = usfm.slice(error.parent.startIndex, error.startIndex);
        modifiedUSFM = replaceLiteral(modifiedUSFM, toReplace, `${toReplace}\\p\n`);
        changed = true;
      } else if (vWithoutSpacePattern.test(errorText)) {
        // Space missing between \v and number
        modifiedUSFM = modifiedUSFM.replace(vWithoutSpacePattern, '$1 $2');
        changed = true;
      } else if (cWithoutSpacePattern.test(errorText)) {
        // Space missing between \c and number
        modifiedUSFM = modifiedUSFM.replace(cWithoutSpacePattern, '$1 $2');
        changed = true;
      } else if (error.isError
        && errorText.startsWith('\\v')
        && prevType === 'chapter'
        && lacksParagraphChild(error)) {
        // \p not given at chapter start
        const toReplace = usfm.slice(error.previousSibling.startIndex, error.startIndex);
        modifiedUSFM = replaceLiteral(modifiedUSFM, toReplace, `${toReplace}\\p\n`);
        changed = true;
      } else if (error.isError
        && !errorText.startsWith('\\')
        && prevType === 'chapter'
        && lacksParagraphChild(error)) {
        // Plain text directly after a chapter, without \p
        const toReplace = usfm.slice(error.previousSibling.startIndex, error.startIndex);
        modifiedUSFM = replaceLiteral(modifiedUSFM, toReplace, `${toReplace}\\p\n`);
        changed = true;
      } else if (errorText.startsWith('\\') && !validMarkersPattern.test(errorText)) {
        // Stray slash not with a valid marker: drop the backslash
        modifiedUSFM = replaceLiteral(modifiedUSFM, errorText, errorText.slice(1));
        changed = true;
      } else if (errorText.startsWith('\\') || errorText.trim() === '|') {
        // Either a single problematic marker (could be w/o text) or an empty
        // attribute: remove the error text, located via up to five
        // characters of context on each side.
        const start = Math.max(0, error.startIndex - 5);
        const end = Math.min(usfm.length, error.endIndex + 5);
        const toReplace = usfm.slice(start, end);
        const replacement = usfm.slice(start, error.startIndex) + usfm.slice(error.endIndex, end);
        modifiedUSFM = replaceLiteral(modifiedUSFM, toReplace, replacement);
        changed = true;
      } else if (parentType === 'chapter'
        && prevType === 'c'
        && !errorText.includes('\\')) {
        // Stray content in the chapter line
        modifiedUSFM = modifiedUSFM.replace(errorText, '');
        changed = true;
      }
    }

    if (!changed || modifiedUSFM === usfm) {
      const errStr = this.formatErrors();
      this.message = `Cannot fix these errors:\n\t${errStr}`;
      return modifiedUSFM;
    }
    // Re-validate the patched text; further passes may fix remaining errors.
    return this.autoFixUSFM(modifiedUSFM, true);
  }
}

exports.Validator = Validator;