UNPKG

usfm-grammar

Version:

Uses the tree-sitter-usfm3 parser to convert USFM files to other formats such as USJ, USX, and CSV, and converts them back to USFM

359 lines (322 loc) 11.2 kB
const assert = require("assert");
const fs = require("node:fs");
const Ajv = require("ajv");
const {
  allUsfmFiles,
  initialiseParser,
  isValidUsfm,
  excludeUSJs,
  findAllMarkers,
} = require("./config");
const {USFMParser, Filter} = require("../src/index");

// Cache of parsed USFM files and the USJ generated from them, keyed by file
// path, so that each file is parsed only once. Sharing it between tests is not
// a testing violation: every test reads the same USFM in and only checks
// different properties of the same USJ out.
// The parsed USJ JSON schema is stored in the same Map under "ajvSchema".
const parsedCache = new Map();

/**
 * Look up the pre-parsed entry for a USFM file and fail the calling test
 * if the file never made it into the cache.
 *
 * @param {string} filepath - Path of the USFM file, as listed in allUsfmFiles.
 * @returns {{parser: object, usj: object, usfm: string}} The cached entry.
 */
function getCachedEntry(filepath) {
  const cached = parsedCache.get(filepath);
  assert(cached, `File ${filepath} should be in cache`);
  return cached;
}

// Root-level setup: load the schema and pre-parse every valid USFM sample.
before(async function () {
  // Parsing many files is slow, so raise the hook timeout to 90 seconds.
  this.timeout(90_000);

  console.log("Initializing USJ test cache...");
  const schemaStr = fs.readFileSync("../schemas/usj.js", "utf8");
  parsedCache.set("ajvSchema", JSON.parse(schemaStr));

  for (const filepath of allUsfmFiles) {
    if (!isValidUsfm[filepath]) {
      continue;
    }
    try {
      const parser = await initialiseParser(filepath);
      const usj = parser.toUSJ();
      parsedCache.set(filepath, {parser, usj, usfm: parser.usfm});
    } catch (error) {
      // Keep going: tests for this file fail later on the missing cache entry.
      console.error(`Failed to pre-parse ${filepath}: ${error.message}`);
    }
  }

  // The schema entry lives in the same Map, so leave it out of the file count.
  console.log(`Cached ${parsedCache.size - 1} USFM files for testing`);
});

beforeEach(() => {
  // global.gc is only defined when the runtime exposes it; skip otherwise.
  if (global.gc) {
    global.gc();
  }
});

describe("Check successful USFM-USJ conversion for positive samples", () => {
  allUsfmFiles.forEach(function (value) {
    if (isValidUsfm[value]) {
      it(`Convert ${value} to USJ`, () => {
        const {usj} = getCachedEntry(value);
        assert(usj instanceof Object);
        assert.strictEqual(usj["type"], "USJ");
        assert.strictEqual(usj["version"], "3.1");
        assert.strictEqual(usj.content[0].type, "book");
        assert.strictEqual(usj.content[0].marker, "id");
      });
    }
  });
});

describe("Compare generated USJ with testsuite sample", () => {
  allUsfmFiles.forEach(function (filepath) {
    const usjPath = filepath.replace(".usfm", ".json");
    if (isValidUsfm[filepath] && !excludeUSJs.includes(usjPath)) {
      it(`Compare generated USJ to ${usjPath}`, function () {
        const cached = getCachedEntry(filepath);

        let fileData = null;
        try {
          fileData = fs.readFileSync(usjPath, "utf8");
        } catch (err) {
          // No reference USJ for this sample: mark the test as skipped.
          if (err.code === "ENOENT") {
            this.skip();
          }
          throw err;
        }

        // Deep clone so the normalisation below never touches the cached USJ.
        const generatedUSJ = JSON.parse(JSON.stringify(cached.usj));
        const testsuiteUSJ = JSON.parse(fileData);

        stripDefaultAttribValue(testsuiteUSJ);
        removeNewlinesInText(testsuiteUSJ);
        stripTextValue(testsuiteUSJ);

        removeNewlinesInText(generatedUSJ);
        stripTextValue(generatedUSJ);

        assert.deepEqual(generatedUSJ, testsuiteUSJ);
      });
    }
  });
});

describe("Test USFM-USJ-USFM roundtripping", () => {
  allUsfmFiles.forEach(function (filepath) {
    if (isValidUsfm[filepath]) {
      it(`Roundtrip ${filepath} via USJ`, function () {
        const {usj, usfm: originalUsfm} = getCachedEntry(filepath);

        // Build a second parser from the USJ and read back the USFM it holds.
        const testParser2 = new USFMParser(null, usj);
        const generatedUSFM = testParser2.usfm;

        assert.strictEqual(typeof generatedUSFM, "string");
        assert(generatedUSFM.startsWith("\\id"));

        const inputMarkers = findAllMarkers(originalUsfm);
        const finalMarkers = findAllMarkers(generatedUSFM);
        assert.deepStrictEqual(
          inputMarkers,
          finalMarkers,
          `Markers in input and generated USFMs differ`
        );
      });
    }
  });
});

describe("Ensure all markers are in USJ", () => {
  allUsfmFiles.forEach(function (filepath) {
    if (isValidUsfm[filepath]) {
      it(`Check for markers of ${filepath} in USJ`, function () {
        const {usj, usfm: originalUsfm} = getCachedEntry(filepath);

        const inputMarkers = [...new Set(findAllMarkers(originalUsfm, true))];
        const allUSJTypes = getTypes(usj);
        assert.deepStrictEqual(
          inputMarkers,
          allUSJTypes,
          `Markers in input and generated USJ differ`
        );
      });
    }
  });
});
describe("Validate USJ against schema", () => {
  // Test generated USJ against the USJ schema.
  // The validator is built in a hook rather than in the describe body: the
  // schema is put into parsedCache by the root-level before() hook, which has
  // not run yet while describe callbacks are being collected.
  let validate;

  before(() => {
    const schema = parsedCache.get("ajvSchema");
    assert(schema, "USJ schema should be in cache");
    validate = new Ajv().compile(schema);
  });

  allUsfmFiles.forEach(function (value) {
    if (isValidUsfm[value]) {
      it(`Validate USJ generated from ${value}`, () => {
        const cached = parsedCache.get(value);
        assert(cached, `File ${value} should be in cache`);
        // validate.errors is populated by the validate() call evaluated first.
        assert(validate(cached.usj), JSON.stringify(validate.errors, null, 2));
      });
    }
  });
});

describe("Test Exclude Marker option", () => {
  // Test the exclude-marker option by checking which markers remain in the USJ.
  const excludeTests = [
    ["v", "c"],
    Filter.PARAGRAPHS,
    [...Filter.TITLES, ...Filter.BOOK_HEADERS],
  ];
  excludeTests.forEach(function (exList) {
    allUsfmFiles.forEach(function (filepath) {
      if (isValidUsfm[filepath]) {
        it(`Exclude ${exList.slice(0, 5)} from ${filepath}`, function () {
          const cached = parsedCache.get(filepath);
          assert(cached, `File ${filepath} should be in cache`);

          // The cached USJ was built without filters, so regenerate it here.
          const usj = cached.parser.toUSJ(exList);
          const types = new Set(getTypes(usj));
          const intersection = exList.filter((marker) => types.has(marker));
          assert.deepStrictEqual(intersection, []);
        });
      }
    });
  });
});

describe("Test Include Marker option", () => {
  const includeTests = [
    ["v", "c"],
    Filter.PARAGRAPHS,
    [...Filter.TITLES, ...Filter.BOOK_HEADERS],
  ];
  includeTests.forEach(function (inList) {
    allUsfmFiles.forEach(function (filepath) {
      if (isValidUsfm[filepath]) {
        it(`Include ${inList.slice(0, 5)} in ${filepath}`, function () {
          const cached = parsedCache.get(filepath);
          assert(cached, `File ${filepath} should be in cache`);

          // The cached USJ was built without filters, so regenerate it here.
          const usj = cached.parser.toUSJ(null, inList);
          // Trailing level numbers are stripped before comparing with inList.
          const allUSJTypes = getTypes(usj, false);
          assert(
            allUSJTypes.every((element) => inList.includes(element)),
            `Markers found in USJ: ${allUSJTypes}`
          );
        });
      }
    });
  });
});

const INVALID_USJ_MESSAGE = "Invalid input for USJ. Expected USJ json object.";

/**
 * Construct a parser from `usj` and, if construction throws, check that the
 * error carries the invalid-USJ message.
 *
 * NOTE(review): as in the original tests, nothing fails when the constructor
 * does not throw at all. Confirm whether every input below is meant to be
 * rejected before tightening this to assert.throws.
 *
 * @param {*} usj - Candidate (malformed) USJ input.
 */
function assertInvalidUsjMessage(usj) {
  try {
    new USFMParser(null, usj);
  } catch (err) {
    assert.strictEqual(err.message, INVALID_USJ_MESSAGE);
  }
}

describe("Try invalid USJ", () => {
  it("without type", () => {
    assertInvalidUsjMessage({"some key": "qwerty", content: []});
  });

  it("interger", () => {
    assertInvalidUsjMessage({type: "para", content: [1, 2, 3]});
  });

  it("content with array", () => {
    assertInvalidUsjMessage({
      "some key": "qwerty",
      content: [["test", "test", "test"]],
    });
  });
});

/**
 * Tell whether `node` carries its own "content" property.
 * null/undefined count as "no content" instead of throwing.
 *
 * @param {*} node - Any value found in a USJ tree.
 * @returns {boolean}
 */
function hasOwnContent(node) {
  return node != null && Object.prototype.hasOwnProperty.call(node, "content");
}

/**
 * Trim every text snippet in a USJ tree, in place.
 * Trailing and preceding space handling can differ between tcdocs and our
 * logic, so both sides are stripped before comparison.
 *
 * @param {object} usjObj - USJ node; mutated.
 */
function stripTextValue(usjObj) {
  if (!hasOwnContent(usjObj)) {
    return;
  }
  usjObj.content.forEach((item, index) => {
    if (typeof item === "string") {
      usjObj.content[index] = item.trim();
    } else {
      stripTextValue(item); // Recurse into nested nodes
    }
  });
}

/**
 * Collapse every run of whitespace (newlines included) in text snippets to a
 * single space and drop empty text snippets, in place.
 * The testsuite samples do not preserve new lines but usfm-grammar does, so
 * they are removed just for the comparison.
 *
 * @param {object} usjDict - USJ node; mutated.
 */
function removeNewlinesInText(usjDict) {
  if (!hasOwnContent(usjDict)) {
    return;
  }
  usjDict.content.forEach((item, index) => {
    if (typeof item === "string") {
      // \s+ also matches "\n", so one pass handles newlines and repeated spaces.
      usjDict.content[index] = item.replace(/\s+/g, " ");
    } else {
      removeNewlinesInText(item); // Recurse into nested nodes
    }
  });
  // The number of empty text snippets may differ between the two sides, so
  // drop them and keep everything else.
  usjDict.content = usjDict.content.filter((item) => item !== "");
}

/**
 * Trim the "lemma" attribute of every \w char node, in place.
 * The samples in the test suite have a space in lemma values when the lemma
 * is given as the default attribute.
 *
 * @param {object} usjDict - USJ node; mutated.
 */
function stripDefaultAttribValue(usjDict) {
  if (!hasOwnContent(usjDict)) {
    return;
  }
  usjDict.content.forEach((item) => {
    if (item === null || typeof item !== "object" || Array.isArray(item)) {
      return;
    }
    const isWordChar = item.type === "char" && item.marker === "w";
    if (isWordChar && Object.prototype.hasOwnProperty.call(item, "lemma")) {
      item.lemma = item.lemma.trim();
    }
    stripDefaultAttribValue(item); // Recurse into nested nodes
  });
}

/**
 * Append the marker names found in `element` and its descendants to `types`,
 * in document order (duplicates included).
 *
 * @param {*} element - USJ node or text snippet.
 * @param {string[]} types - Accumulator; mutated.
 */
function collectTypes(element, types) {
  if (element === null || typeof element !== "object") {
    return; // Text snippets (and other primitives) carry no markers
  }
  if ("marker" in element) {
    types.push(element.marker);
  }
  if (element.type === "ref") {
    types.push("ref");
  }
  // Alternate / published numbers are attributes in USJ; they are reported
  // under the chapter or verse form of the marker name.
  if ("altnumber" in element) {
    types.push(element.marker === "c" ? "ca" : "va");
  }
  if ("pubnumber" in element) {
    types.push(element.marker === "c" ? "cp" : "vp");
  }
  if ("category" in element) {
    types.push("cat");
  }
  if ("content" in element) {
    element.content.forEach((item) => collectTypes(item, types));
  }
}

/**
 * List the distinct marker names used anywhere in a USJ tree, in order of
 * first appearance.
 *
 * @param {*} element - USJ node (or text snippet, which yields []).
 * @param {boolean} [keepNumber=true] - When false, trailing digits are
 *   stripped from each name and the result is de-duplicated again.
 * @returns {string[]} Unique marker names.
 */
function getTypes(element, keepNumber = true) {
  const types = [];
  collectTypes(element, types);
  const uniqueTypes = [...new Set(types)];
  if (keepNumber) {
    return uniqueTypes;
  }
  return [...new Set(uniqueTypes.map((item) => item.replace(/\d+$/, "")))];
}