usfm-grammar
Version:
Uses the tree-sitter-usfm3 parser to convert USFM files to other formats such as USJ, USX, and CSV, and converts them back to USFM
300 lines (260 loc) • 10.3 kB
JavaScript
const assert = require('assert');
const fs = require('node:fs');
const Ajv = require('ajv');
const {allUsfmFiles, initialiseParser, isValidUsfm, excludeUSJs, findAllMarkers} = require('./config');
const {USFMParser, Filter} = require("../src/index");
// Smoke test: every sample flagged valid in isValidUsfm must parse and
// produce a USJ document whose first content entry is the book "id" node.
describe("Check successful USFM-USJ conversion for positive samples", () => {
    const positiveSamples = allUsfmFiles.filter((samplePath) => isValidUsfm[samplePath]);
    for (const samplePath of positiveSamples) {
        it(`Convert ${samplePath} to USJ`, () => {
            // Tests that the input parses without errors
            const parser = initialiseParser(samplePath);
            assert(parser instanceof USFMParser);

            const usj = parser.toUSJ();
            assert(usj instanceof Object);

            // Root envelope of the generated document
            assert.strictEqual(usj.type, "USJ");
            assert.strictEqual(usj.version, "3.1");

            // The first content entry has to be the book identification node
            const firstNode = usj.content[0];
            assert.strictEqual(firstNode.type, "book");
            assert.strictEqual(firstNode.marker, "id");
        });
    }
});
// Compares the USJ generated for each valid sample with the reference JSON
// stored next to it (same path with ".usfm" replaced by ".json"), skipping
// any reference listed in excludeUSJs.
describe("Compare generated USJ with testsuite sample", () => {
    allUsfmFiles.forEach((usfmPath) => {
        const usjPath = usfmPath.replace(".usfm", ".json");
        if (!isValidUsfm[usfmPath] || excludeUSJs.includes(usjPath)) {
            return;
        }
        it(`Compare generated USJ to ${usjPath}`, () => {
            // Parse first so parser failures surface even when no reference file exists
            const testParser = initialiseParser(usfmPath);
            const generatedUSJ = testParser.toUSJ();

            let fileData;
            try {
                fileData = fs.readFileSync(usjPath, "utf8");
            } catch (err) {
                if (err.code === "ENOENT") {
                    // No reference USJ exists for this sample: nothing to compare against
                    return;
                }
                // Any other read failure (permissions, path is a directory, ...) is a
                // genuine problem; previously it was swallowed and resurfaced later as
                // an unrelated TypeError on a null document.
                throw err;
            }
            const testsuiteUSJ = JSON.parse(fileData);

            // Normalise both documents so known, irrelevant differences do not fail the comparison
            stripDefaultAttribValue(testsuiteUSJ);
            removeNewlinesInText(testsuiteUSJ);
            stripTextValue(testsuiteUSJ);
            removeNewlinesInText(generatedUSJ);
            stripTextValue(generatedUSJ);

            // NOTE(review): loose deepEqual is kept on purpose; switching to
            // deepStrictEqual may be stricter than the reference data allows - confirm.
            assert.deepEqual(generatedUSJ, testsuiteUSJ);
        });
    });
});
// Round trip: USFM -> USJ -> USFM. The regenerated USFM must be a string
// starting with "\id" and must yield the same findAllMarkers() result as
// the input USFM.
describe("Test USFM-USJ-USFM roundtripping", () => {
    allUsfmFiles.forEach((usfmPath) => {
        if (!isValidUsfm[usfmPath]) {
            return;
        }
        it(`Roundtrip ${usfmPath} via USJ`, () => {
            const testParser = initialiseParser(usfmPath);
            assert(testParser instanceof USFMParser);
            const usj = testParser.toUSJ();
            assert(usj instanceof Object);

            // Positional arguments (no USFM string, USJ object second).
            // JavaScript has no keyword arguments: the former `name=value`
            // call style assigned to undeclared variables and leaked globals.
            const testParser2 = new USFMParser(null, usj);
            const generatedUSFM = testParser2.usfm;
            assert.strictEqual(typeof generatedUSFM, 'string');
            assert(generatedUSFM.startsWith("\\id"));

            const inputMarkers = findAllMarkers(testParser.usfm);
            const finalMarkers = findAllMarkers(generatedUSFM);
            assert.deepStrictEqual(inputMarkers, finalMarkers, `Markers in input and generated USFMs differ`);
        });
    });
});
// Tests if all markers in the USFM are present in the USJ output as well:
// the de-duplicated findAllMarkers() result must equal getTypes(usj).
describe("Ensure all markers are in USJ", () => {
    allUsfmFiles.forEach((usfmPath) => {
        if (!isValidUsfm[usfmPath]) {
            return;
        }
        it(`Check for markers of ${usfmPath} in USJ`, () => {
            const testParser = initialiseParser(usfmPath);
            assert(testParser instanceof USFMParser);
            const usj = testParser.toUSJ();
            assert(usj instanceof Object);

            // Second positional argument was previously written `keepId=true`,
            // which assigned to an undeclared variable and leaked a global.
            const inputMarkers = [...new Set(findAllMarkers(testParser.usfm, true))];
            const allUSJTypes = getTypes(usj);
            assert.deepStrictEqual(inputMarkers, allUSJTypes, `Markers in input and generated USJ differ`);
        });
    });
});
// Test generated USJ against the USJ schema.
describe("Validate USJ against schema", () => {
    const ajv = new Ajv();
    // NOTE(review): the path is resolved against the process working directory,
    // so this presumably expects the tests to be run from the test folder - confirm.
    const schemaStr = fs.readFileSync("../schemas/usj.js", 'utf8');
    const schema = JSON.parse(schemaStr);
    const validate = ajv.compile(schema);

    allUsfmFiles.forEach((usfmPath) => {
        if (!isValidUsfm[usfmPath]) {
            return;
        }
        it(`Validate USJ generated from ${usfmPath}`, () => {
            const testParser = initialiseParser(usfmPath);
            assert(testParser instanceof USFMParser);
            const usj = testParser.toUSJ();
            assert(usj instanceof Object);

            // Report the validator's error list so a failure shows which
            // schema rule was violated instead of a bare "false == true".
            const isValid = validate(usj);
            assert(isValid, `USJ schema validation failed: ${JSON.stringify(validate.errors)}`);
        });
    });
});
// Test the Exclude Marker option by checking the markers present in the USJ:
// none of the excluded markers may show up in getTypes(usj).
describe("Test Exclude Marker option", () => {
    const excludeTests = [
        ['v', 'c'],
        Filter.PARAGRAPHS,
        [...Filter.TITLES, ...Filter.BOOK_HEADERS],
    ];
    excludeTests.forEach((exList) => {
        allUsfmFiles.forEach((usfmPath) => {
            if (!isValidUsfm[usfmPath]) {
                return;
            }
            // Only the first five markers go into the title to keep it readable
            it(`Exclude ${exList.slice(0, 5)} from ${usfmPath}`, () => {
                const testParser = initialiseParser(usfmPath);
                assert(testParser instanceof USFMParser);

                // The exclude list is the first positional argument; the former
                // `excludeMarkers=exList` spelling leaked an implicit global.
                const usj = testParser.toUSJ(exList);
                assert(usj instanceof Object);

                const presentTypes = new Set(getTypes(usj));
                const leaked = exList.filter((marker) => presentTypes.has(marker));
                assert.deepStrictEqual(leaked, []);
            });
        });
    });
});
// Test the Include Marker option by checking the markers present in the USJ:
// every type reported by getTypes (trailing digits stripped) must be one of
// the included markers.
describe("Test Include Marker option", () => {
    const includeTests = [
        ['v', 'c'],
        Filter.PARAGRAPHS,
        [...Filter.TITLES, ...Filter.BOOK_HEADERS],
    ];
    includeTests.forEach((inList) => {
        const allowed = new Set(inList);
        allUsfmFiles.forEach((usfmPath) => {
            if (!isValidUsfm[usfmPath]) {
                return;
            }
            // Only the first five markers go into the title to keep it readable
            it(`Include ${inList.slice(0, 5)} in ${usfmPath}`, () => {
                const testParser = initialiseParser(usfmPath);
                assert(testParser instanceof USFMParser);

                // Include list is the second positional argument of toUSJ
                const usj = testParser.toUSJ(null, inList);
                assert(usj instanceof Object);

                // `false` is passed positionally; the former `keepNumber=false`
                // spelling assigned to an undeclared variable (implicit global).
                const allUSJTypes = getTypes(usj, false);
                const unexpected = allUSJTypes.filter((type) => !allowed.has(type));
                assert.deepStrictEqual(
                    unexpected,
                    [],
                    `Markers outside the include list found in USJ: ${unexpected.join(", ")}`
                );
            });
        });
    });
});
// Constructing a parser from a malformed USJ object must fail with the
// "Invalid input for USJ" error. assert.throws is used so that a constructor
// that does NOT throw fails the test; the previous try/catch form passed
// silently in that case.
describe("Try invlaid USJ", () => {
    const expectedError = { message: "Invalid input for USJ. Expected USJ json object." };

    it("without type", () => {
        const usj = {"some key":"qwerty", "content": []};
        assert.throws(() => new USFMParser(null, usj), expectedError);
    });

    it("interger", () => {
        const usj = {"type":"para", "content": [1, 2, 3]};
        assert.throws(() => new USFMParser(null, usj), expectedError);
    });

    it("content with array", () => {
        const usj = {"some key":"qwerty", "content": [["test", "test", "test"]]};
        assert.throws(() => new USFMParser(null, usj), expectedError);
    });
});
/**
 * Trims every text item in a USJ tree and drops the items that become empty.
 *
 * Trailing and preceding space handling can be different between tcdocs and
 * our logic, so both sides are stripped before comparison. The tree is
 * modified in place: each node's `content` array is replaced by its
 * normalised version.
 *
 * @param {object} usjObj - USJ node (or document root); values without a
 *     `content` array are left untouched.
 * @returns {void}
 */
function stripTextValue(usjObj) {
    if (usjObj === null || typeof usjObj !== 'object' || !Array.isArray(usjObj.content)) {
        return;
    }
    const trimmed = usjObj.content.map((item) => {
        if (typeof item === 'string') {
            return item.trim(); // Strip spaces from strings
        }
        stripTextValue(item); // Recursively handle nested objects
        return item;
    });
    // Filter once, after the whole array is trimmed, and keep the NON-empty
    // items. The earlier version filtered inside the loop with an inverted
    // predicate, which discarded the real content.
    usjObj.content = trimmed.filter((item) => item !== "");
}
function removeNewlinesInText(usjDict) {
    /* The test samples in the testsuite do not preserve new lines, while
       usfm-grammar does. For comparison only, every run of whitespace
       (newlines included) in a text item is collapsed to a single space.
       The tree is modified in place. */
    if (!usjDict.hasOwnProperty("content")) {
        return;
    }
    const children = usjDict["content"];
    children.forEach((child, position) => {
        if (typeof child !== 'string') {
            removeNewlinesInText(child); // Descend into nested nodes
            return;
        }
        // "\n" is itself whitespace, so one pass covers both the newline
        // replacement and the squeezing of repeated spaces
        children[position] = child.replace(/\s+/g, " ");
    });
}
function stripDefaultAttribValue(usjDict) {
    /* The USX samples in the test suite have spaces in lemma values when the
       lemma is given as the default attribute. Trim the lemma of every
       char node with marker "w", at any depth, modifying the tree in place. */
    if (!usjDict.hasOwnProperty("content")) {
        return;
    }
    // Only non-array objects are treated as nodes; strings and arrays are skipped
    const isNode = (entry) => typeof entry === 'object' && !Array.isArray(entry);
    usjDict["content"].filter(isNode).forEach((node) => {
        const isWordChar = node["type"] === "char" && node["marker"] === "w";
        if (isWordChar && node.hasOwnProperty("lemma")) {
            node["lemma"] = node["lemma"].trim(); // Strip spaces from 'lemma'
        }
        stripDefaultAttribValue(node); // Descend into nested nodes
    });
}
/**
 * Collects the distinct marker names found in a USJ tree, in order of first
 * appearance (node first, then its content, depth first).
 *
 * Besides each node's own `marker`, these pseudo markers are reported:
 *   - "ref" for nodes whose type is "ref"
 *   - "ca" / "va" when a node has `altnumber` ("ca" if its marker is "c")
 *   - "cp" / "vp" when a node has `pubnumber` ("cp" if its marker is "c")
 *   - "cat" when a node has `category`
 *
 * @param {object|string} element - USJ node or text item; strings contribute nothing.
 * @param {boolean} [keepNumber=true] - when false, trailing digits are removed
 *     from every name (e.g. "q1" becomes "q").
 * @returns {string[]} unique names. De-duplication happens after the digits
 *     are stripped, so "q1" and "q2" collapse into a single "q".
 */
function getTypes(element, keepNumber=true) {
    const found = [];

    // Walks the tree once, appending names to the shared accumulator
    const visit = (node) => {
        if (typeof node === 'string') {
            return;
        }
        if ('marker' in node) {
            found.push(node.marker);
        }
        if (node.type === 'ref') {
            found.push("ref");
        }
        if ('altnumber' in node) {
            found.push(node.marker === 'c' ? 'ca' : 'va');
        }
        if ('pubnumber' in node) {
            found.push(node.marker === 'c' ? 'cp' : 'vp');
        }
        if ('category' in node) {
            found.push('cat');
        }
        if ('content' in node) {
            node.content.forEach(visit);
        }
    };
    visit(element);

    const names = keepNumber ? found : found.map((name) => name.replace(/\d+$/, ''));
    // A Set keeps insertion order, so the first occurrence of each name wins
    return [...new Set(names)];
}