bible-checker
Version:
A Bible tool that runs several checks on a target translation against a source translation
524 lines (502 loc) • 18.6 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", {
value: true
});
exports.checkChapterVerseIntegrity = checkChapterVerseIntegrity;
exports.detectFootnoteQuotes = detectFootnoteQuotes;
exports.detectMissingVerses = detectMissingVerses;
exports.detectNumberMismatches = detectNumberMismatches;
exports.detectRepeatedWordsAndWhitespace = detectRepeatedWordsAndWhitespace;
exports.detectShortLongVerses = detectShortLongVerses;
exports.detectUnmatchedPunctuation = detectUnmatchedPunctuation;
exports.extractChapterVerses = extractChapterVerses;
exports.extractNumbers = extractNumbers;
exports.extractVerses = extractVerses;
exports.normalizeNumber = normalizeNumber;
var _USJHandler = require("./USJHandler.js");
// Lookup table keyed by canonical ASCII digit ("0".."9"). Each value lists the
// glyphs accepted for that digit: the ASCII digit itself followed by its
// Eastern Arabic counterpart.
const numeralMapping = Object.fromEntries(
  [...'٠١٢٣٤٥٦٧٨٩'].map((easternArabic, value) => {
    const western = String(value);
    return [western, [western, easternArabic]];
  })
);
/**
 * Maps a single numeral glyph to its canonical digit.
 * @param {string} symbol - Glyph to look up in `numeralMapping`.
 * @returns {string|null} The first `numeralMapping` key whose variant list
 *   contains `symbol`, or null when no entry lists it.
 */
function normalizeNumber(symbol) {
  const canonical = Object.keys(numeralMapping).find(digit => numeralMapping[digit].includes(symbol));
  return canonical === undefined ? null : canonical;
}
/**
 * Extracts the plain text of every verse from a USJ document.
 *
 * Walks `usj.content` depth-first. Chapter (`c`) and verse (`v`) markers delimit
 * verses; plain strings and the content of any other node (including nested
 * `char` nodes) are appended to the verse currently being built. Footnote (`f`)
 * and cross-reference (`x`) nodes are skipped together with everything inside them.
 *
 * @param {object} usj - Parsed USJ JSON object.
 * @returns {object} Map of "chapter:verse" IDs to their trimmed text content.
 *   Empty when `usj` has no iterable `content`.
 */
function extractVerses(usj) {
  const verses = {};
  let currentChapter = null;
  let currentVerse = null;
  let currentContent = '';

  // Stores the verse being accumulated, if both a chapter and a verse are open.
  function storeCurrentVerse() {
    if (currentChapter && currentVerse) {
      verses[`${currentChapter}:${currentVerse}`] = currentContent.trim();
    }
  }

  function traverse(content) {
    if (typeof content === 'string') {
      currentContent += content;
      return;
    }
    if (!Array.isArray(content)) {
      return;
    }
    for (const item of content) {
      if (typeof item === 'string') {
        // Plain text belongs to the verse currently being built.
        currentContent += item;
      } else if (!item || typeof item !== 'object') {
        // Ignore null / non-object entries instead of crashing on them.
        continue;
      } else if (item.marker === 'f' || item.marker === 'x') {
        // Notes are not verse text: skip the node and all of its children.
        continue;
      } else if (item.marker === 'c' && item.number) {
        // Store the last verse of the previous chapter before resetting;
        // otherwise it would be silently dropped.
        storeCurrentVerse();
        currentChapter = item.number;
        currentVerse = null;
        currentContent = '';
      } else if (item.marker === 'v' && item.number) {
        storeCurrentVerse();
        currentVerse = item.number;
        currentContent = '';
      } else if (item.content) {
        // Recursing (rather than joining) keeps nested nodes from being
        // stringified as "[object Object]".
        traverse(item.content);
      }
    }
  }

  traverse(usj?.content);
  // Save the last verse of the document if one is open.
  storeCurrentVerse();
  return verses;
}
/**
 * Collects the verse numbers found under each chapter of a USJ document.
 *
 * Uses `USJHandler.traverse` to visit the nodes of `text.content`. The current
 * chapter is tracked on the handler instance (`handler.currentChapter`).
 *
 * @param {object} text - USJ JSON object.
 * @returns {object} Map of chapter number to an array of verse numbers
 *   (parsed with `parseInt`, base 10) in the order they were visited.
 */
function extractChapterVerses(text) {
  const handler = new _USJHandler.USJHandler(text);
  const versesByChapter = {};
  const visit = node => {
    const { marker, number } = node;
    if (!number) {
      return;
    }
    if (marker === 'c') {
      // Entering a chapter: remember it and make sure it has a verse list.
      handler.currentChapter = number;
      versesByChapter[handler.currentChapter] = versesByChapter[handler.currentChapter] || [];
    } else if (marker === 'v') {
      versesByChapter[handler.currentChapter].push(parseInt(number, 10));
    }
  };
  handler.traverse(text.content, visit);
  return versesByChapter;
}
/**
 * Extracts the digits found in a string, one entry per digit character.
 *
 * The pattern matches a single character at a time (ASCII digits or Eastern
 * Arabic digits), so "12" yields two entries. Each match is passed through
 * `normalizeNumber`; falsy results are dropped.
 *
 * @param {string} text - Text to scan.
 * @returns {string[]} Normalized digits in order of appearance.
 */
function extractNumbers(text) {
  const digits = [];
  for (const [glyph] of text.matchAll(/[\d٠-٩]/g)) {
    const normalized = normalizeNumber(glyph);
    if (normalized) {
      digits.push(normalized);
    }
  }
  return digits;
}
/**
 * Flags verses whose target text is empty, or much shorter/longer than the source.
 *
 * Only verses present in the source are examined; a verse missing from the
 * target is treated as empty text. Lengths are measured on trimmed text.
 *
 * @param {object} source - Parsed JSON object of the source text.
 * @param {object} target - Parsed JSON object of the target text.
 * @param {number} threshold - Percentage length difference, relative to the
 *   source, above which a verse is reported (default: 20).
 * @returns {object} `{ check: 'short_long_verses', issues }` where each issue
 *   has type 'empty', 'short' or 'long'.
 */
function detectShortLongVerses(source, target, threshold = 20) {
  const handlerSrc = new _USJHandler.USJHandler(source);
  const handlerTgt = new _USJHandler.USJHandler(target);
  const sourceVerses = handlerSrc.extractVerses();
  const targetVerses = handlerTgt.extractVerses();
  const issues = [];

  Object.keys(sourceVerses).forEach(verseId => {
    const sourceText = sourceVerses[verseId];
    const targetText = targetVerses[verseId] || '';
    const sourceLength = sourceText.trim().length;
    const targetLength = targetText.trim().length;
    const sourceEmpty = sourceLength === 0;
    const targetEmpty = targetLength === 0;

    // Nothing to compare when both sides are empty.
    if (sourceEmpty && targetEmpty) {
      return;
    }

    // Exactly one side is empty: report it, quoting the side that has text.
    if (sourceEmpty || targetEmpty) {
      issues.push({
        type: 'empty',
        verse: verseId,
        source_length: sourceLength,
        target_length: targetLength,
        verse_text: sourceEmpty ? targetText : sourceText,
        difference: null,
        comment: sourceEmpty ? 'Source verse is empty, but target contains text.' : 'Target verse is empty, but source contains text.'
      });
      return;
    }

    // Both sides have text: compare lengths relative to the source.
    const diffPercentage = (targetLength - sourceLength) / sourceLength * 100;
    if (!(Math.abs(diffPercentage) > threshold)) {
      return;
    }
    const isLonger = diffPercentage > 0;
    issues.push({
      type: isLonger ? 'long' : 'short',
      verse: verseId,
      source_length: sourceLength,
      target_length: targetLength,
      verse_text: sourceText,
      difference: `${parseFloat(Math.abs(diffPercentage).toFixed(2))}%`,
      comment: isLonger ? 'Target verse is too long compared to source.' : 'Target verse is too short compared to source.'
    });
  });

  return {
    check: 'short_long_verses',
    issues
  };
}
/**
* Checks for duplicated or out-of-order chapter/verse numbers.
* @param {object} source - Parsed JSON object of the source text.
* @param {object} target - Parsed JSON object of the target text.
* @returns {object} Report of chapter/verse integrity issues.
*/
function checkChapterVerseIntegrity(source, target) {
const handler = new _USJHandler.USJHandler(target);
const issues = [];
// const sourceChapters = extractChapterVerses(source);
const targetChapters = extractChapterVerses(target);
const targetVerses = handler.extractVerses();
function validateIntegrity(chapterVerses, textType, verses) {
const seen = new Set();
let lastChapter = 0;
let lastVerse;
for (let [chapter, versesInChapter] of Object.entries(chapterVerses)) {
lastVerse = 0;
chapter = parseInt(chapter, 10);
if (chapter < lastChapter) {
issues.push({
type: 'out_of_order',
chapter,
comment: `${textType} has out-of-order chapter ${chapter}.`
});
}
lastChapter = chapter;
for (const verse of versesInChapter) {
if (verse < lastVerse) {
issues.push({
type: 'out_of_order',
chapter,
verse,
verse_text: verses[`${chapter}:${verse}`],
comment: `${textType} has out-of-order verse ${verse} in chapter ${chapter}.`
});
}
if (seen.has(`${chapter}:${verse}`)) {
issues.push({
type: 'duplicate',
chapter,
verse,
verse_text: verses[`${chapter}:${verse}`],
comment: `${textType} has duplicate verse ${verse} in chapter ${chapter}.`
});
}
seen.add(`${chapter}:${verse}`);
lastVerse = verse;
}
}
}
// validateIntegrity(sourceChapters, 'Source', sourceVerses);
validateIntegrity(targetChapters, 'Target', targetVerses);
return {
check: 'chapter_verse_integrity',
issues
};
}
/**
 * Reports verses that exist in the source but not in the same chapter of the target.
 *
 * Verse numbers come from `extractChapterVerses`; a chapter absent from the
 * target is treated as having no verses, so every source verse in it is reported.
 *
 * @param {object} source - Parsed JSON object of the source text.
 * @param {object} target - Parsed JSON object of the target text.
 * @returns {object} `{ check: 'missing_verses', issues }`, one issue per missing
 *   verse, carrying the source verse text.
 */
function detectMissingVerses(source, target) {
  const handlerSrc = new _USJHandler.USJHandler(source);
  const sourceChapters = extractChapterVerses(source);
  const targetChapters = extractChapterVerses(target);
  const sourceVerses = handlerSrc.extractVerses();

  const issues = Object.entries(sourceChapters).flatMap(([chapter, verses]) => {
    const presentInTarget = targetChapters[chapter] || [];
    return verses
      .filter(verse => !presentInTarget.includes(verse))
      .map(missingVerse => ({
        type: 'missing',
        chapter: parseInt(chapter, 10),
        verse: missingVerse,
        verse_text: sourceVerses[`${chapter}:${missingVerse}`],
        comment: `Target is missing verse ${missingVerse} in chapter ${chapter}.`
      }));
  });

  return {
    check: 'missing_verses',
    issues
  };
}
/**
 * Detects consecutive repeated words and excessive whitespace in verses.
 *
 * Words are compared case-insensitively after stripping the characters
 * `. , ! ? " ( )`. "Excessive whitespace" means a run of two or more whitespace
 * characters inside the verse text. Nothing is written to the console.
 *
 * @param {object} target - Parsed JSON object of the target text.
 * @returns {object} `{ check: 'repeated_words_and_whitespace', issues }`. Each
 *   issue lists the repeated words with their word indexes (`positions`) and the
 *   character offsets of whitespace runs (`whitespace_positions`).
 */
function detectRepeatedWordsAndWhitespace(target) {
  const handler = new _USJHandler.USJHandler(target);
  const issues = [];
  // NOTE(review): this is the only call site that passes an argument to
  // extractVerses(); kept as-is because USJHandler's signature is not visible here.
  const targetVerses = handler.extractVerses(target);

  for (const [key, text] of Object.entries(targetVerses)) {
    // Normalize each word once so adjacent words can be compared directly.
    const words = text.split(/\s+/).map(word => word.toLowerCase().replace(/[.,!?"()]/g, ''));
    const consecutiveRepeats = [];
    const repeatPositions = [];
    for (let i = 0; i < words.length - 1; i++) {
      // Empty strings (from leading whitespace or punctuation-only tokens) never count.
      if (words[i] && words[i] === words[i + 1]) {
        consecutiveRepeats.push(words[i]);
        repeatPositions.push(i);
      }
    }

    // One scan yields both the offsets and the yes/no flag.
    const whitespacePositions = [...text.matchAll(/\s{2,}/g)].map(match => match.index);
    const excessiveWhitespace = whitespacePositions.length > 0;

    if (consecutiveRepeats.length > 0 || excessiveWhitespace) {
      issues.push({
        verse: key,
        repeated_words: consecutiveRepeats,
        positions: repeatPositions,
        whitespace_positions: whitespacePositions,
        whitespace_issue: excessiveWhitespace,
        comment: consecutiveRepeats.length > 0 ? `Consecutive repeated words: ${[...new Set(consecutiveRepeats)].join(', ')}` : "Excessive whitespace detected"
      });
    }
  }

  return {
    check: 'repeated_words_and_whitespace',
    issues
  };
}
/**
 * Detects unmatched punctuation pairs across verses (e.g., parentheses, brackets).
 *
 * A single stack is shared across all verses, so a pair may open in one verse
 * and close in a later one without being reported. Pairs whose opening and
 * closing characters are identical (e.g. straight quotes, if supplied via
 * `pair_punctuation_list`) are tracked with an open/closed toggle per character.
 *
 * Openers still on the stack at the end are reported once per character, using
 * the verse in which that opener actually occurred.
 *
 * @param {object} target - Parsed JSON object of the target text.
 * @param {object|null} pair_punctuation_list - Optional custom map of opening
 *   character to closing character; replaces the defaults `()`, `[]`, `{}`, `«»`.
 * @returns {object} `{ check: 'unmatched_punctuation', issues }`.
 */
function detectUnmatchedPunctuation(target, pair_punctuation_list = null) {
  const handler = new _USJHandler.USJHandler(target);
  const issues = [];
  const targetVerses = handler.extractVerses();

  const pairs = pair_punctuation_list !== null ? pair_punctuation_list : {
    '(': ')',
    '[': ']',
    '{': '}',
    '«': '»'
  };
  // Built once so the per-character closing check is a constant-time lookup.
  const closers = new Set(Object.values(pairs));

  const stack = []; // Open punctuation, shared across verses: { char, verse }
  const toggles = {}; // For symmetric punctuation: true while "inside"
  for (const char of Object.keys(pairs)) {
    if (pairs[char] === char) {
      toggles[char] = false;
    }
  }

  const reportUnmatchedClosing = (verse, char) => {
    issues.push({
      verse,
      unmatched_punctuation: char,
      comment: `Unmatched closing punctuation: ${char}`
    });
  };

  for (const [key, text] of Object.entries(targetVerses)) {
    for (const char of text) {
      if (pairs[char]) {
        if (pairs[char] === char) {
          // Symmetric punctuation: every occurrence flips between opening and closing.
          toggles[char] = !toggles[char];
          if (toggles[char]) {
            stack.push({ char, verse: key });
          } else {
            const last = stack.pop();
            if (!last || last.char !== char) {
              reportUnmatchedClosing(key, char);
            }
          }
        } else {
          // Asymmetric opener such as "(" or "[".
          stack.push({ char, verse: key });
        }
      } else if (closers.has(char)) {
        // Asymmetric closer: it must match the most recent opener.
        const last = stack.pop();
        if (!last || pairs[last.char] !== char) {
          reportUnmatchedClosing(key, char);
        }
      }
    }
  }

  // Whatever is left on the stack was never closed. Report each character once,
  // attributed to the verse recorded on its own stack entry.
  const reported = new Set();
  while (stack.length > 0) {
    const unmatched = stack.pop();
    if (reported.has(unmatched.char)) {
      continue;
    }
    reported.add(unmatched.char);
    issues.push({
      verse: unmatched.verse,
      unmatched_punctuation: unmatched.char,
      comment: `Unmatched opening punctuation: ${unmatched.char}`
    });
  }

  return {
    check: 'unmatched_punctuation',
    issues
  };
}
/**
 * Detects number mismatches between source and target verses.
 *
 * For every verse present in the source, the distinct numbers (runs of ASCII
 * digits delimited by word boundaries) are compared with those of the same
 * verse in the target; a verse missing from the target counts as empty text.
 * Note that `\d` only matches ASCII digits, so numerals written in other
 * scripts are not compared by this check.
 *
 * @param {object} source - Parsed JSON object of the source text.
 * @param {object} target - Parsed JSON object of the target text.
 * @returns {object} `{ check: 'numbers_check::mismatches', issues }`. Each issue
 *   lists `missing_numbers` (in source only, with their first offset in the
 *   source text) and `extra_numbers` (in target only, with their first offset in
 *   the target text). Offsets are 0-based, so 0 means "at the start of the verse".
 */
function detectNumberMismatches(source, target) {
  const handlerSrc = new _USJHandler.USJHandler(source);
  const handlerTgt = new _USJHandler.USJHandler(target);
  const issues = [];
  const sourceVerses = handlerSrc.extractVerses();
  const targetVerses = handlerTgt.extractVerses();
  const numberRegex = /\b\d+\b/g;

  // Maps each distinct number in `text` to the offset of its first occurrence.
  const firstPositions = text => {
    const positions = new Map();
    for (const match of text.matchAll(numberRegex)) {
      if (!positions.has(match[0])) {
        positions.set(match[0], match.index);
      }
    }
    return positions;
  };

  // `??` (not `||`) so that a valid offset of 0 is not replaced by -1.
  const withPosition = (numbers, positions) => numbers.map(num => ({
    number: num,
    position: positions.get(num) ?? -1
  }));

  for (const [verseKey, sourceText] of Object.entries(sourceVerses)) {
    const targetText = targetVerses[verseKey] || '';
    const sourcePositions = firstPositions(sourceText);
    const targetPositions = firstPositions(targetText);
    const missingNumbers = [...sourcePositions.keys()].filter(num => !targetPositions.has(num));
    const extraNumbers = [...targetPositions.keys()].filter(num => !sourcePositions.has(num));

    if (missingNumbers.length > 0 || extraNumbers.length > 0) {
      issues.push({
        verse: verseKey,
        verse_text: sourceText,
        missing_numbers: withPosition(missingNumbers, sourcePositions),
        extra_numbers: withPosition(extraNumbers, targetPositions),
        comment: `Number mismatches detected. Missing: [${missingNumbers.join(', ')}], Extra: [${extraNumbers.join(', ')}]`
      });
    }
  }

  return {
    check: 'numbers_check::mismatches',
    issues
  };
}
/**
 * Verifies that text quoted in footnotes actually occurs in the verse the footnote refers to.
 *
 * Footnotes and their quoted fragments are obtained from `USJHandler`
 * (`extractFootnotes`, `extractQuotedText`). A quote is unmatched when the
 * referenced verse has no text or does not contain the quote as a substring.
 *
 * @param {object} target - Parsed USJ JSON object of the target text.
 * @returns {object} `{ check: 'footnote::quotation_mismatch', issues }`, one issue
 *   per footnote that has at least one unmatched quote.
 */
function detectFootnoteQuotes(target) {
  const handler = new _USJHandler.USJHandler(target);
  const verses = handler.extractVerses(); // { "chapter:verse": "text content" }
  const footnotes = handler.extractFootnotes();
  const issues = [];

  for (const footnote of footnotes) {
    const { content, reference } = footnote;
    const quotedTexts = handler.extractQuotedText(content);
    const verseContent = verses[reference];
    const unmatchedQuotes = [...quotedTexts].filter(quote => !(verseContent && verseContent.includes(quote)));

    if (unmatchedQuotes.length === 0) {
      continue;
    }
    issues.push({
      verse: reference,
      unmatched_quotes: unmatchedQuotes,
      comment: `Quoted text not found in the verse (${reference}): ${unmatchedQuotes.join(', ')}`
    });
  }

  return {
    check: 'footnote::quotation_mismatch',
    issues
  };
}