twl-linker
Version:
Biblical Semantic Linker - Uses the biblical context database to create semantic links between USFM Bible text and biblical articles with confidence scoring.
966 lines (826 loc) • 33.1 kB
JavaScript
/**
* Biblical Semantic Linker
* ========================
*
* Uses the biblical context database to create semantic links between
* USFM Bible text and biblical articles with confidence scoring.
*/
// Core imports that work in both browser and Node.js
import { removeAlignments } from './usfm-alignment-remover.js';
// Import the biblical context database
import contextDatabase from './biblical_context_database.json' with { type: 'json' };
// Environment flag used to gate the CLI-only helpers below.
// Truthy (the Node.js version string) when running under Node.js; `false` when
// `process` is not defined (e.g. in a browser), or `undefined` when `process`
// exists but has no `versions` object.
const isNode = typeof process !== 'undefined' && process.versions?.node;
/**
 * Generate a 4-character lowercase hexadecimal ID whose first character is a letter (a-f).
 *
 * The ID is drawn from Math.random(): it is neither guaranteed unique nor
 * cryptographically secure, so callers that need uniqueness must check for
 * collisions themselves.
 *
 * @returns {string} An ID matching /^[a-f][0-9a-f]{3}$/.
 */
function generateId() {
  // 10..15 rendered in base 16 is exactly 'a'..'f'.
  const firstChar = (10 + Math.floor(Math.random() * 6)).toString(16);
  // Draw the remaining digits uniformly from 0x000..0xfff. Slicing the hex
  // expansion of a random fraction (the previous approach) relied on the
  // deprecated substr() and padded short expansions with trailing zeros.
  const remainingChars = Math.floor(Math.random() * 0x1000).toString(16).padStart(3, '0');
  return firstChar + remainingChars;
}
/**
 * Translate an internal article category into the value written to the TSV "Tags" column.
 *
 * Known categories: 'kt' -> 'keyterm', 'names' -> 'name', 'other' -> '' (empty tag).
 * Any other value is passed through unchanged.
 *
 * @param {string} category - Internal category name.
 * @returns {string} The tag text for the TSV output.
 */
function mapCategoryToTags(category) {
  const tagsByCategory = {
    kt: 'keyterm',
    other: '',
    names: 'name',
  };
  // Own-property check so inherited keys such as 'toString' are passed through.
  if (Object.hasOwn(tagsByCategory, category)) {
    return tagsByCategory[category];
  }
  return category;
}
/**
 * Split USFM content into one record per verse.
 *
 * Processing is line based:
 *  - blank lines and lines starting with \id, \h, \toc or \mt are skipped;
 *  - a "\c N" line starts chapter N and resets the verse counter to 1;
 *  - a "\v N text" line starts verse N with the rest of the line as its text;
 *  - lines that do not start with a backslash are appended to the current verse;
 *  - every other marker line is ignored together with any text on it.
 *
 * NOTE(review): text that appears after "\c" but before the first "\v" is
 * accumulated under verse 1 of that chapter.
 *
 * @param {string} usfmContent - Raw USFM text.
 * @returns {Array<{reference: string, text: string, chapter: number, verse: number}>}
 *   Verses in document order; `text` has been passed through cleanText().
 */
function parseUSFM(usfmContent) {
  const skippedMarkers = ['\\id', '\\h', '\\toc', '\\mt'];
  const verses = [];
  let chapter = 1;
  let verse = 1;
  let pending = '';

  // Emit the verse collected so far, if it has any non-blank text.
  const flushPending = () => {
    const body = pending.trim();
    if (!body) {
      return;
    }
    verses.push({
      reference: `${chapter}:${verse}`,
      text: cleanText(body),
      chapter,
      verse,
    });
  };

  for (const rawLine of usfmContent.split('\n')) {
    const line = rawLine.trim();
    if (!line || skippedMarkers.some((marker) => line.startsWith(marker))) {
      continue;
    }

    const chapterHit = /^\\c\s+(\d+)/.exec(line);
    if (chapterHit) {
      flushPending();
      chapter = Number.parseInt(chapterHit[1], 10);
      verse = 1;
      pending = '';
      continue;
    }

    const verseHit = /^\\v\s+(\d+)(.*)$/.exec(line);
    if (verseHit) {
      flushPending();
      verse = Number.parseInt(verseHit[1], 10);
      pending = verseHit[2] || '';
      continue;
    }

    // Continuation text for the verse currently being collected.
    if (!line.startsWith('\\')) {
      pending += ' ' + line;
    }
  }

  flushPending();
  return verses;
}
/**
 * Strip USFM notes and markers from verse text and collapse whitespace.
 *
 * Steps, in order:
 *  1. Remove whole notes: footnotes (\f ... \f*), endnotes (\fe ... \fe*) and
 *     cross references (\x ... \x*), including the note-internal markers such
 *     as \fr or \ft that sit between the opening and closing marker.
 *  2. Remove every remaining marker, including nested (\+nd), numbered (\q2)
 *     and closing (\nd*) forms; the text between such markers is kept.
 *  3. Collapse runs of whitespace to one space and trim.
 *
 * @param {string} text - Verse text that may contain USFM markup.
 * @returns {string} Plain text.
 */
function cleanText(text) {
  return text
    // Lazy match up to the first matching closer. A character class that
    // excludes backslashes would stop at the first inner marker and leave the
    // note body in the verse text.
    .replace(/\\(fe?|x)\s[\s\S]*?\\\1\*/g, '')
    // Include the optional '+' prefix and trailing level digits so that
    // "\q2" does not leave a stray "2" behind.
    .replace(/\\\+?[a-z]+\d*\*?/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}
/**
 * Produce the canonical form used for term matching: lower-cased, every
 * character that is neither a word character nor whitespace replaced by a
 * space, whitespace runs collapsed, and the result trimmed.
 *
 * Note: \w is ASCII-only, so accented letters are treated like punctuation,
 * and an apostrophe splits a word in two ("God's" -> "god s").
 *
 * @param {string} text - Text to normalize.
 * @returns {string} Normalized text with single spaces between words.
 */
function normalizeForMatching(text) {
  const lowered = text.toLowerCase();
  const punctuationAsSpaces = lowered.replace(/[^\w\s]/g, ' ');
  const singleSpaced = punctuationAsSpaces.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
/**
 * Build a case-insensitive, global regular expression that matches `term`
 * only as a whole word (delimited by \b on both sides).
 *
 * Regex metacharacters in `term` are escaped, so the term is matched literally.
 * The returned regex carries the `g` flag, so it keeps `lastIndex` state when
 * used with test()/exec().
 *
 * @param {string} term - Literal term to search for.
 * @returns {RegExp} Matcher for whole-word occurrences of the term.
 */
function createWordBoundaryRegex(term) {
  const literalTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const pattern = ['\\b', literalTerm, '\\b'].join('');
  return new RegExp(pattern, 'gi');
}
/**
 * Collect every searchable term of an article together with its match type
 * and base confidence.
 *
 * Three optional arrays on the article are consulted, in this order:
 *  - cleaned_terms           -> type 'exact',         confidence 0.95, min length 3
 *  - morphological_variants  -> type 'morphological', confidence 0.85, min length 3
 *  - theological_variants    -> type 'theological',   confidence 0.75, min length 4
 * Terms shorter than the minimum length are dropped to limit false matches.
 *
 * @param {object} article - Article entry from the context database.
 * @returns {Array<{text: string, original: string, type: string, confidence: number}>}
 *   `text` is the lower-cased term, `original` the term as stored.
 */
function getSearchTerms(article) {
  const sources = [
    { list: article.cleaned_terms, type: 'exact', confidence: 0.95, minLength: 3 },
    { list: article.morphological_variants, type: 'morphological', confidence: 0.85, minLength: 3 },
    { list: article.theological_variants, type: 'theological', confidence: 0.75, minLength: 4 },
  ];
  const terms = [];
  for (const { list, type, confidence, minLength } of sources) {
    // A missing list simply contributes nothing.
    list?.forEach((original) => {
      if (original.length < minLength) {
        return;
      }
      terms.push({ text: original.toLowerCase(), original, type, confidence });
    });
  }
  return terms;
}
/**
 * Calculate confidence score based on context
 */
function calculateContextualConfidence(verseText, article, baseConfidence) {
  // Adjusts `baseConfidence` by how many words of the article's context
  // phrases occur in the verse:
  //   +0.10 for each word of a `positive_contexts` phrase found in the verse,
  //   -0.15 for each word of a `negative_contexts` phrase found in the verse.
  // Both the verse and the phrases are compared in normalized form, word by
  // word. The result is clamped to the range [0.1, 1.0].
  const verseWords = new Set(normalizeForMatching(verseText).split(/\s+/));

  // Sum `weight` once for every context word that is present in the verse.
  const tally = (contexts, weight) => {
    let total = 0;
    if (!contexts) {
      return total;
    }
    for (const phrase of contexts) {
      for (const word of normalizeForMatching(phrase).split(/\s+/)) {
        if (verseWords.has(word)) {
          total += weight;
        }
      }
    }
    return total;
  };

  const positiveScore = tally(article.positive_contexts, 0.1);
  const negativeScore = tally(article.negative_contexts, 0.15);
  const adjusted = baseConfidence + positiveScore - negativeScore;
  return Math.max(0.1, Math.min(1.0, adjusted));
}
/**
 * Render the verse with square brackets around one specific match.
 *
 * `matchStart`/`matchEnd` are character offsets into `text` as passed in
 * (end exclusive). The bracketed verse is built from `text` itself and only
 * trimmed afterwards, so the brackets stay on the requested span even when
 * `text` carries leading whitespace.
 *
 * @param {string} text - Verse text the offsets refer to.
 * @param {number} matchStart - Start offset of the match in `text`.
 * @param {number} matchEnd - End offset (exclusive) of the match in `text`.
 * @param {number} [contextWords=2] - Accepted for backward compatibility; not
 *   used, because the whole verse is always returned as context.
 * @returns {{before: string, match: string, after: string, full: string}}
 *   `match` is the matched substring and `full` the bracketed verse; `before`
 *   and `after` are always empty. All fields are empty for invalid offsets or
 *   empty text.
 */
function getWordContext(text, matchStart, matchEnd, contextWords = 2) {
  if (!text || matchStart < 0 || matchEnd <= matchStart || matchEnd > text.length) {
    return {
      before: '',
      match: '',
      after: '',
      full: ''
    };
  }
  const matchText = text.substring(matchStart, matchEnd);
  // Slice the same string the offsets were computed against; slicing a
  // trimmed copy would shift the brackets by the amount of leading whitespace.
  const beforeMatch = text.substring(0, matchStart);
  const afterMatch = text.substring(matchEnd);
  const contextWithBrackets = `${beforeMatch}[${matchText}]${afterMatch}`.trim();
  return {
    before: '', // Not applicable for whole-verse context
    match: matchText,
    after: '', // Not applicable for whole-verse context
    full: contextWithBrackets
  };
}
/**
 * Determine best article for ambiguous terms using context analysis
 */
function disambiguateAmbiguousTerm(term, context, possibleArticles, contextDatabase) {
  // Scores every candidate article for an ambiguous term and returns the
  // best one as { filename, score, article }, or null when none of the
  // candidates exists in contextDatabase.articles. Ties go to the candidate
  // listed first in `possibleArticles`.
  //
  // A candidate's score is built from:
  //  - hand-written cue rules for the terms 'god' and 'call'/'called'
  //    (substring checks on the lower-cased context, plus a few regexes);
  //  - +0.1 for every word of the article's definition_summary longer than
  //    three characters that appears as a whitespace-separated context token.

  // Cue rules for "God" versus "false god". `has(s)` tests whether the
  // lower-cased context contains the substring s.
  const scoreGodSense = (filename, has) => {
    let subtotal = 0;
    if (filename === 'god.md') {
      const trueGodCues = [
        'lord', 'father', 'creator', 'heaven', 'blessed', 'almighty', 'create',
        'made', 'word', 'glory', 'spirit', 'holy', 'jesus', 'christ',
      ];
      if (trueGodCues.some(has)) {
        subtotal += 0.8;
      }
      // Default preference for "god.md" in most contexts
      subtotal += 0.6;
      const idolatryCues = ['idol', 'false', 'carved', 'image'];
      const looksIdolatrous = idolatryCues.some(has) ||
        (has('worship') && has('not')) ||
        (has('golden') && has('calf'));
      if (looksIdolatrous) {
        subtotal -= 0.7;
      }
    } else if (filename === 'falsegod.md') {
      const idolCues = [
        'idol', 'false', 'carved', 'image', 'golden', 'baal', 'asherah', 'molech', 'dagon',
      ];
      if (idolCues.some(has) || (has('worship') && has('not'))) {
        subtotal += 0.8;
      }
      // Indicators of the true God count against the false-god reading
      const trueGodCues = ['lord', 'father', 'creator', 'blessed', 'jesus', 'christ'];
      if (trueGodCues.some(has)) {
        subtotal -= 0.5;
      }
    }
    return subtotal;
  };

  // Cue rules for the three senses of "call": naming, summoning, speaking loudly.
  const scoreCallSense = (filename, has, rawContext, loweredContext) => {
    let subtotal = 0;
    if (filename === 'call-toname.md') {
      // Naming context: "called Paul", "called by name", designations/roles
      const namingCues = [
        'name', 'by name', 'this place', 'bethel', 'jerusalem',
        'apostle', 'saints', 'called ones', 'prophet', 'priest',
      ];
      const looksLikeNaming = namingCues.some(has) ||
        /\bcalled\s+[A-Z][a-z]+/.test(rawContext) ||
        /\b[A-Z][a-z]+\s+called/.test(rawContext) ||
        /called\s+(to\s+be\s+)?(an?\s+)?(apostle|saint|prophet|priest|servant)/.test(loweredContext);
      if (looksLikeNaming) {
        subtotal += 0.8;
      }
      if (['out', 'forth', 'come'].some(has)) {
        subtotal -= 0.3;
      }
    } else if (filename === 'call-tosummon.md') {
      // Summoning context: "called to him", "called forth", "called disciples"
      const summonCues = ['to him', 'to her', 'forth', 'come', 'disciple'];
      const looksLikeSummons = summonCues.some(has) ||
        (has('to be') && !has('apostle') && !has('saint')) ||
        /called\s+\w+\s+to\s+/.test(loweredContext);
      if (looksLikeSummons) {
        subtotal += 0.8;
      }
      if (['name', 'place', 'apostle', 'saints'].some(has)) {
        subtotal -= 0.3;
      }
    } else if (filename === 'call-speakloudly.md') {
      // Loud-speaking context: "called out", "cried out", "voice"
      if (['out', 'cry', 'voice', 'shout', 'loud', 'help'].some(has)) {
        subtotal += 0.8;
      }
      if (['name', 'forth', 'to be', 'apostle'].some(has)) {
        subtotal -= 0.3;
      }
    }
    return subtotal;
  };

  let best = null;
  for (const filename of possibleArticles) {
    const article = contextDatabase.articles[filename];
    if (!article) {
      continue;
    }
    const loweredContext = context.toLowerCase();
    const has = (needle) => loweredContext.includes(needle);
    const definition = article.definition_summary?.toLowerCase() || '';

    let score = 0;
    if (term === 'god') {
      score = scoreGodSense(filename, has);
    } else if (term === 'call' || term === 'called') {
      score = scoreCallSense(filename, has, context, loweredContext);
    }

    // Generic context matching based on definition keywords
    const contextTokens = loweredContext.split(/\s+/);
    for (const definitionWord of definition.split(/\s+/)) {
      if (definitionWord.length > 3 && contextTokens.includes(definitionWord)) {
        score += 0.1;
      }
    }

    // Strictly-greater keeps the earliest candidate on ties.
    if (best === null || score > best.score) {
      best = { filename, score, article };
    }
  }
  return best;
}
/**
 * Find every article term that occurs in a verse and turn each occurrence
 * into a link record.
 *
 * Processing order:
 *  1. Build one entry per distinct term text across all articles, preferring
 *     exact terms over morphological over theological ones.
 *  2. Match the non-ambiguous terms against the normalized verse with
 *     whole-word regexes. A match is kept when its adjusted confidence is at
 *     least 0.5 and its range does not overlap an earlier kept match.
 *  3. Match the terms listed in `contextDatabase.ambiguous_terms`. Each hit is
 *     disambiguated automatically when the best candidate scores above 0.3;
 *     otherwise the first candidate is used and the record is flagged for
 *     manual review with the list of options.
 *
 * Overlap tracking uses offsets in the normalized verse; the `startPos` /
 * `endPos` fields of the returned records are offsets in the original text.
 *
 * @param {{reference: string, text: string}} verse - Verse to scan.
 * @param {{articles: object, ambiguous_terms?: object}} contextDatabase -
 *   Article database; `ambiguous_terms` maps a term to candidate filenames.
 * @returns {Array<object>} Link records after deduplicateMatches().
 */
function findMatches(verse, contextDatabase) {
  const matches = [];
  const originalText = verse.text;
  const normalizedVerse = normalizeForMatching(verse.text);
  // Ranges (normalized-text offsets) already claimed by a match
  const matchedRanges = [];
  // Loop-invariant lookups, built once per verse instead of once per match
  const commonWords = new Set(['the', 'and', 'of', 'to', 'for', 'in', 'on', 'at', 'by', 'with']);
  const priorityByType = new Map([['exact', 1], ['morphological', 2], ['theological', 3]]);

  // Single place that shapes a link record, shared by all three match kinds.
  const buildMatch = (article, filename, located, confidence, matchType, context, disambiguation) => ({
    reference: verse.reference,
    id: generateId(),
    tags: mapCategoryToTags(article.category),
    origWords: located.text,
    occurrence: 1,
    twLink: `rc://*/tw/dict/bible/${article.category}/${filename.replace('.md', '')}`,
    confidence: Math.min(1.0, confidence).toFixed(3),
    matchType,
    context: context.full,
    disambiguation,
    startPos: located.originalStart,
    endPos: located.originalEnd
  });

  // term text -> { priority, filename, searchTerm }; lowest priority number wins
  const termPriorityMap = new Map();
  for (const [filename, article] of Object.entries(contextDatabase.articles)) {
    for (const searchTerm of getSearchTerms(article)) {
      const termKey = searchTerm.text;
      const priority = priorityByType.get(searchTerm.type) ?? 3;
      if (!termPriorityMap.has(termKey) || termPriorityMap.get(termKey).priority > priority) {
        termPriorityMap.set(termKey, { priority, filename, searchTerm });
      }
    }
  }

  // Non-ambiguous terms
  for (const [termKey, { filename, searchTerm }] of termPriorityMap) {
    // Ambiguous terms are handled separately below
    const isAmbiguous = contextDatabase.ambiguous_terms &&
      contextDatabase.ambiguous_terms[termKey];
    if (isAmbiguous) continue;
    const article = contextDatabase.articles[filename];
    if (!article) continue;

    const regex = createWordBoundaryRegex(searchTerm.text);
    for (const match of normalizedVerse.matchAll(regex)) {
      const startPos = match.index;
      const endPos = startPos + match[0].length;
      if (hasRangeOverlap(startPos, endPos, matchedRanges)) {
        continue;
      }
      const located = extractOriginalMatch(originalText, normalizedVerse, startPos, endPos);
      if (!located || !located.text) {
        continue;
      }
      const context = getWordContext(originalText, located.originalStart, located.originalEnd);
      let adjustedConfidence = calculateContextualConfidence(
        verse.text, article, searchTerm.confidence
      );
      // Boost confidence for longer terms
      if (searchTerm.text.length >= 6) {
        adjustedConfidence += 0.05;
      }
      // Reduce confidence for very common words
      if (commonWords.has(searchTerm.text)) {
        adjustedConfidence -= 0.3;
      }
      // Only include matches with reasonable confidence
      if (adjustedConfidence >= 0.5) {
        matches.push(buildMatch(
          article, filename, located, adjustedConfidence, searchTerm.type, context, 'single'
        ));
        matchedRanges.push({ start: startPos, end: endPos });
      }
    }
  }

  // Ambiguous terms
  if (contextDatabase.ambiguous_terms) {
    for (const [ambiguousTerm, possibleArticles] of Object.entries(contextDatabase.ambiguous_terms)) {
      const regex = createWordBoundaryRegex(ambiguousTerm);
      for (const match of normalizedVerse.matchAll(regex)) {
        const startPos = match.index;
        const endPos = startPos + match[0].length;
        if (hasRangeOverlap(startPos, endPos, matchedRanges)) {
          continue;
        }
        const located = extractOriginalMatch(originalText, normalizedVerse, startPos, endPos);
        if (!located || !located.text) {
          continue;
        }
        const context = getWordContext(originalText, located.originalStart, located.originalEnd, 4);
        const bestMatch = disambiguateAmbiguousTerm(ambiguousTerm, context.full, possibleArticles, contextDatabase);
        if (bestMatch && bestMatch.score > 0.3) {
          // Found a clear best match
          const confidence = calculateContextualConfidence(
            verse.text, bestMatch.article, 0.8 + bestMatch.score * 0.2
          );
          matches.push(buildMatch(
            bestMatch.article, bestMatch.filename, located, confidence,
            'disambiguated', context, `auto:${bestMatch.score.toFixed(2)}`
          ));
        } else {
          // No clear best match found, pick the first (most likely) option
          const filename = possibleArticles[0];
          const article = contextDatabase.articles[filename];
          if (article) {
            const confidence = calculateContextualConfidence(
              verse.text, article, 0.55 // Default confidence for ambiguous terms
            );
            // List the options for manual review. Candidates that are not in
            // the database are left out (they used to crash this step); the
            // numbers still refer to positions in the candidate list.
            const optionsList = possibleArticles
              .map((articleFile, index) => {
                const articleData = contextDatabase.articles[articleFile];
                if (!articleData) {
                  return null;
                }
                return `${index + 1}:${articleData.category}/${articleFile.replace('.md', '')}`;
              })
              .filter((option) => option !== null)
              .join(', ');
            matches.push(buildMatch(
              article, filename, located, confidence,
              'ambiguous', context, `manual:option1 (${optionsList})`
            ));
          }
        }
        // Claim the range even when no record was produced
        matchedRanges.push({ start: startPos, end: endPos });
      }
    }
  }

  // Order by position, drop repeated positions and number the occurrences
  return deduplicateMatches(matches);
}
/**
 * Find the original text that corresponds to a normalized phrase
 */
function findOriginalText(originalText, normalizedPhrase) {
  // Locates the first run of words in `originalText` whose normalized form
  // equals `normalizedPhrase` and returns { text, positions }: the original
  // words joined by single spaces and their word indices. Returns null when
  // the phrase does not occur.
  //
  // NOTE(review): word indices found in the normalized text are applied
  // directly to the whitespace-split original, which presumes both have the
  // same number of words — confirm for text where normalization splits a
  // word (e.g. on an apostrophe).
  const sourceTokens = originalText.split(/\s+/);
  const normalizedTokens = normalizeForMatching(originalText).split(/\s+/);
  const needle = normalizedPhrase.split(/\s+/);
  const lastStart = normalizedTokens.length - needle.length;

  for (let start = 0; start <= lastStart; start += 1) {
    const aligned = needle.every((word, offset) => normalizedTokens[start + offset] === word);
    if (!aligned) {
      continue;
    }
    return {
      text: sourceTokens.slice(start, start + needle.length).join(' '),
      positions: needle.map((_, offset) => start + offset)
    };
  }
  return null;
}
/**
 * Return the whitespace-delimited word at a zero-based index of the text.
 *
 * @param {string} originalText - Text to split on whitespace.
 * @param {number} position - Zero-based word index.
 * @returns {string} The word, or '' when the index is out of range (or the
 *   word at that index is empty, e.g. index 0 of text with leading whitespace).
 */
function getOriginalWord(originalText, position) {
  const word = originalText.split(/\s+/)[position];
  return word ? word : '';
}
/**
 * Tell whether any position of a list is already present in a set.
 *
 * @param {Array<number>} positions1 - Positions to test.
 * @param {Set<number>} positions2 - Positions already taken (must support has()).
 * @returns {boolean} True when at least one entry of `positions1` is in `positions2`.
 */
function hasOverlap(positions1, positions2) {
  const isTaken = (position) => positions2.has(position);
  return positions1.some(isTaken);
}
/**
 * Strip surrounding punctuation from a single word.
 *
 * Leading characters that are neither word characters nor hyphens are always
 * removed. Trailing ones are removed too, unless the word looks like a
 * possessive: it ends in "s", optionally followed by one apostrophe — either
 * the straight (') or the typographic (U+2019) form. In that case the ending
 * is kept as is ("Jesus'" stays "Jesus'"). Hyphens and inner apostrophes are
 * never touched.
 *
 * A closing single quote directly after a word ending in "s" is
 * indistinguishable from a possessive and is therefore kept as well.
 *
 * @param {string} word - One whitespace-delimited word.
 * @returns {string} The word without surrounding punctuation.
 */
function cleanPunctuation(word) {
  const leadingPunctuation = /^[^\w\-]+/;
  // The typographic apostrophe is accepted alongside the straight one so that
  // a trailing U+2019 is treated the same way as a trailing '.
  const looksPossessive = /^.+?s['\u2019]?$/.test(word);
  if (looksPossessive) {
    return word.replace(leadingPunctuation, '');
  }
  // Keep internal hyphens/dashes for compound words
  return word.replace(leadingPunctuation, '').replace(/[^\w\-]+$/, '');
}
/**
 * Map a character range of the normalized verse back to the original verse.
 *
 * The range [startPos, endPos) must start and end inside words of
 * `normalizedText`. Those words are traced back to the whitespace-delimited
 * words of `originalText` they were produced from. Normalization can turn one
 * original word into several normalized words ("God's" -> "god s") or into
 * none (a standalone dash), so the mapping is built by normalizing every
 * original word on its own rather than assuming the two word lists line up
 * index by index. If the two texts do not correspond at all, the index-by-index
 * mapping is used as a fallback.
 *
 * @param {string} originalText - Verse text as displayed.
 * @param {string} normalizedText - normalizeForMatching(originalText).
 * @param {number} startPos - Start offset of the match in `normalizedText`.
 * @param {number} endPos - End offset (exclusive) of the match in `normalizedText`.
 * @returns {{text: string, originalStart: number, originalEnd: number} | null}
 *   `text` is the matched original word(s) with surrounding punctuation
 *   removed; `originalStart`/`originalEnd` delimit the complete original
 *   word(s), punctuation included, in `originalText`. Null for invalid input
 *   or when the range cannot be mapped.
 */
function extractOriginalMatch(originalText, normalizedText, startPos, endPos) {
  if (!originalText || !normalizedText || startPos < 0 || endPos <= startPos) {
    return null;
  }

  // Each entry carries the word in [0] and its true offset in .index, which
  // stays correct regardless of how the words are separated.
  const originalSpans = [...originalText.matchAll(/\S+/g)];
  const normalizedSpans = [...normalizedText.matchAll(/\S+/g)];
  if (normalizedSpans.length === 0 || originalSpans.length === 0) {
    return null;
  }

  // Index of the normalized word that contains a given character offset.
  const tokenIndexAt = (pos) => normalizedSpans.findIndex(
    (span) => pos >= span.index && pos < span.index + span[0].length
  );
  const startTokenIndex = tokenIndexAt(startPos);
  const endTokenIndex = tokenIndexAt(endPos - 1); // endPos is exclusive
  if (startTokenIndex === -1 || endTokenIndex === -1 || startTokenIndex > endTokenIndex) {
    return null;
  }

  // tokenOwner[k] = index of the original word that produced normalized word k.
  const tokenOwner = [];
  originalSpans.forEach((span, wordIndex) => {
    const tokenCount = normalizeForMatching(span[0])
      .split(/\s+/)
      .filter((token) => token.length > 0)
      .length;
    for (let k = 0; k < tokenCount; k += 1) {
      tokenOwner.push(wordIndex);
    }
  });
  const isAligned = tokenOwner.length === normalizedSpans.length;
  const startWordIndex = isAligned ? tokenOwner[startTokenIndex] : startTokenIndex;
  const endWordIndex = isAligned ? tokenOwner[endTokenIndex] : endTokenIndex;
  if (endWordIndex >= originalSpans.length) {
    return null;
  }

  const matchedSpans = originalSpans.slice(startWordIndex, endWordIndex + 1);
  const lastSpan = originalSpans[endWordIndex];
  return {
    text: matchedSpans.map((span) => cleanPunctuation(span[0])).join(' '),
    originalStart: originalSpans[startWordIndex].index,
    originalEnd: lastSpan.index + lastSpan[0].length
  };
}
/**
 * Check if a character range overlaps with existing ranges
 */
function hasRangeOverlap(start, end, existingRanges) {
  // True when the range start..end shares at least one position with any
  // { start, end } entry of `existingRanges`. Ranges that merely touch (one
  // ends exactly where the other starts) do not count as overlapping.
  const intersects = ({ start: otherStart, end: otherEnd }) =>
    start < otherEnd && end > otherStart;
  return existingRanges.some(intersects);
}
/**
 * Order matches by position, drop repeated positions and number the occurrences.
 *
 * The input array is sorted in place by chapter, verse and start position.
 * Matches that share reference, original words, link and start position are
 * considered the same; the first one in sorted order is kept (confidence is
 * not compared). Every kept match gets `occurrence` set to its 1-based count
 * among the kept matches with the same reference, original words and link.
 *
 * @param {Array<object>} matches - Link records; sorted in place, and the
 *   kept records have their `occurrence` field overwritten.
 * @returns {Array<object>} The kept records in sorted order.
 */
function deduplicateMatches(matches) {
  const byPosition = (a, b) => {
    const [aChap, aVerse] = a.reference.split(':').map(Number);
    const [bChap, bVerse] = b.reference.split(':').map(Number);
    if (aChap !== bChap) return aChap - bChap;
    if (aVerse !== bVerse) return aVerse - bVerse;
    return a.startPos - b.startPos;
  };
  matches.sort(byPosition);

  const seenPositions = new Set();
  // Running count per term, so numbering does not rescan the result list.
  const occurrenceCounts = new Map();
  const result = [];
  for (const match of matches) {
    const termKey = `${match.reference}-${match.origWords}-${match.twLink}`;
    const positionKey = `${termKey}-${match.startPos}`;
    if (seenPositions.has(positionKey)) {
      continue;
    }
    seenPositions.add(positionKey);

    const occurrence = (occurrenceCounts.get(termKey) ?? 0) + 1;
    occurrenceCounts.set(termKey, occurrence);
    match.occurrence = occurrence;
    result.push(match);
  }
  return result;
}
/**
 * Build the TWL (translation word list) TSV for a USFM document.
 *
 * Alignment data is removed first, the result is split into verses, every
 * verse is scanned for article terms, and the collected matches are written
 * as tab-separated rows ordered by chapter, verse and position in the verse.
 * Progress counts are logged to the console.
 *
 * @param {string} usfmContent - Raw USFM text, possibly with alignment data.
 * @returns {string} TSV text: a header line followed by one line per match
 *   (header only when nothing matched).
 */
function generateTWL(usfmContent) {
  const verses = parseUSFM(removeAlignments(usfmContent));
  console.log(`Parsed ${verses.length} verses`);

  const allMatches = verses.flatMap((verse) => findMatches(verse, contextDatabase));
  console.log(`Found ${allMatches.length} total matches`);

  // Order by chapter, then verse, then start position within the verse
  const chapterAndVerse = (match) => match.reference.split(':').map(Number);
  allMatches.sort((left, right) => {
    const [leftChapter, leftVerse] = chapterAndVerse(left);
    const [rightChapter, rightVerse] = chapterAndVerse(right);
    if (leftChapter !== rightChapter) {
      return leftChapter - rightChapter;
    }
    if (leftVerse !== rightVerse) {
      return leftVerse - rightVerse;
    }
    return left.startPos - right.startPos;
  });

  const lines = ['Reference\tID\tTags\tOrigWords\tOccurrence\tTWLink\tDisambiguation\tContext\tConfidence\tMatch_Type'];
  for (const match of allMatches) {
    const cells = [
      match.reference,
      match.id,
      match.tags,
      match.origWords,
      match.occurrence,
      match.twLink,
      match.disambiguation || '',
      match.context || '',
      match.confidence,
      match.matchType,
    ];
    // Template conversion keeps the same text form as direct interpolation.
    lines.push(cells.map((cell) => `${cell}`).join('\t'));
  }
  return lines.join('\n');
}
/**
 * Derive the TSV output path for a USFM input file (Node.js only, CLI helper).
 *
 * The directory and a trailing ".usfm" are removed from the input name. A
 * numbered name such as "01-GEN" loses its "NN-" prefix and gains "twl_"
 * ("twl_GEN.tsv"); any other name just gets ".tsv" appended.
 *
 * @param {string} inputFile - Path of the USFM input file.
 * @param {string|null} [outputDir=null] - Directory to place the result in;
 *   when omitted (or empty) only the file name is returned.
 * @returns {Promise<string>} Output file name or path.
 * @throws {Error} When not running under Node.js.
 */
async function generateOutputFilename(inputFile, outputDir = null) {
  if (!isNode) {
    throw new Error('generateOutputFilename is only available in Node.js environment');
  }
  const { default: path } = await import('path');
  const stem = path.basename(inputFile, '.usfm');
  const numberedPrefix = /^\d+-/;
  const fileName = numberedPrefix.test(stem)
    ? `twl_${stem.replace(numberedPrefix, '')}.tsv`
    : `${stem}.tsv`;
  return outputDir ? path.join(outputDir, fileName) : fileName;
}
/**
 * Command-line entry point (Node.js only).
 *
 * Usage: node twl-linker.js <usfm_file> [output_file]
 *
 * Reads the USFM file, generates the TWL TSV, writes it to the output file
 * (derived from the input name when not given) and prints summary statistics.
 * Outside Node.js it logs an error and returns. With no arguments it prints
 * the usage text and exits with status 1; any error while reading,
 * generating or writing is reported and also ends the process with status 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  if (!isNode) {
    console.error('CLI functionality is only available in Node.js environment');
    return;
  }
  const fs = (await import('fs')).default;
  const args = process.argv.slice(2);
  if (args.length < 1) {
    console.log('Usage: node twl-linker.js <usfm_file> [output_file]');
    console.log('Example: node twl-linker.js 46-ROM.usfm twl_ROM.tsv');
    console.log(' node twl-linker.js test.usfm test.tsv');
    console.log(' node twl-linker.js 01-GEN.usfm # outputs to twl_GEN.tsv');
    process.exit(1);
  }
  const usfmFile = args[0];
  const outputFile = args[1] || await generateOutputFilename(usfmFile);
  try {
    console.log(`Reading USFM file: ${usfmFile}`);
    const usfmContent = fs.readFileSync(usfmFile, 'utf8');
    console.log('Generating semantic links...');
    const tsvContent = generateTWL(usfmContent);
    console.log(`Writing output to: ${outputFile}`);
    fs.writeFileSync(outputFile, tsvContent);
    console.log('✅ Semantic linking complete!');
    // Show enhanced statistics
    const lines = tsvContent.split('\n');
    console.log(`📊 Generated ${lines.length - 1} links`);
    // Analyze the results (column 6 = Disambiguation, column 8 = Confidence)
    const dataLines = lines.slice(1).filter(line => line.trim());
    if (dataLines.length > 0) {
      const rows = dataLines.map(line => line.split('\t'));
      const confidences = rows
        .map(cells => Number.parseFloat(cells[8]))
        .filter(c => !Number.isNaN(c));
      const disambiguations = rows.map(cells => cells[6] || '');
      // Confidence statistics. Skipped when no confidence could be parsed:
      // reducing an empty list without a seed throws, which would report a
      // failure after the output file has already been written.
      if (confidences.length > 0) {
        const avgConfidence = confidences.reduce((sum, c) => sum + c, 0) / confidences.length;
        const highConfidence = confidences.filter(c => c >= 0.8).length;
        console.log(`📈 Average confidence: ${avgConfidence.toFixed(3)}`);
        console.log(`🎯 High confidence links (≥0.8): ${highConfidence}/${confidences.length}`);
      }
      // Disambiguation statistics
      const singleMatches = disambiguations.filter(d => d === 'single').length;
      const autoDisambiguated = disambiguations.filter(d => d.startsWith('auto:')).length;
      const manualNeeded = disambiguations.filter(d => d.startsWith('manual:')).length;
      console.log(`🔍 Disambiguation breakdown:`);
      console.log(` • Single matches: ${singleMatches}`);
      console.log(` • Auto-disambiguated: ${autoDisambiguated}`);
      console.log(` • Manual review needed: ${manualNeeded}`);
      if (manualNeeded > 0) {
        console.log(`⚠️ ${manualNeeded} ambiguous terms need manual review`);
      }
    }
  } catch (error) {
    console.error('Error:', error.message);
    process.exit(1);
  }
}
// Export the main function and utilities for use as a module
export {
generateTWL,
parseUSFM,
findMatches,
generateOutputFilename,
contextDatabase,
main // Export main for programmatic access if needed
};