UNPKG

twl-linker

Version:

Biblical Semantic Linker - Uses the biblical context database to create semantic links between USFM Bible text and biblical articles with confidence scoring.

github.com/unfoldingWord/twl-linker

unfoldingWord/twl-linker

966 lines (826 loc) • 33.1 kB

JavaScript

/**
 * Biblical Semantic Linker
 * ========================
 *
 * Uses the biblical context database to create semantic links between
 * USFM Bible text and biblical articles with confidence scoring.
 */

// Core imports that work in both browser and Node.js
import { removeAlignments } from './usfm-alignment-remover.js';

// Import the biblical context database
import contextDatabase from './biblical_context_database.json' with { type: 'json' };

// Node.js-specific functionality - will be undefined in browser
const isNode = typeof process !== 'undefined' && process.versions?.node;

// Lines starting with these markers are book metadata/titles and never carry verse text.
const METADATA_PREFIXES = ['\\id', '\\h', '\\toc', '\\mt'];

// Paragraph, poetry and list markers: any text following them on the same line
// belongs to the current verse (e.g. "\q2 or stand in the pathway with sinners").
const TEXT_PARAGRAPH_MARKER =
  /^\\(?:p|m|po|pr|cls|pmo|pm|pmc|pmr|pi\d*|mi|nb|pc|ph\d*|q\d*|qr|qc|qm\d*|li\d*|lim\d*|lh|lf)\s+(.*)$/;

// Lower number wins when the same term text is offered by several articles.
const TERM_PRIORITY = Object.freeze({ exact: 1, morphological: 2, theological: 3 });

// Very common words get a confidence penalty when they appear as search terms.
const COMMON_WORDS = new Set(['the', 'and', 'of', 'to', 'for', 'in', 'on', 'at', 'by', 'with']);

/**
 * Generate a 4-character hexadecimal ID starting with a-f.
 *
 * IDs are random, not guaranteed unique across a document.
 *
 * @returns {string} One letter a-f followed by three hex digits.
 */
function generateId() {
  const firstChar = String.fromCharCode(97 + Math.floor(Math.random() * 6)); // a-f
  // slice(2, 5) skips the leading "0." of the hex fraction; pad in case it is short.
  const remainingChars = Math.random().toString(16).slice(2, 5).padEnd(3, '0');
  return firstChar + remainingChars;
}

/**
 * Map internal category names to TSV output format.
 *
 * @param {string} category - Internal category ('kt', 'other', 'names', ...).
 * @returns {string} The tag for the TSV; unknown categories are returned unchanged.
 */
function mapCategoryToTags(category) {
  const categoryMap = {
    kt: 'keyterm',
    other: '',
    names: 'name',
  };
  return Object.hasOwn(categoryMap, category) ? categoryMap[category] : category;
}

/**
 * Parse USFM content and extract verses with their references.
 *
 * @param {string} usfmContent - USFM text (alignment data already removed).
 * @returns {Array<{reference: string, text: string, chapter: number, verse: number}>}
 *   One entry per verse that has non-empty text, in document order.
 */
function parseUSFM(usfmContent) {
  const verses = [];
  let currentChapter = 1;
  let currentVerse = 1;
  let currentText = '';

  // Push the verse accumulated so far (if it has any text).
  const flushVerse = () => {
    const pending = currentText.trim();
    if (pending) {
      verses.push({
        reference: `${currentChapter}:${currentVerse}`,
        text: cleanText(pending),
        chapter: currentChapter,
        verse: currentVerse,
      });
    }
  };

  for (const line of usfmContent.split('\n')) {
    let content = line.trim();

    // Skip empty lines and metadata
    if (!content || METADATA_PREFIXES.some((prefix) => content.startsWith(prefix))) {
      continue;
    }

    // A paragraph/poetry marker may be followed by verse text (or a \v marker)
    // on the same line; strip the marker and keep processing the remainder
    // instead of discarding the whole line.
    const paragraphMatch = content.match(TEXT_PARAGRAPH_MARKER);
    if (paragraphMatch) {
      content = paragraphMatch[1].trim();
      if (!content) continue;
    }

    // Chapter marker
    const chapterMatch = content.match(/^\\c\s+(\d+)/);
    if (chapterMatch) {
      flushVerse();
      currentChapter = Number.parseInt(chapterMatch[1], 10);
      currentVerse = 1;
      currentText = '';
      continue;
    }

    // Verse marker
    const verseMatch = content.match(/^\\v\s+(\d+)(.*)$/);
    if (verseMatch) {
      flushVerse();
      currentVerse = Number.parseInt(verseMatch[1], 10);
      currentText = verseMatch[2] || '';
      continue;
    }

    // Regular text line (add to current verse); other marker lines are ignored
    if (!content.startsWith('\\')) {
      currentText += ` ${content}`;
    }
  }

  // Save final verse
  flushVerse();

  return verses;
}

/**
 * Clean text by removing notes and formatting markers.
 *
 * @param {string} text - Raw verse text, possibly containing USFM markers.
 * @returns {string} Text without notes/markers, whitespace collapsed and trimmed.
 */
function cleanText(text) {
  return text
    // Remove note containers (\f footnote, \fe endnote, \x cross-reference) together
    // with their content. Notes contain nested markers such as \fr and \ft, so the
    // body must be allowed to contain backslashes; match lazily up to the closer.
    .replace(/\\(f|fe|x)\s[\s\S]*?\\\1\*/g, '')
    // Remove remaining markers, including nested (\+nd), numbered (\q1) and closing (\nd*) forms
    .replace(/\\\+?[a-z]+\d*\*?/g, '')
    // Clean up whitespace
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Normalize text for matching (lowercase, remove punctuation).
 *
 * @param {string} text
 * @returns {string} Lowercased text with every non-word character replaced by a
 *   space and runs of whitespace collapsed to a single space.
 */
function normalizeForMatching(text) {
  return text
    .toLowerCase()
    .replace(/[^\w\s]/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Create word boundaries regex for exact matching.
 *
 * @param {string} term - Literal term; regex metacharacters are escaped.
 * @returns {RegExp} Global, case-insensitive regex matching the whole term.
 */
function createWordBoundaryRegex(term) {
  // Escape special regex characters
  const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  return new RegExp(`\\b${escapedTerm}\\b`, 'gi');
}

/**
 * Get all possible search terms for an article.
 *
 * @param {object} article - Database entry; reads `cleaned_terms`,
 *   `morphological_variants` and `theological_variants` (each optional).
 * @returns {Array<{text: string, original: string, type: string, confidence: number}>}
 */
function getSearchTerms(article) {
  const terms = [];

  // Short terms are skipped because they are likely to cause false matches.
  const collect = (candidates, type, confidence, minLength) => {
    candidates?.forEach((candidate) => {
      if (candidate.length >= minLength) {
        terms.push({
          text: candidate.toLowerCase(),
          original: candidate,
          type,
          confidence,
        });
      }
    });
  };

  // Cleaned terms (exact matches) - highest confidence
  collect(article.cleaned_terms, 'exact', 0.95, 3);
  // Morphological variants
  collect(article.morphological_variants, 'morphological', 0.85, 3);
  // Theological variants (more selective: require longer terms)
  collect(article.theological_variants, 'theological', 0.75, 4);

  return terms;
}

/**
 * Calculate confidence score based on context.
 *
 * Every word of every positive context phrase found in the verse adds 0.1;
 * every word of every negative context phrase found subtracts 0.15.
 *
 * @param {string} verseText - Verse text.
 * @param {object} article - Reads `positive_contexts` / `negative_contexts`.
 * @param {number} baseConfidence - Starting confidence.
 * @returns {number} Adjusted confidence clamped to [0.1, 1.0].
 */
function calculateContextualConfidence(verseText, article, baseConfidence) {
  const verseWords = new Set(normalizeForMatching(verseText).split(/\s+/));

  // Count how many words across all context phrases occur in the verse.
  const countHits = (contexts) => {
    let hits = 0;
    for (const context of contexts ?? []) {
      for (const contextWord of normalizeForMatching(context).split(/\s+/)) {
        if (verseWords.has(contextWord)) hits += 1;
      }
    }
    return hits;
  };

  let positiveScore = 0;
  for (let i = countHits(article.positive_contexts); i > 0; i--) positiveScore += 0.1;
  let negativeScore = 0;
  for (let i = countHits(article.negative_contexts); i > 0; i--) negativeScore += 0.15;

  // Apply contextual adjustments and clamp between 0.1 and 1.0
  const contextualConfidence = baseConfidence + positiveScore - negativeScore;
  return Math.max(0.1, Math.min(1.0, contextualConfidence));
}

/**
 * Get surrounding context for a word match - shows the full verse with
 * brackets around the specific occurrence.
 *
 * @param {string} text - Verse text the offsets refer to.
 * @param {number} matchStart - Inclusive start offset in `text`.
 * @param {number} matchEnd - Exclusive end offset in `text`.
 * @param {number} [contextWords=2] - Currently unused; kept for API compatibility.
 * @returns {{before: string, match: string, after: string, full: string}}
 *   All fields are empty strings when the inputs are invalid.
 */
function getWordContext(text, matchStart, matchEnd, contextWords = 2) {
  // Validate inputs
  if (!text || matchStart < 0 || matchEnd <= matchStart || matchEnd > text.length) {
    return { before: '', match: '', after: '', full: '' };
  }

  const matchText = text.substring(matchStart, matchEnd);

  // Offsets index `text` itself, so bracket first and trim afterwards; trimming
  // first would shift the brackets whenever the text has leading whitespace.
  const contextWithBrackets =
    `${text.substring(0, matchStart)}[${matchText}]${text.substring(matchEnd)}`.trim();

  return {
    before: '', // Not really applicable for verse context
    match: matchText,
    after: '', // Not really applicable for verse context
    full: contextWithBrackets,
  };
}

/**
 * Determine best article for ambiguous terms using context analysis.
 *
 * @param {string} term - The ambiguous term (lowercase key, e.g. 'god', 'call').
 * @param {string} context - Verse context; the matched occurrence may be wrapped
 *   in square brackets, which are ignored for the analysis.
 * @param {string[]} possibleArticles - Candidate article filenames.
 * @param {object} contextDatabase - Database with an `articles` map.
 * @returns {{filename: string, score: number, article: object}|null}
 *   Highest-scoring candidate (first one wins ties), or null if none exist.
 */
function disambiguateAmbiguousTerm(term, context, possibleArticles, contextDatabase) {
  // The caller marks the match as "[word]"; with the brackets left in, patterns
  // such as /called\s+.../ or 'called ones' can never match that occurrence.
  const plainContext = context.replace(/[[\]]/g, '');
  const contextLower = plainContext.toLowerCase();
  const contextWords = new Set(contextLower.split(/\s+/));

  const has = (needle) => contextLower.includes(needle);
  const hasAny = (needles) => needles.some(has);

  const scoredArticles = [];

  for (const filename of possibleArticles) {
    const article = contextDatabase.articles[filename];
    if (!article) continue;

    let score = 0;

    // Specific disambiguation rules for "God" vs "false god"
    if (term === 'god') {
      if (filename === 'god.md') {
        // Indicators of the true God
        if (hasAny(['lord', 'father', 'creator', 'heaven', 'blessed', 'almighty', 'create',
          'made', 'word', 'glory', 'spirit', 'holy', 'jesus', 'christ'])) {
          score += 0.8;
        }

        // Default preference for "god.md" in most contexts
        score += 0.6;

        // Negative indicators for true God
        if (hasAny(['idol', 'false', 'carved', 'image']) ||
            (has('worship') && has('not')) ||
            (has('golden') && has('calf'))) {
          score -= 0.7;
        }
      } else if (filename === 'falsegod.md') {
        // Indicators of false gods/idols
        if (hasAny(['idol', 'false', 'carved', 'image', 'golden', 'baal', 'asherah',
          'molech', 'dagon']) ||
            (has('worship') && has('not'))) {
          score += 0.8;
        }

        // Negative indicators for false gods (default to true God)
        if (hasAny(['lord', 'father', 'creator', 'blessed', 'jesus', 'christ'])) {
          score -= 0.5;
        }
      }
    }

    // Specific disambiguation rules for "call" variants
    if (term === 'call' || term === 'called') {
      if (filename === 'call-toname.md') {
        // Naming context: "called Paul", "called Jesus", "called by name",
        // plus biblical designations/roles ("called to be an apostle")
        if (hasAny(['name', 'by name', 'this place', 'bethel', 'jerusalem', 'apostle',
          'saints', 'called ones', 'prophet', 'priest']) ||
            /\bcalled\s+[A-Z][a-z]+/.test(plainContext) ||
            /\b[A-Z][a-z]+\s+called/.test(plainContext) ||
            /called\s+(to\s+be\s+)?(an?\s+)?(apostle|saint|prophet|priest|servant)/.test(contextLower)) {
          score += 0.8;
        }

        // Negative indicators for naming
        if (hasAny(['out', 'forth', 'come'])) {
          score -= 0.3;
        }
      } else if (filename === 'call-tosummon.md') {
        // Summoning context: "called to him", "called forth", "called disciples"
        if (hasAny(['to him', 'to her', 'forth', 'come', 'disciple']) ||
            (has('to be') && !has('apostle') && !has('saint')) ||
            /called\s+\w+\s+to\s+/.test(contextLower)) {
          score += 0.8;
        }

        // Negative indicators for summoning
        if (hasAny(['name', 'place', 'apostle', 'saints'])) {
          score -= 0.3;
        }
      } else if (filename === 'call-speakloudly.md') {
        // Loud speaking context: "called out", "cried out", "voice"
        if (hasAny(['out', 'cry', 'voice', 'shout', 'loud', 'help'])) {
          score += 0.8;
        }

        // Negative indicators for loud speaking
        if (hasAny(['name', 'forth', 'to be', 'apostle'])) {
          score -= 0.3;
        }
      }
    }

    // Generic context matching based on definition keywords
    const definition = article.definition_summary?.toLowerCase() || '';
    for (const defWord of definition.split(/\s+/)) {
      if (defWord.length > 3 && contextWords.has(defWord)) {
        score += 0.1;
      }
    }

    scoredArticles.push({ filename, score, article });
  }

  // Sort by score and return the best match
  scoredArticles.sort((a, b) => b.score - a.score);
  return scoredArticles.length > 0 ? scoredArticles[0] : null;
}

/**
 * Build one TWL match record (private helper for findMatches).
 */
function createMatchRecord({
  verse, article, filename, originalMatch, confidence, matchType, context, disambiguation,
}) {
  return {
    reference: verse.reference,
    id: generateId(),
    tags: mapCategoryToTags(article.category),
    origWords: originalMatch.text,
    occurrence: 1, // renumbered by deduplicateMatches
    twLink: `rc://*/tw/dict/bible/${article.category}/${filename.replace('.md', '')}`,
    confidence: Math.min(1.0, confidence).toFixed(3),
    matchType,
    context,
    disambiguation,
    startPos: originalMatch.originalStart,
    endPos: originalMatch.originalEnd,
  };
}

/**
 * Find matches in a verse using word boundary detection and disambiguation.
 *
 * Unambiguous terms are matched first (exact > morphological > theological when
 * several articles offer the same term); terms listed in
 * `contextDatabase.ambiguous_terms` are then resolved via context analysis.
 * Overlapping character ranges are only linked once.
 *
 * @param {{reference: string, text: string}} verse - A verse from parseUSFM.
 * @param {object} contextDatabase - Database with `articles` and optional `ambiguous_terms`.
 * @returns {object[]} Match records sorted by position with occurrences numbered.
 */
function findMatches(verse, contextDatabase) {
  const matches = [];
  const originalText = verse.text;
  const normalizedVerse = normalizeForMatching(originalText);
  const ambiguousTerms = contextDatabase.ambiguous_terms;

  // Track matched character ranges (in normalized text) to avoid overlaps
  const matchedRanges = [];

  // term -> {priority, filename, searchTerm}; keep only the best-priority owner per term
  const termPriorityMap = new Map();
  for (const [filename, article] of Object.entries(contextDatabase.articles)) {
    for (const searchTerm of getSearchTerms(article)) {
      const priority = TERM_PRIORITY[searchTerm.type] ?? 3;
      const existing = termPriorityMap.get(searchTerm.text);
      if (!existing || existing.priority > priority) {
        termPriorityMap.set(searchTerm.text, { priority, filename, searchTerm });
      }
    }
  }

  // Process matches using the priority-filtered terms
  for (const [termKey, { filename, searchTerm }] of termPriorityMap) {
    // Ambiguous terms are handled separately below
    if (ambiguousTerms?.[termKey]) continue;

    const article = contextDatabase.articles[filename];
    if (!article) continue;

    const regex = createWordBoundaryRegex(searchTerm.text);
    for (const match of normalizedVerse.matchAll(regex)) {
      const startPos = match.index;
      const endPos = startPos + match[0].length;

      if (hasRangeOverlap(startPos, endPos, matchedRanges)) continue;

      // Find the corresponding original text
      const originalMatch = extractOriginalMatch(originalText, normalizedVerse, startPos, endPos);
      if (!originalMatch?.text) continue;

      let adjustedConfidence =
        calculateContextualConfidence(originalText, article, searchTerm.confidence);

      // Boost confidence for longer terms
      if (searchTerm.text.length >= 6) {
        adjustedConfidence += 0.05;
      }
      // Reduce confidence for very common words
      if (COMMON_WORDS.has(searchTerm.text)) {
        adjustedConfidence -= 0.3;
      }
      // Only include matches with reasonable confidence
      if (adjustedConfidence < 0.5) continue;

      const context = getWordContext(
        originalText, originalMatch.originalStart, originalMatch.originalEnd,
      );
      matches.push(createMatchRecord({
        verse,
        article,
        filename,
        originalMatch,
        confidence: adjustedConfidence,
        matchType: searchTerm.type,
        context: context.full,
        disambiguation: 'single',
      }));

      // Mark this range as matched
      matchedRanges.push({ start: startPos, end: endPos });
    }
  }

  // Now handle ambiguous terms
  if (ambiguousTerms) {
    for (const [ambiguousTerm, possibleArticles] of Object.entries(ambiguousTerms)) {
      const regex = createWordBoundaryRegex(ambiguousTerm);
      for (const match of normalizedVerse.matchAll(regex)) {
        const startPos = match.index;
        const endPos = startPos + match[0].length;

        if (hasRangeOverlap(startPos, endPos, matchedRanges)) continue;

        const originalMatch = extractOriginalMatch(originalText, normalizedVerse, startPos, endPos);
        if (!originalMatch?.text) continue;

        const context = getWordContext(
          originalText, originalMatch.originalStart, originalMatch.originalEnd, 4,
        );

        // Try to disambiguate automatically
        const bestMatch = disambiguateAmbiguousTerm(
          ambiguousTerm, context.full, possibleArticles, contextDatabase,
        );

        if (bestMatch && bestMatch.score > 0.3) {
          // Found a clear best match
          matches.push(createMatchRecord({
            verse,
            article: bestMatch.article,
            filename: bestMatch.filename,
            originalMatch,
            confidence: calculateContextualConfidence(
              originalText, bestMatch.article, 0.8 + bestMatch.score * 0.2,
            ),
            matchType: 'disambiguated',
            context: context.full,
            disambiguation: `auto:${bestMatch.score.toFixed(2)}`,
          }));
        } else {
          // No clear best match found, pick the first (most likely) option
          const filename = possibleArticles[0];
          const article = contextDatabase.articles[filename];

          if (article) {
            // List every option for the reviewer. A candidate missing from the
            // database is reported as "unknown" instead of throwing.
            const optionsList = possibleArticles.map((articleFile, index) => {
              const category = contextDatabase.articles[articleFile]?.category ?? 'unknown';
              return `${index + 1}:${category}/${articleFile.replace('.md', '')}`;
            }).join(', ');

            matches.push(createMatchRecord({
              verse,
              article,
              filename,
              originalMatch,
              // 0.55 is the default confidence for ambiguous terms
              confidence: calculateContextualConfidence(originalText, article, 0.55),
              matchType: 'ambiguous',
              context: context.full,
              disambiguation: `manual:option1 (${optionsList})`,
            }));
          }
        }

        // Mark this range as matched
        matchedRanges.push({ start: startPos, end: endPos });
      }
    }
  }

  // Sort by position and number repeated occurrences
  return deduplicateMatches(matches);
}

/**
 * Find the original text that corresponds to a normalized phrase.
 *
 * Compares whitespace-separated words of the original with the words of its
 * normalized form index by index.
 *
 * @param {string} originalText
 * @param {string} normalizedPhrase
 * @returns {{text: string, positions: number[]}|null} First occurrence, or null.
 */
function findOriginalText(originalText, normalizedPhrase) {
  const originalWords = originalText.split(/\s+/);
  const normalizedWords = normalizeForMatching(originalText).split(/\s+/);
  const searchWords = normalizedPhrase.split(/\s+/);

  for (let i = 0; i <= normalizedWords.length - searchWords.length; i++) {
    const isMatch = searchWords.every((word, j) => normalizedWords[i + j] === word);
    if (isMatch) {
      return {
        text: originalWords.slice(i, i + searchWords.length).join(' '),
        positions: Array.from({ length: searchWords.length }, (_, idx) => i + idx),
      };
    }
  }

  return null;
}

/**
 * Get original word at a specific position.
 *
 * @param {string} originalText
 * @param {number} position - Zero-based word index.
 * @returns {string} The word, or '' when the index is out of range.
 */
function getOriginalWord(originalText, position) {
  const words = originalText.split(/\s+/);
  return words[position] || '';
}

/**
 * Check if two sets of positions overlap.
 *
 * @param {number[]} positions1 - Array of positions.
 * @param {Set<number>} positions2 - Set of positions.
 * @returns {boolean}
 */
function hasOverlap(positions1, positions2) {
  return positions1.some((pos) => positions2.has(pos));
}

/**
 * Clean punctuation from a word, preserving possessive apostrophes and internal hyphens.
 *
 * @param {string} word - A single whitespace-delimited token.
 * @returns {string} The token without leading (and, for words that do not end in
 *   "s" or "s'", trailing) punctuation.
 */
function cleanPunctuation(word) {
  // Words ending in "s" or "s'" (Jesus', Mary's): strip leading punctuation only,
  // so a possessive ending is kept.
  const possessiveMatch = word.match(/^(.+?s'?)$/);
  if (possessiveMatch) {
    return possessiveMatch[1].replace(/^[^\w\-]+/, '');
  }

  // Otherwise remove leading and trailing punctuation; hyphens are kept
  return word.replace(/^[^\w\-]+/, '').replace(/[^\w\-]+$/, '');
}

/**
 * Extract original text from a character range in normalized text.
 *
 * Both texts are expected to be single-space separated and trimmed, with
 * `normalizedText` being `normalizeForMatching(originalText)`.
 *
 * @param {string} originalText - Cleaned verse text.
 * @param {string} normalizedText - Normalized form of `originalText`.
 * @param {number} startPos - Inclusive start offset in `normalizedText`.
 * @param {number} endPos - Exclusive end offset in `normalizedText`.
 * @returns {{text: string, originalStart: number, originalEnd: number}|null}
 *   Matched original words (punctuation-cleaned) and their character range in
 *   `originalText`, or null if the range cannot be mapped.
 */
function extractOriginalMatch(originalText, normalizedText, startPos, endPos) {
  // Validate inputs
  if (!originalText || !normalizedText || startPos < 0 || endPos <= startPos) {
    return null;
  }

  const normalizedWords = normalizedText.split(/\s+/).filter((w) => w.length > 0);
  const originalWords = originalText.split(/\s+/).filter((w) => w.length > 0);
  if (normalizedWords.length === 0 || originalWords.length === 0) {
    return null;
  }

  // Find the normalized words containing the first and the last matched character
  let startWordIndex = -1;
  let endWordIndex = -1;
  let charCount = 0;
  for (let i = 0; i < normalizedWords.length; i++) {
    const wordStart = charCount;
    const wordEnd = wordStart + normalizedWords[i].length;

    if (startWordIndex === -1 && startPos >= wordStart && startPos < wordEnd) {
      startWordIndex = i;
    }
    // endPos is exclusive, so look for the word containing endPos - 1
    if (endPos - 1 >= wordStart && endPos - 1 < wordEnd) {
      endWordIndex = i;
      break;
    }

    charCount = wordEnd + 1; // +1 for the separating space
  }

  if (startWordIndex === -1 || endWordIndex === -1 || startWordIndex > endWordIndex) {
    return null;
  }

  // Normalization can split one original token into several words ("God's" ->
  // "god s") or drop it entirely (a lone dash), so normalized word indices do
  // not line up with original word indices. Build an explicit mapping.
  const normalizedToOriginal = [];
  originalWords.forEach((word, originalIndex) => {
    const pieceCount = normalizeForMatching(word).split(/\s+/).filter(Boolean).length;
    for (let k = 0; k < pieceCount; k++) {
      normalizedToOriginal.push(originalIndex);
    }
  });

  const firstOriginal = normalizedToOriginal[startWordIndex];
  const lastOriginal = normalizedToOriginal[endWordIndex];
  if (firstOriginal === undefined || lastOriginal === undefined) {
    return null;
  }

  // Character offset of the first matched word (words are single-space separated)
  let originalStart = 0;
  for (let i = 0; i < firstOriginal; i++) {
    originalStart += originalWords[i].length + 1;
  }

  const matchedWords = originalWords.slice(firstOriginal, lastOriginal + 1);
  const originalEnd = originalStart + matchedWords.join(' ').length;

  // Clean punctuation from the extracted words
  const cleanedText = matchedWords.map((word) => cleanPunctuation(word)).join(' ');

  return {
    text: cleanedText,
    originalStart,
    originalEnd,
  };
}

/**
 * Check if a character range overlaps with existing ranges.
 *
 * @param {number} start - Inclusive start.
 * @param {number} end - Exclusive end.
 * @param {Array<{start: number, end: number}>} existingRanges
 * @returns {boolean}
 */
function hasRangeOverlap(start, end, existingRanges) {
  return existingRanges.some((range) => start < range.end && end > range.start);
}

/**
 * Order matches by chapter, verse, then start position within the verse.
 */
function compareMatchPosition(a, b) {
  const [aChap, aVerse] = a.reference.split(':').map(Number);
  const [bChap, bVerse] = b.reference.split(':').map(Number);

  if (aChap !== bChap) return aChap - bChap;
  if (aVerse !== bVerse) return aVerse - bVerse;
  return a.startPos - b.startPos;
}

/**
 * Remove duplicate matches and number the occurrences.
 *
 * Matches are sorted in place by position. A match is dropped when another
 * match with the same reference, words, link and start position was already
 * kept; kept matches get `occurrence` set to their 1-based count among matches
 * with the same reference, words and link.
 *
 * @param {object[]} matches - Match records (sorted in place, `occurrence` updated).
 * @returns {object[]} The de-duplicated matches in position order.
 */
function deduplicateMatches(matches) {
  matches.sort(compareMatchPosition);

  const seenPositions = new Set();
  const occurrenceCounts = new Map();
  const result = [];

  for (const match of matches) {
    const baseKey = `${match.reference}-${match.origWords}-${match.twLink}`;
    const posKey = `${baseKey}-${match.startPos}`;
    if (seenPositions.has(posKey)) continue;
    seenPositions.add(posKey);

    // Count how many times this term has already appeared in this verse
    const occurrence = (occurrenceCounts.get(baseKey) ?? 0) + 1;
    occurrenceCounts.set(baseKey, occurrence);
    match.occurrence = occurrence;

    result.push(match);
  }

  return result;
}

/**
 * Main function to generate TWL from USFM.
 *
 * @param {string} usfmContent - USFM text, optionally containing alignment data.
 * @returns {string} TSV content: a header line followed by one line per link.
 */
function generateTWL(usfmContent) {
  // Remove alignment data from USFM content first
  const cleanedUsfmContent = removeAlignments(usfmContent);

  const verses = parseUSFM(cleanedUsfmContent);
  console.log(`Parsed ${verses.length} verses`);

  const allMatches = [];
  for (const verse of verses) {
    allMatches.push(...findMatches(verse, contextDatabase));
  }
  console.log(`Found ${allMatches.length} total matches`);

  // Sort matches by reference (chapter:verse) and then by position within verse
  allMatches.sort(compareMatchPosition);

  // Convert to TSV format with enhanced columns
  const header = 'Reference\tID\tTags\tOrigWords\tOccurrence\tTWLink\tDisambiguation\tContext\tConfidence\tMatch_Type';
  const rows = allMatches.map((match) => [
    match.reference,
    match.id,
    match.tags,
    match.origWords,
    match.occurrence,
    match.twLink,
    match.disambiguation || '',
    match.context || '',
    match.confidence,
    match.matchType,
  ].join('\t'));

  return [header, ...rows].join('\n');
}

/**
 * Generate output filename based on input filename
 * (Node.js only - for CLI usage).
 *
 * "01-GEN.usfm" becomes "twl_GEN.tsv"; any other name just has a trailing
 * ".usfm" replaced by ".tsv".
 *
 * @param {string} inputFile - Path of the USFM input file.
 * @param {string|null} [outputDir=null] - Directory to place the output in.
 * @returns {Promise<string>} Output file name or path.
 * @throws {Error} When called outside Node.js.
 */
async function generateOutputFilename(inputFile, outputDir = null) {
  if (!isNode) {
    throw new Error('generateOutputFilename is only available in Node.js environment');
  }

  const path = (await import('path')).default;
  const baseName = path.basename(inputFile, '.usfm');

  // If input starts with number and dash (e.g., "01-GEN"), remove prefix and add "twl_";
  // otherwise just change the extension to .tsv
  const outputName = /^\d+-/.test(baseName)
    ? `twl_${baseName.replace(/^\d+-/, '')}.tsv`
    : `${baseName}.tsv`;

  return outputDir ? path.join(outputDir, outputName) : outputName;
}

/**
 * Main execution function (Node.js only - for CLI usage).
 *
 * Reads the USFM file named by the first CLI argument, writes the generated
 * TSV to the second argument (or a derived name) and prints statistics.
 * Exits the process with code 1 on missing arguments or on any error.
 *
 * @returns {Promise<void>}
 */
async function main() {
  if (!isNode) {
    console.error('CLI functionality is only available in Node.js environment');
    return;
  }

  const fs = (await import('fs')).default;

  const args = process.argv.slice(2);

  if (args.length < 1) {
    console.log('Usage: node twl-linker.js <usfm_file> [output_file]');
    console.log('Example: node twl-linker.js 46-ROM.usfm twl_ROM.tsv');
    console.log(' node twl-linker.js test.usfm test.tsv');
    console.log(' node twl-linker.js 01-GEN.usfm # outputs to twl_GEN.tsv');
    process.exit(1);
  }

  const usfmFile = args[0];
  const outputFile = args[1] || await generateOutputFilename(usfmFile);

  try {
    console.log(`Reading USFM file: ${usfmFile}`);
    const usfmContent = fs.readFileSync(usfmFile, 'utf8');

    console.log('Generating semantic links...');
    const tsvContent = generateTWL(usfmContent);

    console.log(`Writing output to: ${outputFile}`);
    fs.writeFileSync(outputFile, tsvContent);

    console.log('✅ Semantic linking complete!');

    // Show enhanced statistics
    const lines = tsvContent.split('\n');
    console.log(`📊 Generated ${lines.length - 1} links`);

    // Analyze the results
    const dataLines = lines.slice(1).filter((line) => line.trim());
    if (dataLines.length > 0) {
      const columns = dataLines.map((line) => line.split('\t'));
      const confidences = columns
        .map((cols) => Number.parseFloat(cols[8]))
        .filter((c) => !Number.isNaN(c));
      const disambiguations = columns.map((cols) => cols[6] || '');

      // Confidence statistics (initial value keeps reduce safe on an empty list)
      const avgConfidence = confidences.length > 0
        ? confidences.reduce((a, b) => a + b, 0) / confidences.length
        : 0;
      const highConfidence = confidences.filter((c) => c >= 0.8).length;

      console.log(`📈 Average confidence: ${avgConfidence.toFixed(3)}`);
      console.log(`🎯 High confidence links (≥0.8): ${highConfidence}/${confidences.length}`);

      // Disambiguation statistics
      const singleMatches = disambiguations.filter((d) => d === 'single').length;
      const autoDisambiguated = disambiguations.filter((d) => d.startsWith('auto:')).length;
      const manualNeeded = disambiguations.filter((d) => d.startsWith('manual:')).length;

      console.log(`🔍 Disambiguation breakdown:`);
      console.log(` • Single matches: ${singleMatches}`);
      console.log(` • Auto-disambiguated: ${autoDisambiguated}`);
      console.log(` • Manual review needed: ${manualNeeded}`);

      if (manualNeeded > 0) {
        console.log(`⚠️ ${manualNeeded} ambiguous terms need manual review`);
      }
    }
  } catch (error) {
    console.error('Error:', error.message);
    process.exit(1);
  }
}

// Export the main function and utilities for use as a module
export {
  generateTWL,
  parseUSFM,
  findMatches,
  generateOutputFilename,
  contextDatabase,
  main, // Export main for programmatic access if needed
};