@zerospacegg/anrubic

import fs from 'fs'; import path from 'path'; // Lore chunk represents a searchable section of the document export interface LoreChunk { id: string; title: string; content: string; startLine: number; endLine: number; section: string; subsection?: string; } // Token entry maps tokens to chunks with frequency export interface TokenEntry { token: string; chunks: Array<{ chunkId: string; frequency: number; positions: number[]; // Character positions within chunk }>; totalFrequency: number; } // The main lore index structure export interface LoreIndex { chunks: Record<string, LoreChunk>; unigrams: Record<string, TokenEntry>; // Single words digraphs: Record<string, TokenEntry>; // Two-word phrases metadata: { totalChunks: number; totalTokens: number; totalDigraphs: number; lastUpdated: string; sourceFile: string; }; } // Search result with relevance scoring export interface SearchResult { chunk: LoreChunk; score: number; matchedTokens: string[]; context: string; // Snippet with highlighted matches } /** * Clean and normalize text for tokenization */ function cleanText(text: string): string { return text // Remove markdown formatting .replace(/\*\*(.*?)\*\*/g, '$1') // Bold .replace(/\*(.*?)\*/g, '$1') // Italic .replace(/`(.*?)`/g, '$1') // Code .replace(/\[(.*?)\]$.*?$/g, '$1') // Links // Normalize whitespace .replace(/\s+/g, ' ') .trim(); } /** * Tokenize text into clean words */ function tokenize(text: string): string[] { return cleanText(text) .toLowerCase() // Split on word boundaries, keeping apostrophes in contractions .split(/[^\w']+/) .filter(token => token.length > 1 && // Skip single characters !token.match(/^\d+$/) && // Skip pure numbers !['the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'].includes(token) // Skip common stop words ); } /** * Generate digraphs (two-word phrases) from tokens */ function generateDigraphs(tokens: string[]): string[] { const digraphs: string[] = []; for (let i = 0; i < tokens.length - 1; i++) { digraphs.push(`${tokens[i]} ${tokens[i + 1]}`); } return digraphs; } /** * Parse markdown into structured chunks */ function parseMarkdownIntoChunks(content: string): LoreChunk[] { const lines = content.split('\n'); const chunks: LoreChunk[] = []; let currentSection = ''; let currentSubsection = ''; let chunkStart = 0; let chunkContent: string[] = []; let chunkTitle = 'Document Start'; for (let i = 0; i < lines.length; i++) { const line = lines[i]; // Handle headers if (line.startsWith('##')) { // Finish previous chunk if it has content if (chunkContent.length > 0) { chunks.push({ id: `chunk_${chunks.length}`, title: chunkTitle, content: chunkContent.join('\n').trim(), startLine: chunkStart + 1, endLine: i, section: currentSection, subsection: currentSubsection }); } // Start new chunk if (line.startsWith('### ')) { currentSubsection = line.replace('### ', '').trim(); chunkTitle = currentSubsection; } else if (line.startsWith('## ')) { currentSection = line.replace('## ', '').trim(); currentSubsection = ''; chunkTitle = currentSection; } chunkStart = i; chunkContent = [line]; } else { chunkContent.push(line); } } // Don't forget the last chunk if (chunkContent.length > 0) { chunks.push({ id: `chunk_${chunks.length}`, title: chunkTitle, content: chunkContent.join('\n').trim(), startLine: chunkStart + 1, endLine: lines.length, section: currentSection, subsection: currentSubsection }); } return chunks; } /** * Find all positions of a token within text */ function findTokenPositions(text: string, token: string): number[] { const positions: number[] = []; const cleanedText = cleanText(text).toLowerCase(); const searchToken = token.toLowerCase(); let startIndex = 0; while (true) { const index = cleanedText.indexOf(searchToken, startIndex); if (index === -1) break; positions.push(index); startIndex = index + 1; } return positions; } /** * Build inverted index for tokens */ function buildTokenIndex(chunks: LoreChunk[], isDigraph: boolean = false): Record<string, TokenEntry> { const index: Record<string, TokenEntry> = {}; for (const chunk of chunks) { const tokens = tokenize(chunk.content); const tokensToProcess = isDigraph ? generateDigraphs(tokens) : tokens; // Count frequency of each token in this chunk const tokenFrequency: Record<string, number> = {}; for (const token of tokensToProcess) { tokenFrequency[token] = (tokenFrequency[token] || 0) + 1; } // Add to inverted index for (const [token, frequency] of Object.entries(tokenFrequency)) { if (!index[token]) { index[token] = { token, chunks: [], totalFrequency: 0 }; } const positions = findTokenPositions(chunk.content, token); index[token].chunks.push({ chunkId: chunk.id, frequency, positions }); index[token].totalFrequency += frequency; } } return index; } /** * Build complete lore index from markdown file */ export function buildLoreIndex(filePath: string): LoreIndex { const content = fs.readFileSync(filePath, 'utf-8'); const chunks = parseMarkdownIntoChunks(content); // Build chunk lookup const chunkLookup: Record<string, LoreChunk> = {}; for (const chunk of chunks) { chunkLookup[chunk.id] = chunk; } // Build token indexes const unigrams = buildTokenIndex(chunks, false); const digraphs = buildTokenIndex(chunks, true); return { chunks: chunkLookup, unigrams, digraphs, metadata: { totalChunks: chunks.length, totalTokens: Object.keys(unigrams).length, totalDigraphs: Object.keys(digraphs).length, lastUpdated: new Date().toISOString(), sourceFile: path.basename(filePath) } }; } /** * Calculate TF-IDF score for search relevance */ function calculateTfIdfScore( tokenEntry: TokenEntry, chunkId: string, totalChunks: number ): number { const chunkEntry = tokenEntry.chunks.find(c => c.chunkId === chunkId); if (!chunkEntry) return 0; // Term Frequency: frequency of term in document const tf = chunkEntry.frequency; // Inverse Document Frequency: log(total docs / docs containing term) const idf = Math.log(totalChunks / tokenEntry.chunks.length); return tf * idf; } /** * Search the lore index for relevant chunks */ export function searchLore( index: LoreIndex, query: string, limit: number = 10 ): SearchResult[] { const queryTokens = tokenize(query); const queryDigraphs = generateDigraphs(queryTokens); // Collect all relevant chunks with scores const chunkScores: Record<string, { score: number; matchedTokens: Set<string>; }> = {}; // Search unigrams for (const token of queryTokens) { const tokenEntry = index.unigrams[token]; if (tokenEntry) { for (const chunkEntry of tokenEntry.chunks) { if (!chunkScores[chunkEntry.chunkId]) { chunkScores[chunkEntry.chunkId] = { score: 0, matchedTokens: new Set() }; } const score = calculateTfIdfScore(tokenEntry, chunkEntry.chunkId, index.metadata.totalChunks); chunkScores[chunkEntry.chunkId].score += score; chunkScores[chunkEntry.chunkId].matchedTokens.add(token); } } } // Search digraphs (give them higher weight) for (const digraph of queryDigraphs) { const tokenEntry = index.digraphs[digraph]; if (tokenEntry) { for (const chunkEntry of tokenEntry.chunks) { if (!chunkScores[chunkEntry.chunkId]) { chunkScores[chunkEntry.chunkId] = { score: 0, matchedTokens: new Set() }; } const score = calculateTfIdfScore(tokenEntry, chunkEntry.chunkId, index.metadata.totalChunks) * 2; // Digraph bonus chunkScores[chunkEntry.chunkId].score += score; chunkScores[chunkEntry.chunkId].matchedTokens.add(digraph); } } } // Convert to search results and sort by score const results: SearchResult[] = Object.entries(chunkScores) .map(([chunkId, { score, matchedTokens }]) => { const chunk = index.chunks[chunkId]; const matchedArray = Array.from(matchedTokens); // Generate context snippet (first 200 chars with matches highlighted) let context = chunk.content.substring(0, 200); for (const token of matchedArray) { // Simple highlighting (could be improved) const regex = new RegExp(`\\b${token}\\b`, 'gi'); context = context.replace(regex, `**${token}**`); } if (chunk.content.length > 200) { context += '...'; } return { chunk, score, matchedTokens: matchedArray, context }; }) .sort((a, b) => b.score - a.score) .slice(0, limit); return results; } /** * Get lore index statistics for debugging */ export function getLoreStats(index: LoreIndex): any { const topUnigrams = Object.entries(index.unigrams) .sort((a, b) => b[1].totalFrequency - a[1].totalFrequency) .slice(0, 20) .map(([token, entry]) => ({ token, frequency: entry.totalFrequency })); const topDigraphs = Object.entries(index.digraphs) .sort((a, b) => b[1].totalFrequency - a[1].totalFrequency) .slice(0, 20) .map(([token, entry]) => ({ token, frequency: entry.totalFrequency })); return { metadata: index.metadata, topUnigrams, topDigraphs, sampleChunks: Object.values(index.chunks).slice(0, 3).map(chunk => ({ id: chunk.id, title: chunk.title, section: chunk.section, lineRange: `${chunk.startLine}-${chunk.endLine}`, contentPreview: chunk.content.substring(0, 100) + '...' })) }; }