@zerospacegg/anrubic
Version:
Anrubic - ZeroSpace.gg MCP Server for AI agents to access game data
376 lines (328 loc) • 10.2 kB
text/typescript
import fs from 'fs';
import path from 'path';
// Lore chunk represents a searchable section of the document
export interface LoreChunk {
id: string;
title: string;
content: string;
startLine: number;
endLine: number;
section: string;
subsection?: string;
}
// Token entry maps tokens to chunks with frequency
export interface TokenEntry {
token: string;
chunks: Array<{
chunkId: string;
frequency: number;
positions: number[]; // Character positions within chunk
}>;
totalFrequency: number;
}
// The main lore index structure
export interface LoreIndex {
chunks: Record<string, LoreChunk>;
unigrams: Record<string, TokenEntry>; // Single words
digraphs: Record<string, TokenEntry>; // Two-word phrases
metadata: {
totalChunks: number;
totalTokens: number;
totalDigraphs: number;
lastUpdated: string;
sourceFile: string;
};
}
// Search result with relevance scoring
export interface SearchResult {
chunk: LoreChunk;
score: number;
matchedTokens: string[];
context: string; // Snippet with highlighted matches
}
/**
* Clean and normalize text for tokenization
*/
function cleanText(text: string): string {
return text
// Remove markdown formatting
.replace(/\*\*(.*?)\*\*/g, '$1') // Bold
.replace(/\*(.*?)\*/g, '$1') // Italic
.replace(/`(.*?)`/g, '$1') // Code
.replace(/\[(.*?)\]\(.*?\)/g, '$1') // Links
// Normalize whitespace
.replace(/\s+/g, ' ')
.trim();
}
/**
* Tokenize text into clean words
*/
function tokenize(text: string): string[] {
return cleanText(text)
.toLowerCase()
// Split on word boundaries, keeping apostrophes in contractions
.split(/[^\w']+/)
.filter(token =>
token.length > 1 && // Skip single characters
!token.match(/^\d+$/) && // Skip pure numbers
!['the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'].includes(token) // Skip common stop words
);
}
/**
* Generate digraphs (two-word phrases) from tokens
*/
function generateDigraphs(tokens: string[]): string[] {
const digraphs: string[] = [];
for (let i = 0; i < tokens.length - 1; i++) {
digraphs.push(`${tokens[i]} ${tokens[i + 1]}`);
}
return digraphs;
}
/**
* Parse markdown into structured chunks
*/
function parseMarkdownIntoChunks(content: string): LoreChunk[] {
const lines = content.split('\n');
const chunks: LoreChunk[] = [];
let currentSection = '';
let currentSubsection = '';
let chunkStart = 0;
let chunkContent: string[] = [];
let chunkTitle = 'Document Start';
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
// Handle headers
if (line.startsWith('##')) {
// Finish previous chunk if it has content
if (chunkContent.length > 0) {
chunks.push({
id: `chunk_${chunks.length}`,
title: chunkTitle,
content: chunkContent.join('\n').trim(),
startLine: chunkStart + 1,
endLine: i,
section: currentSection,
subsection: currentSubsection
});
}
// Start new chunk
if (line.startsWith('### ')) {
currentSubsection = line.replace('### ', '').trim();
chunkTitle = currentSubsection;
} else if (line.startsWith('## ')) {
currentSection = line.replace('## ', '').trim();
currentSubsection = '';
chunkTitle = currentSection;
}
chunkStart = i;
chunkContent = [line];
} else {
chunkContent.push(line);
}
}
// Don't forget the last chunk
if (chunkContent.length > 0) {
chunks.push({
id: `chunk_${chunks.length}`,
title: chunkTitle,
content: chunkContent.join('\n').trim(),
startLine: chunkStart + 1,
endLine: lines.length,
section: currentSection,
subsection: currentSubsection
});
}
return chunks;
}
/**
* Find all positions of a token within text
*/
function findTokenPositions(text: string, token: string): number[] {
const positions: number[] = [];
const cleanedText = cleanText(text).toLowerCase();
const searchToken = token.toLowerCase();
let startIndex = 0;
while (true) {
const index = cleanedText.indexOf(searchToken, startIndex);
if (index === -1) break;
positions.push(index);
startIndex = index + 1;
}
return positions;
}
/**
* Build inverted index for tokens
*/
function buildTokenIndex(chunks: LoreChunk[], isDigraph: boolean = false): Record<string, TokenEntry> {
const index: Record<string, TokenEntry> = {};
for (const chunk of chunks) {
const tokens = tokenize(chunk.content);
const tokensToProcess = isDigraph ? generateDigraphs(tokens) : tokens;
// Count frequency of each token in this chunk
const tokenFrequency: Record<string, number> = {};
for (const token of tokensToProcess) {
tokenFrequency[token] = (tokenFrequency[token] || 0) + 1;
}
// Add to inverted index
for (const [token, frequency] of Object.entries(tokenFrequency)) {
if (!index[token]) {
index[token] = {
token,
chunks: [],
totalFrequency: 0
};
}
const positions = findTokenPositions(chunk.content, token);
index[token].chunks.push({
chunkId: chunk.id,
frequency,
positions
});
index[token].totalFrequency += frequency;
}
}
return index;
}
/**
* Build complete lore index from markdown file
*/
export function buildLoreIndex(filePath: string): LoreIndex {
const content = fs.readFileSync(filePath, 'utf-8');
const chunks = parseMarkdownIntoChunks(content);
// Build chunk lookup
const chunkLookup: Record<string, LoreChunk> = {};
for (const chunk of chunks) {
chunkLookup[chunk.id] = chunk;
}
// Build token indexes
const unigrams = buildTokenIndex(chunks, false);
const digraphs = buildTokenIndex(chunks, true);
return {
chunks: chunkLookup,
unigrams,
digraphs,
metadata: {
totalChunks: chunks.length,
totalTokens: Object.keys(unigrams).length,
totalDigraphs: Object.keys(digraphs).length,
lastUpdated: new Date().toISOString(),
sourceFile: path.basename(filePath)
}
};
}
/**
* Calculate TF-IDF score for search relevance
*/
function calculateTfIdfScore(
tokenEntry: TokenEntry,
chunkId: string,
totalChunks: number
): number {
const chunkEntry = tokenEntry.chunks.find(c => c.chunkId === chunkId);
if (!chunkEntry) return 0;
// Term Frequency: frequency of term in document
const tf = chunkEntry.frequency;
// Inverse Document Frequency: log(total docs / docs containing term)
const idf = Math.log(totalChunks / tokenEntry.chunks.length);
return tf * idf;
}
/**
* Search the lore index for relevant chunks
*/
export function searchLore(
index: LoreIndex,
query: string,
limit: number = 10
): SearchResult[] {
const queryTokens = tokenize(query);
const queryDigraphs = generateDigraphs(queryTokens);
// Collect all relevant chunks with scores
const chunkScores: Record<string, {
score: number;
matchedTokens: Set<string>;
}> = {};
// Search unigrams
for (const token of queryTokens) {
const tokenEntry = index.unigrams[token];
if (tokenEntry) {
for (const chunkEntry of tokenEntry.chunks) {
if (!chunkScores[chunkEntry.chunkId]) {
chunkScores[chunkEntry.chunkId] = {
score: 0,
matchedTokens: new Set()
};
}
const score = calculateTfIdfScore(tokenEntry, chunkEntry.chunkId, index.metadata.totalChunks);
chunkScores[chunkEntry.chunkId].score += score;
chunkScores[chunkEntry.chunkId].matchedTokens.add(token);
}
}
}
// Search digraphs (give them higher weight)
for (const digraph of queryDigraphs) {
const tokenEntry = index.digraphs[digraph];
if (tokenEntry) {
for (const chunkEntry of tokenEntry.chunks) {
if (!chunkScores[chunkEntry.chunkId]) {
chunkScores[chunkEntry.chunkId] = {
score: 0,
matchedTokens: new Set()
};
}
const score = calculateTfIdfScore(tokenEntry, chunkEntry.chunkId, index.metadata.totalChunks) * 2; // Digraph bonus
chunkScores[chunkEntry.chunkId].score += score;
chunkScores[chunkEntry.chunkId].matchedTokens.add(digraph);
}
}
}
// Convert to search results and sort by score
const results: SearchResult[] = Object.entries(chunkScores)
.map(([chunkId, { score, matchedTokens }]) => {
const chunk = index.chunks[chunkId];
const matchedArray = Array.from(matchedTokens);
// Generate context snippet (first 200 chars with matches highlighted)
let context = chunk.content.substring(0, 200);
for (const token of matchedArray) {
// Simple highlighting (could be improved)
const regex = new RegExp(`\\b${token}\\b`, 'gi');
context = context.replace(regex, `**${token}**`);
}
if (chunk.content.length > 200) {
context += '...';
}
return {
chunk,
score,
matchedTokens: matchedArray,
context
};
})
.sort((a, b) => b.score - a.score)
.slice(0, limit);
return results;
}
/**
* Get lore index statistics for debugging
*/
export function getLoreStats(index: LoreIndex): any {
const topUnigrams = Object.entries(index.unigrams)
.sort((a, b) => b[1].totalFrequency - a[1].totalFrequency)
.slice(0, 20)
.map(([token, entry]) => ({ token, frequency: entry.totalFrequency }));
const topDigraphs = Object.entries(index.digraphs)
.sort((a, b) => b[1].totalFrequency - a[1].totalFrequency)
.slice(0, 20)
.map(([token, entry]) => ({ token, frequency: entry.totalFrequency }));
return {
metadata: index.metadata,
topUnigrams,
topDigraphs,
sampleChunks: Object.values(index.chunks).slice(0, 3).map(chunk => ({
id: chunk.id,
title: chunk.title,
section: chunk.section,
lineRange: `${chunk.startLine}-${chunk.endLine}`,
contentPreview: chunk.content.substring(0, 100) + '...'
}))
};
}