UNPKG

permamind

Version:

An MCP server that provides an immortal memory layer for AI agents and clients

719 lines (718 loc) 27.1 kB
// Removed unused import

/** Chunk size (in characters) used when the configured value is missing or unusable. */
const DEFAULT_CHUNK_SIZE = 2000;

/**
 * A semantic boundary is only preferred over a lower-priority one when it
 * leaves the chunk at least this full; otherwise an early paragraph break
 * would produce many tiny chunks.
 */
const MIN_CHUNK_FILL_RATIO = 0.5;

/** Split separators in priority order: paragraph break, sentence end, word gap. */
const SPLIT_SEPARATORS = ["\n\n", ". ", " "];

/** Search strategies in the order `query` tries them. */
const SEARCH_STRATEGIES = ["standard", "expanded", "broad", "relaxed"];

const DOC_SOURCES = [
  {
    description: "Arweave ecosystem development guides",
    domain: "arweave",
    keywords: {
      primary: [
        "arweave",
        "permaweb",
        "smartweave",
        "graphql",
        "transaction",
        "wallet",
        "bundling",
        "arfs",
        "arns",
      ],
      secondary: [
        "permanent",
        "storage",
        "blockchain",
        "decentralized",
        "ar",
        "winston",
        "pst",
        "profit sharing",
      ],
      technical: [
        "warp",
        "arweave-js",
        "ardrive",
        "arkb",
        "irys",
        "bundlr",
        "vouch",
        "smartweave contract",
      ],
    },
    url: "https://fuel_permawebllms.permagate.io/arweave-llms.txt",
  },
  {
    description: "AO computer system documentation",
    domain: "ao",
    keywords: {
      primary: [
        "ao",
        "process",
        "message",
        "lua",
        "aos",
        "spawn",
        "scheduler",
        "autonomous",
      ],
      secondary: [
        "actor",
        "hyper parallel",
        "computing",
        "decentralized",
        "holographic",
        "supercomputer",
      ],
      technical: [
        "aoconnect",
        "betteridea",
        "hyperbeam",
        "wasm",
        "module",
        "cron",
        "handler",
      ],
    },
    url: "https://fuel_permawebllms.permagate.io/ao-llms.txt",
  },
  {
    description: "AR.IO ecosystem infrastructure",
    domain: "ario",
    keywords: {
      primary: [
        "ar.io",
        "gateway",
        "arns",
        "wayfinder",
        "hosting",
        "deployment",
      ],
      secondary: [
        "permaweb",
        "decentralized",
        "web3",
        "infrastructure",
        "indexing",
        "resolver",
      ],
      technical: [
        "deploy",
        "archive",
        "content",
        "protocol",
        "node",
        "self-hosted",
        "configuration",
      ],
    },
    url: "https://fuel_permawebllms.permagate.io/ario-llms.txt",
  },
  {
    description: "HyperBEAM decentralized computing implementation",
    domain: "hyperbeam",
    keywords: {
      primary: [
        "hyperbeam",
        "device",
        "wasm",
        "erlang",
        "distributed",
        "computation",
      ],
      secondary: [
        "concurrent",
        "fault tolerant",
        "scalable",
        "trustless",
        "verifiable",
        "modular",
      ],
      technical: [
        "tee",
        "trusted execution",
        "pipeline",
        "http api",
        "composable",
        "performance",
      ],
    },
    url: "https://fuel_permawebllms.permagate.io/hyperbeam-llms.txt",
  },
  {
    description: "Comprehensive Permaweb glossary",
    domain: "permaweb-glossary",
    keywords: {
      primary: [
        "what is",
        "define",
        "definition",
        "explain",
        "glossary",
        "terminology",
        "meaning",
      ],
      secondary: [
        "concept",
        "understand",
        "basic",
        "introduction",
        "overview",
        "guide",
      ],
      technical: [
        "blockchain",
        "token",
        "economics",
        "cryptographic",
        "verification",
        "distributed",
      ],
    },
    url: "https://fuel_permawebllms.permagate.io/permaweb-glossary-llms.txt",
  },
  {
    description: "WAO documentation",
    domain: "wao",
    keywords: {
      primary: [
        "wao",
        "hyperbeam",
        "devices",
        "codec",
        "hashpath",
        "ao unit",
        "distributed computing",
        "message routing",
      ],
      secondary: [
        "encoding",
        "decoding",
        "tabm",
        "type annotated binary message",
        "testing framework",
        "in-memory",
        "verification",
        "provenance",
      ],
      technical: [
        "flat@1.0",
        "structured@1.0",
        "httpsig@1.0",
        "erlang",
        "wasm",
        "nif",
        "graphql",
        "javascript sdk",
        "memory forking",
        "custom device",
      ],
    },
    url: "https://permaweb-llm-fuel.vercel.app/wao-llms.txt",
  },
];

/**
 * Lower-case a query and split it on whitespace, dropping empty tokens.
 * Empty tokens (from leading/trailing whitespace or a blank query) must be
 * removed because `"anything".includes("")` is always true.
 * @param {string} query
 * @returns {string[]}
 */
function tokenizeQuery(query) {
  return String(query ?? "")
    .toLowerCase()
    .split(/\s+/)
    .filter(Boolean);
}

/**
 * Coerce a configured chunk size into a usable positive number.
 * NaN, zero and negative values fall back to DEFAULT_CHUNK_SIZE; a zero or
 * negative size would otherwise make the size-based splitter loop forever.
 * @param {unknown} value
 * @returns {number}
 */
function normalizeChunkSize(value) {
  const size = Math.floor(Number(value));
  return size > 0 ? size : DEFAULT_CHUNK_SIZE;
}

/**
 * Look up the DOC_SOURCES entry for a domain.
 * @param {string} domain
 * @returns {object | undefined}
 */
function findSource(domain) {
  return DOC_SOURCES.find((source) => source.domain === domain);
}

/**
 * Choose where to cut `head` (the first chunk-size characters of the text).
 * Separators are tried in priority order; the first whose last occurrence
 * ends at or after `minOffset` wins. If none reaches `minOffset`, the
 * furthest boundary found is used, and if there is no boundary at all the
 * cut is made at the end of `head`.
 * @param {string} head
 * @param {number} minOffset
 * @returns {number} Offset (>= 1) at which to cut.
 */
function findSplitOffset(head, minOffset) {
  let furthest = -1;
  for (const separator of SPLIT_SEPARATORS) {
    const index = head.lastIndexOf(separator);
    if (index === -1) {
      continue;
    }
    const offset = index + separator.length;
    if (offset >= minOffset) {
      return offset;
    }
    furthest = Math.max(furthest, offset);
  }
  return furthest === -1 ? head.length : furthest;
}

/**
 * Fetches, caches, chunks and searches the llms.txt documents listed in
 * DOC_SOURCES.
 */
export class PermawebDocs {
  cache = new Map();
  cacheMaxAge = 24 * 60 * 60 * 1000; // 24 hours
  chunkSize = normalizeChunkSize(
    Number.parseInt(process.env.CONTEXT_CHUNK_SIZE || "2000", 10),
  );
  debugMode = process.env.DEBUG === "true";
  defaultMaxResults = 20;
  fetchTimeoutMs = 30000; // 30 seconds
  relevanceThreshold = 2;
  tokensPerChar = 0.25; // Rough estimate: 4 chars ≈ 1 token

  /**
   * Helper for tests: extract unique domains from results.
   * @param {Array<{domain: string}>} results
   * @returns {string[]} Domains in first-seen order.
   */
  static extractDomains(results) {
    return [...new Set(results.map((result) => result.domain))];
  }

  /**
   * Clear cached documentation.
   * @param {string} [domain] Domain to evict; when omitted the whole cache is cleared.
   */
  clearCache(domain) {
    if (domain) {
      this.cache.delete(domain);
    } else {
      this.cache.clear();
    }
  }

  /**
   * Estimate total response size in tokens for results.
   * @param {Array<{content: string}>} results
   * @returns {number}
   */
  estimateResponseTokens(results) {
    return results.reduce(
      (total, result) => total + this.estimateTokens(result.content),
      0,
    );
  }

  /**
   * Estimate token count for text content.
   * @param {string} text
   * @returns {number} `text.length * tokensPerChar`, rounded up.
   */
  estimateTokens(text) {
    return Math.ceil(text.length * this.tokensPerChar);
  }

  /**
   * Get available documentation domains.
   * @returns {string[]} Domains in DOC_SOURCES order.
   */
  getAvailableDomains() {
    return DOC_SOURCES.map((source) => source.domain);
  }

  /**
   * Get cache status for all domains.
   * @returns {Object<string, {loaded: boolean, age?: number}>} `age` is the
   *   number of milliseconds since the cached copy was fetched.
   */
  getCacheStatus() {
    const status = {};
    for (const domain of this.getAvailableDomains()) {
      const cached = this.cache.get(domain);
      status[domain] = cached
        ? { age: Date.now() - cached.fetchedAt.getTime(), loaded: true }
        : { loaded: false };
    }
    return status;
  }

  /**
   * Check if documentation is loaded and fresh.
   * @param {string} domain
   * @returns {boolean} True when a cached copy exists and is younger than `cacheMaxAge`.
   */
  isDocLoaded(domain) {
    const cached = this.cache.get(domain);
    if (!cached) {
      return false;
    }
    return Date.now() - cached.fetchedAt.getTime() < this.cacheMaxAge;
  }

  /**
   * Preload documentation for specific domains.
   * Loading failures do not reject; see `ensureDocsLoaded`.
   * @param {string[]} [domains] Defaults to every available domain.
   */
  async preload(domains = this.getAvailableDomains()) {
    await this.ensureDocsLoaded(domains);
  }

  /**
   * Query Permaweb documentation and return the most relevant chunks.
   * Strategies are tried in order (standard, expanded, broad, relaxed) and the
   * first non-empty result set is returned; if all are empty, the (empty)
   * result of the last strategy is returned.
   * @param {string} query
   * @param {string[]} [requestedDomains]
   * @param {number} [maxResults]
   * @returns {Promise<Array<object>>}
   */
  async query(query, requestedDomains, maxResults = this.defaultMaxResults) {
    // A blank query has nothing to match; skip loading any documents.
    if (tokenizeQuery(query).length === 0) {
      return [];
    }
    let results = [];
    for (const strategy of SEARCH_STRATEGIES) {
      results = await this.executeSearchStrategy(
        query,
        requestedDomains,
        maxResults,
        strategy,
      );
      if (results.length > 0) {
        return results;
      }
    }
    return results;
  }

  /**
   * Calculate relevance of a chunk for a query and domain.
   * Each query word found in the chunk adds 2; each of the domain's keywords
   * found in the chunk adds 1. An unknown domain contributes no keyword score.
   * @param {string} query
   * @param {string} chunk
   * @param {string} domain
   * @returns {number}
   */
  calculateChunkRelevance(query, chunk, domain) {
    const content = chunk.toLowerCase();
    let score = 0;
    for (const word of tokenizeQuery(query)) {
      if (content.includes(word)) {
        score += 2;
      }
    }
    const source = findSource(domain);
    if (!source) {
      return score;
    }
    const allKeywords = [
      ...source.keywords.primary,
      ...source.keywords.secondary,
      ...source.keywords.technical,
    ];
    for (const keyword of allKeywords) {
      if (content.includes(keyword)) {
        score += 1;
      }
    }
    return score;
  }

  /**
   * Split content into size-constrained chunks while preserving semantic
   * boundaries. Paragraph breaks are preferred over sentence ends, which are
   * preferred over word gaps, provided the resulting chunk is at least
   * MIN_CHUNK_FILL_RATIO of the chunk size.
   * @param {string} content The content to chunk
   * @returns {string[]} Trimmed, non-empty chunks, each at most the chunk size
   */
  chunkBySizeAndSemantics(content) {
    const limit = normalizeChunkSize(this.chunkSize);
    if (content.length <= limit) {
      return [content];
    }
    const minOffset = Math.floor(limit * MIN_CHUNK_FILL_RATIO);
    const chunks = [];
    let remaining = content;
    while (remaining.length > limit) {
      const cut = findSplitOffset(remaining.substring(0, limit), minOffset);
      const chunk = remaining.substring(0, cut).trim();
      if (chunk) {
        chunks.push(chunk);
      }
      // `cut` is always >= 1, so `remaining` shrinks on every iteration.
      remaining = remaining.substring(cut).trim();
    }
    if (remaining) {
      chunks.push(remaining);
    }
    return chunks;
  }

  /**
   * Split documentation content into logical chunks by domain with size
   * constraints.
   * @param {string} domain The documentation domain
   * @param {string} content The full document content
   * @returns {string[]} Array of chunked content strings
   */
  chunkContent(domain, content) {
    // Glossary entries are separated by blank lines; other documents use
    // '---' delimiter lines.
    const delimiter = domain === "permaweb-glossary" ? /\n{2,}/ : /^---+$/m;
    const initialChunks = content
      .split(delimiter)
      .map((section) => section.trim())
      .filter(Boolean);

    const limit = normalizeChunkSize(this.chunkSize);
    const finalChunks = [];
    for (const chunk of initialChunks) {
      if (chunk.length <= limit) {
        finalChunks.push(chunk);
      } else {
        finalChunks.push(...this.chunkBySizeAndSemantics(chunk));
      }
    }
    return finalChunks;
  }

  /**
   * Rank documentation domains by how well their keywords match the query.
   * Every domain receives a small base score so none is excluded. When the
   * best score is at least 3 the top three domains are returned; otherwise
   * all domains are returned, best first.
   * @param {string} query
   * @returns {string[]}
   */
  detectRelevantDomains(query) {
    const loweredQuery = String(query ?? "").toLowerCase();
    const words = tokenizeQuery(query);
    const domainScores = new Map();

    for (const source of DOC_SOURCES) {
      const weighted = [
        ...source.keywords.primary.map((k) => ({ keyword: k.toLowerCase(), weight: 3 })),
        ...source.keywords.secondary.map((k) => ({ keyword: k.toLowerCase(), weight: 2 })),
        ...source.keywords.technical.map((k) => ({ keyword: k.toLowerCase(), weight: 2 })),
      ];
      let score = 0;

      // Keyword appears in the query, or a query word appears in the keyword.
      for (const { keyword, weight } of weighted) {
        if (
          loweredQuery.includes(keyword) ||
          words.some((word) => keyword.includes(word))
        ) {
          score += weight;
        }
      }

      // Fuzzy matching: first three characters of each query word.
      for (const word of words) {
        if (word.length < 3) {
          continue;
        }
        const prefix = word.substring(0, 3);
        for (const { keyword, weight } of weighted) {
          if (keyword.includes(prefix)) {
            score += weight * 0.3; // Reduced weight for fuzzy matches
          }
        }
      }

      // Base score so no domain is completely excluded.
      score += 0.1;
      domainScores.set(source.domain, score);
    }

    const sortedDomains = [...domainScores.entries()]
      .sort(([, a], [, b]) => b - a)
      .map(([domain]) => domain);

    const maxScore = Math.max(...domainScores.values());
    const confidenceThreshold = 3; // At least one primary keyword match
    return maxScore >= confidenceThreshold
      ? sortedDomains.slice(0, 3)
      : sortedDomains;
  }

  /**
   * Load every requested domain that is not already cached and fresh.
   * Never rejects on load failure: failures are logged with `console.warn`
   * when `debugMode` is on and otherwise ignored, so callers continue with
   * whatever is in the cache.
   * @param {string[]} domains
   */
  async ensureDocsLoaded(domains) {
    const domainsToLoad = domains.filter((domain) => !this.isDocLoaded(domain));
    if (domainsToLoad.length === 0) {
      return;
    }
    const outcomes = await Promise.allSettled(
      domainsToLoad.map((domain) => this.loadDocumentationWithRetry(domain)),
    );
    outcomes.forEach((outcome, index) => {
      if (outcome.status === "rejected" && this.debugMode) {
        console.warn(
          `[PermawebDocs] Failed to load ${domainsToLoad[index]}: ${outcome.reason?.message || "Unknown error"}`,
        );
      }
    });
  }

  /**
   * Execute a specific search strategy.
   * - "standard": requested or detected domains, original query.
   * - "expanded": requested or detected domains, query widened by `expandQuery`.
   * - "broad": all available domains, original query.
   * - "relaxed": all available domains, lowered threshold, prefix matching.
   * @param {string} query
   * @param {string[] | undefined} requestedDomains
   * @param {number} maxResults
   * @param {"standard" | "expanded" | "broad" | "relaxed"} strategy
   * @returns {Promise<Array<{content: string, domain: string, isFullDocument: boolean, relevanceScore: number, url: string}>>}
   *   Matches sorted by descending relevance, at most `maxResults` long.
   * @throws {Error} If `strategy` is not one of the four known names.
   */
  async executeSearchStrategy(query, requestedDomains, maxResults, strategy) {
    let domains;
    let searchQuery = query;
    let threshold = this.relevanceThreshold;

    switch (strategy) {
      case "broad":
        domains = this.getAvailableDomains();
        break;
      case "expanded":
        domains = this.getSearchDomains(query, requestedDomains);
        searchQuery = this.expandQuery(query);
        break;
      case "relaxed":
        domains = this.getAvailableDomains();
        threshold = Math.max(1, this.relevanceThreshold - 2); // Lower threshold
        break;
      case "standard":
        domains = this.getSearchDomains(query, requestedDomains);
        break;
      default:
        throw new Error(`Unknown search strategy: ${strategy}`);
    }

    if (this.debugMode) {
      console.log(
        `[PermawebDocs] Trying ${strategy} search strategy with domains: ${domains.join(", ")}`,
      );
    }

    await this.ensureDocsLoaded(domains);

    // Computed once: the strings a chunk must contain at least one of.
    // The relaxed strategy matches on the first (up to) four characters of
    // words that are three or more characters long.
    const probes = tokenizeQuery(searchQuery).map((word) =>
      strategy === "relaxed" && word.length >= 3
        ? word.substring(0, Math.min(word.length, 4))
        : word,
    );

    const results = [];
    for (const domain of domains) {
      const cached = this.cache.get(domain);
      const source = findSource(domain);
      if (!cached || !source) {
        continue;
      }
      // A cached copy that is no longer fresh is still used as a fallback
      // when reloading failed.
      if (this.debugMode && !this.isDocLoaded(domain)) {
        console.log(
          `[PermawebDocs] Using potentially stale cached content for ${domain}`,
        );
      }

      for (const chunk of this.chunkContent(domain, cached.content)) {
        const loweredChunk = chunk.toLowerCase();
        if (!probes.some((probe) => loweredChunk.includes(probe))) {
          continue;
        }
        const relevanceScore = this.calculateChunkRelevance(
          searchQuery,
          chunk,
          domain,
        );
        if (relevanceScore >= threshold) {
          results.push({
            content: chunk,
            domain,
            isFullDocument: false,
            relevanceScore,
            url: source.url,
          });
        }
      }
    }

    return results
      .sort((a, b) => b.relevanceScore - a.relevanceScore)
      .slice(0, maxResults);
  }

  /**
   * Expand query with synonyms and related terms.
   * For each query word with an entry in the expansion table, the entry is
   * appended to the original query, separated by a space.
   * @param {string} originalQuery
   * @returns {string}
   */
  expandQuery(originalQuery) {
    const expansions = new Map([
      ["ao", "ao computer autonomous objects processes"],
      ["architecture", "architecture design structure implementation"],
      ["ario", "ar.io gateway infrastructure hosting"],
      ["arweave", "arweave permaweb blockchain permanent storage"],
      ["benefits", "benefits advantages pros features capabilities"],
      ["codec", "codec encoding decoding tabm flat structured httpsig"],
      ["deployment", "deployment deploy hosting publishing"],
      ["development", "development dev building creating implementation"],
      // Technical computing terms
      ["devices", "devices codec hyperbeam wao modular computational"],
      ["encoding", "encoding decoding message codec tabm binary"],
      ["gateway", "gateway node infrastructure ar.io"],
      ["hashpath", "hashpath verification provenance chained hashes"],
      // Technology synonyms
      ["hyperbeam", "hyperbeam distributed computing wasm erlang"],
      // Concept expansions
      ["migrate", "migrate migration move transition switch"],
      ["nif", "nif erlang native implemented functions wasm"],
      ["process", "process autonomous object computation"],
      ["testing", "testing framework in-memory ao unit emulation"],
      // Common permaweb terms
      ["token", "token cryptocurrency digital asset pst"],
      ["wallet", "wallet arweave key management"],
      ["wao", "wao hyperbeam devices codec hashpath distributed computing"],
    ]);

    let expandedQuery = originalQuery;
    for (const word of tokenizeQuery(originalQuery)) {
      const extra = expansions.get(word);
      if (extra !== undefined) {
        expandedQuery += ` ${extra}`;
      }
    }
    return expandedQuery;
  }

  /**
   * Get search domains based on query and requested domains.
   * Explicitly requested domains win (unknown ones are dropped); otherwise
   * domains are detected from the query, and the glossary is added for
   * definition-style queries.
   * @param {string} query
   * @param {string[]} [requestedDomains]
   * @returns {string[]}
   */
  getSearchDomains(query, requestedDomains) {
    if (requestedDomains && requestedDomains.length > 0) {
      const available = new Set(this.getAvailableDomains());
      return requestedDomains.filter((domain) => available.has(domain));
    }
    const domains = this.detectRelevantDomains(query);
    const isDefinitionQuery =
      /what is|define|definition|glossary|meaning|explain/i.test(query);
    if (isDefinitionQuery && !domains.includes("permaweb-glossary")) {
      domains.push("permaweb-glossary");
    }
    return domains;
  }

  /**
   * Fetch one domain's document and store it in the cache.
   * The request is aborted after `fetchTimeoutMs`.
   * @param {string} domain
   * @throws {Error} For an unknown domain, and (wrapped as
   *   "Failed to load <domain> documentation: ...") for timeouts, non-OK HTTP
   *   responses, empty bodies, or content that cannot be chunked.
   */
  async loadDocumentation(domain) {
    const source = findSource(domain);
    if (!source) {
      throw new Error(`Unknown domain: ${domain}`);
    }

    const abortController = new AbortController();
    const timeoutId = setTimeout(() => {
      abortController.abort();
    }, this.fetchTimeoutMs);

    try {
      const response = await fetch(source.url, {
        signal: abortController.signal,
      });
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }
      const content = await response.text();
      if (!content || content.trim().length === 0) {
        throw new Error("Empty content received");
      }

      const contentSizeMB = content.length / (1024 * 1024);
      if (this.debugMode && contentSizeMB > 5) {
        console.warn(
          `Large documentation file for ${domain}: ${contentSizeMB.toFixed(2)}MB`,
        );
      }

      // Validate on a prefix that the content can be chunked before caching it.
      try {
        const testChunks = this.chunkContent(
          domain,
          content.substring(0, Math.min(content.length, 10000)),
        );
        if (testChunks.length === 0) {
          throw new Error("Content chunking produced no results");
        }
      } catch (chunkError) {
        throw new Error(
          `Content chunking failed: ${chunkError instanceof Error ? chunkError.message : "Unknown chunking error"}`,
          { cause: chunkError },
        );
      }

      this.cache.set(domain, {
        content,
        fetchedAt: new Date(),
      });

      if (this.debugMode) {
        const chunkCount = this.chunkContent(domain, content).length;
        console.log(
          `Successfully loaded ${domain}: ${chunkCount} chunks from ${contentSizeMB.toFixed(2)}MB`,
        );
      }
    } catch (error) {
      if (error instanceof Error && error.name === "AbortError") {
        throw new Error(
          `Failed to load ${domain} documentation: Request timed out after ${this.fetchTimeoutMs}ms`,
          { cause: error },
        );
      }
      throw new Error(
        `Failed to load ${domain} documentation: ${error instanceof Error ? error.message : "Unknown error"}`,
        { cause: error },
      );
    } finally {
      clearTimeout(timeoutId);
    }
  }

  /**
   * Load documentation with retry logic for better reliability.
   * Waits 1s, 2s, 4s, ... between attempts. Errors whose message contains
   * "timed out after" or "terminated" are not retried.
   * @param {string} domain
   * @param {number} [maxRetries=2] Retries after the first attempt.
   * @throws {Error} When loading fails; the message reports how many attempts
   *   were actually made and the last error is attached as `cause`.
   */
  async loadDocumentationWithRetry(domain, maxRetries = 2) {
    let lastError = new Error("Unknown error");
    let attemptsMade = 0;
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      attemptsMade = attempt + 1;
      try {
        await this.loadDocumentation(domain);
        return;
      } catch (error) {
        lastError = error instanceof Error ? error : new Error(String(error));
        // Timeouts and terminated connections are likely to fail again.
        const isTimeout = lastError.message.includes("timed out after");
        const isTerminated = lastError.message.includes("terminated");
        if (attempt >= maxRetries || isTimeout || isTerminated) {
          break;
        }
        const delayMs = Math.pow(2, attempt) * 1000;
        await new Promise((resolve) => setTimeout(resolve, delayMs));
      }
    }
    throw new Error(
      `Failed to load ${domain} after ${attemptsMade} attempt${attemptsMade === 1 ? "" : "s"}: ${lastError.message}`,
      { cause: lastError },
    );
  }
}

export const permawebDocs = new PermawebDocs();