UNPKG

@afterxleep/doc-bot

Version:

Generic MCP server for intelligent documentation access in any project

503 lines (414 loc) 15.6 kB
import fs from 'fs-extra';
import path from 'path';
import { glob } from 'glob';
import yaml from 'yaml';

/**
 * In-memory index of the markdown documents found under a docs directory.
 *
 * Documents are keyed by their path relative to `docsPath` and carry their
 * parsed YAML frontmatter as `metadata`. The service offers keyword search,
 * "always apply" lookups, and file-pattern based contextual lookups.
 */
class DocumentationService {
  /**
   * @param {string} docsPath - Directory that is scanned for documents.
   * @param {object|null} [manifestLoader=null] - Stored but not used by this class.
   */
  constructor(docsPath, manifestLoader = null) {
    this.docsPath = docsPath;
    this.manifestLoader = manifestLoader; // Keep for backward compatibility but not required
    this.documents = new Map();
    this.lastScanned = null;
  }

  /** Perform the initial scan of `docsPath`. */
  async initialize() {
    await this.loadDocuments();
  }

  /** Drop every cached document and scan `docsPath` again. */
  async reload() {
    this.documents.clear();
    this.lastScanned = null;
    await this.loadDocuments();
  }

  /**
   * Scan `docsPath` recursively for `.md`, `.mdx` and `.mdc` files and load
   * each one into `this.documents`. Errors are logged, never thrown.
   */
  async loadDocuments() {
    try {
      if (!await fs.pathExists(this.docsPath)) {
        // Silently return if path doesn't exist - this is normal for MCP servers
        return;
      }

      // glob patterns must use forward slashes: path.join emits backslashes on
      // Windows, which glob would read as escape characters. On POSIX
      // path.sep is already '/', so this normalisation is a no-op there.
      const pattern = path
        .join(this.docsPath, '**/*.{md,mdx,mdc}')
        .split(path.sep)
        .join('/');

      // Use explicit glob options to ensure recursive scanning
      const files = glob.sync(pattern, {
        absolute: true, // Return absolute paths
        dot: true, // Include hidden files/folders
        follow: true, // Follow symbolic links
        nodir: true, // Only return files, not directories
        ignore: ['**/node_modules/**', '**/.git/**'] // Ignore common non-doc folders
      });

      // Log scanning info if verbose mode is enabled
      if (process.env.DOC_BOT_VERBOSE === 'true') {
        console.log(`[DocumentationService] Scanning pattern: ${pattern}`);
        console.log(`[DocumentationService] Found ${files.length} document(s)`);
      }

      // Loaded one at a time so the Map keeps the order glob returned.
      for (const filePath of files) {
        await this.loadDocument(filePath);
      }

      this.lastScanned = new Date();

      // Log final loaded count if verbose
      if (process.env.DOC_BOT_VERBOSE === 'true') {
        console.log(`[DocumentationService] Successfully loaded ${this.documents.size} document(s)`);
      }
    } catch (error) {
      console.error('Error loading documents:', error);
    }
  }

  /**
   * Read one file, split off its frontmatter and store it in `this.documents`
   * under its path relative to `docsPath`. Errors are logged, never thrown.
   *
   * @param {string} filePath - Path of the file to load.
   */
  async loadDocument(filePath) {
    try {
      const content = await fs.readFile(filePath, 'utf8');
      const relativePath = path.relative(this.docsPath, filePath);

      // Parse frontmatter if present
      const { metadata, content: documentContent } = this.parseFrontmatter(content);

      const document = {
        fileName: relativePath,
        filePath: filePath,
        content: documentContent,
        metadata: metadata || {},
        lastModified: (await fs.stat(filePath)).mtime
      };

      this.documents.set(relativePath, document);

      // Log individual document loading if verbose
      if (process.env.DOC_BOT_VERBOSE === 'true') {
        const folderPath = path.dirname(relativePath);
        console.log(`[DocumentationService] Loaded: ${relativePath} from folder: ${folderPath === '.' ? 'root' : folderPath}`);
      }
    } catch (error) {
      console.error(`Error loading document ${filePath}:`, error);
    }
  }

  /**
   * Split a leading `---` delimited YAML block from the rest of the text.
   *
   * @param {string} content - Raw file text.
   * @returns {{metadata: *, content: string}} Parsed YAML and the remaining
   *   body. When there is no frontmatter, or the YAML fails to parse, the
   *   metadata is `{}` and the content is returned untouched.
   */
  parseFrontmatter(content) {
    const frontmatterRegex = /^---\s*\n([\s\S]*?)\n---\s*\n([\s\S]*)$/;
    const match = content.match(frontmatterRegex);

    if (match) {
      try {
        const metadata = yaml.parse(match[1]);
        return { metadata, content: match[2] };
      } catch (error) {
        // Silently skip files with invalid frontmatter
      }
    }

    return { metadata: {}, content: content };
  }

  /**
   * Return the frontmatter of `content` as a plain object.
   *
   * @param {string} content - Document text, possibly starting with frontmatter.
   * @returns {object} The parsed frontmatter, or `{}` when it is absent,
   *   unparsable, or not an object (e.g. the YAML was empty or a scalar).
   */
  extractMetadata(content) {
    const { metadata } = this.parseFrontmatter(content);
    return metadata !== null && typeof metadata === 'object' ? metadata : {};
  }

  /** @returns {Promise<object[]>} Every loaded document, in load order. */
  async getAllDocuments() {
    return Array.from(this.documents.values());
  }

  /**
   * Score every document against `query` and return those scoring 5 or more.
   *
   * @param {string} query - Free-text query.
   * @returns {Promise<object[]>} Matching documents, each extended with
   *   `relevanceScore`, `snippet` and `matchedTerms`, sorted by descending
   *   score. Empty for a blank or non-string query.
   */
  async searchDocuments(query) {
    if (typeof query !== 'string' || query.trim() === '') {
      return [];
    }

    const searchTerms = this.parseQuery(query);
    const results = [];

    for (const doc of this.documents.values()) {
      const score = this.calculateAdvancedRelevanceScore(doc, searchTerms, query);

      // Only include documents with meaningful relevance (5% or higher)
      // This filters out documents that only have weak partial matches
      if (score >= 5.0) {
        // Extract a relevant snippet from the content
        const snippet = this.extractRelevantSnippet(doc.content, searchTerms, query);

        results.push({
          ...doc,
          relevanceScore: score,
          snippet: snippet,
          matchedTerms: this.getMatchedTerms(doc, searchTerms)
        });
      }
    }

    // Sort by relevance score
    return results.sort((a, b) => b.relevanceScore - a.relevanceScore);
  }

  /**
   * Turn a query into lower-case search terms.
   *
   * @param {string} query - Free-text query.
   * @returns {string[]} Terms with everything except `a-z0-9` removed,
   *   excluding stop words and terms shorter than two characters.
   */
  parseQuery(query) {
    // Split by spaces and remove common stop words
    const stopWords = new Set(['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'how', 'what', 'where', 'when']);

    return query.toLowerCase()
      .split(/\s+/)
      .map(term => term.replace(/[^a-z0-9]/g, '')) // Remove punctuation
      .filter(term => term.length > 1 && !stopWords.has(term));
  }

  /**
   * Compute a relevance score for `doc`.
   *
   * Per term: +15 title hit, +10 description hit, +12 exact / +8 partial per
   * keyword, +2 per content occurrence (capped at 10), and a small fuzzy
   * score only when nothing else matched. A +20 bonus applies when the whole
   * query appears in the content or title. The sum is multiplied by
   * `0.5 + coverage`, by 1.1 for content shorter than 2000 characters, then
   * divided by 10 and capped at 100.
   *
   * @param {object} doc - Document with `content`, `fileName` and `metadata`.
   * @param {string[]} searchTerms - Terms to score.
   * @param {string} originalQuery - Query used for the exact-phrase bonus.
   * @returns {number} Score between 0 and 100.
   */
  calculateAdvancedRelevanceScore(doc, searchTerms, originalQuery) {
    let totalScore = 0;
    const content = doc.content.toLowerCase();
    // Frontmatter values come from YAML and may be numbers/booleans, so coerce
    // to string before calling string methods.
    const title = String(doc.metadata?.title || doc.fileName).toLowerCase();
    const description = String(doc.metadata?.description || '').toLowerCase();
    const queryLower = originalQuery.toLowerCase();

    // Exact phrase match bonus (highest priority)
    if (content.includes(queryLower) || title.includes(queryLower)) {
      totalScore += 20;
    }

    const rawKeywords = doc.metadata?.keywords;
    const keywordList = rawKeywords
      ? (Array.isArray(rawKeywords) ? rawKeywords : [rawKeywords])
      : [];
    // Normalise once: drop null/empty entries (an empty keyword would be a
    // "partial match" for every term) and stringify non-string YAML values.
    const keywords = keywordList
      .filter(keyword => keyword !== null && keyword !== undefined)
      .map(keyword => String(keyword).toLowerCase())
      .filter(keyword => keyword !== '');

    // Incremented once per matching field, so a single term can count several
    // times and the coverage ratio below can exceed 1.
    let matchedTerms = 0;
    const termScores = [];

    for (const rawTerm of searchTerms) {
      // All haystacks are lower-cased, so the term must be as well (the legacy
      // calculateRelevanceScore entry point passes the term through as-is).
      const term = String(rawTerm).toLowerCase();
      let termScore = 0;

      // Title matches (highest weight)
      if (title.includes(term)) {
        termScore += 15;
        matchedTerms++;
      }

      // Description matches (high weight)
      if (description.includes(term)) {
        termScore += 10;
        matchedTerms++;
      }

      // Keyword exact matches (very high weight)
      for (const keywordLower of keywords) {
        if (keywordLower === term) {
          termScore += 12; // Exact keyword match
          matchedTerms++;
        } else if (keywordLower.includes(term) || term.includes(keywordLower)) {
          termScore += 8; // Partial keyword match
          matchedTerms++;
        }
      }

      // Content matches with frequency weighting
      const contentMatches = (content.match(new RegExp(this.escapeRegExp(term), 'g')) || []).length;
      if (contentMatches > 0) {
        termScore += Math.min(contentMatches * 2, 10); // Cap at 10 to prevent spam
        matchedTerms++;
      }

      // Fuzzy matching for typos (lower weight)
      if (termScore === 0) {
        const fuzzyScore = this.calculateFuzzyMatch(term, [title, description, content.substring(0, 500)].join(' '));
        termScore += fuzzyScore;
        if (fuzzyScore > 0) matchedTerms++;
      }

      termScores.push(termScore);
    }

    // Calculate final score
    totalScore += termScores.reduce((sum, score) => sum + score, 0);

    // Bonus for matching multiple terms. Guard the empty-terms case (e.g. a
    // query made only of stop words) so the score never becomes NaN.
    const termCoverage = searchTerms.length > 0 ? matchedTerms / searchTerms.length : 0;
    totalScore *= (0.5 + termCoverage); // 50% base + coverage bonus

    // Bonus for shorter documents (more focused)
    const docLength = content.length;
    if (docLength < 2000) {
      totalScore *= 1.1;
    }

    // Normalize score (0-100 scale)
    return Math.min(totalScore / 10, 100);
  }

  /**
   * Escape regex metacharacters so `string` can be used as a literal pattern.
   *
   * @param {string} string - Text to escape.
   * @returns {string} Escaped text.
   */
  escapeRegExp(string) {
    return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }

  /**
   * Loose match of `term` against the words of `text`.
   *
   * @param {string} term - Lower-case search term.
   * @param {string} text - Text to scan.
   * @returns {number} 2 when a word contains the term or vice versa, 1 when a
   *   word is within edit distance 2 (both longer than 3 characters), else 0.
   */
  calculateFuzzyMatch(term, text) {
    // Simple fuzzy matching - check for partial matches.
    // Empty tokens (from leading/trailing whitespace) are dropped because
    // term.includes('') is always true and would match everything.
    const words = text.toLowerCase().split(/\s+/).filter(Boolean);
    let maxScore = 0;

    for (const word of words) {
      if (word.includes(term) || term.includes(word)) {
        maxScore = Math.max(maxScore, 2);
      } else if (this.levenshteinDistance(term, word) <= 2 && Math.min(term.length, word.length) > 3) {
        maxScore = Math.max(maxScore, 1);
      }
    }

    return maxScore;
  }

  /**
   * Edit distance between two strings (insert, delete, substitute; cost 1 each).
   *
   * @param {string} str1
   * @param {string} str2
   * @returns {number} Number of single-character edits.
   */
  levenshteinDistance(str1, str2) {
    // matrix[j][i] = distance between the first i chars of str1 and first j of str2
    const matrix = Array(str2.length + 1).fill(null).map(() => Array(str1.length + 1).fill(null));

    for (let i = 0; i <= str1.length; i++) matrix[0][i] = i;
    for (let j = 0; j <= str2.length; j++) matrix[j][0] = j;

    for (let j = 1; j <= str2.length; j++) {
      for (let i = 1; i <= str1.length; i++) {
        const indicator = str1[i - 1] === str2[j - 1] ? 0 : 1;
        matrix[j][i] = Math.min(
          matrix[j][i - 1] + 1,
          matrix[j - 1][i] + 1,
          matrix[j - 1][i - 1] + indicator
        );
      }
    }

    return matrix[str2.length][str1.length];
  }

  /**
   * Score a document against a single term.
   *
   * @param {object} doc - Document to score.
   * @param {string} searchTerm - Term, also used as the phrase query.
   * @returns {number} Score between 0 and 100.
   */
  calculateRelevanceScore(doc, searchTerm) {
    // Legacy method - keep for backward compatibility
    return this.calculateAdvancedRelevanceScore(doc, [searchTerm], searchTerm);
  }

  /**
   * Extract a relevant snippet from content that shows context around matched terms
   * @param {string} content - The document content
   * @param {string[]} searchTerms - The search terms
   * @param {string} originalQuery - The original query
   * @returns {string} A relevant snippet
   */
  extractRelevantSnippet(content, searchTerms, originalQuery) {
    const contentLower = content.toLowerCase();
    const snippetLength = 200;
    let bestSnippet = '';
    let bestScore = 0;

    // First, try to find exact phrase match
    if (originalQuery.length > 3) {
      const phraseIndex = contentLower.indexOf(originalQuery.toLowerCase());
      if (phraseIndex !== -1) {
        const start = Math.max(0, phraseIndex - 50);
        const end = Math.min(content.length, phraseIndex + originalQuery.length + 150);
        return this.cleanSnippet(content.substring(start, end), start > 0, end < content.length);
      }
    }

    // Find the best snippet containing the most search terms
    const lines = content.split('\n');
    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      const lineLower = line.toLowerCase();

      // Skip empty lines and frontmatter
      if (!line.trim() || line.startsWith('---')) continue;

      // Count matching terms in this line and surrounding context
      let score = 0;
      let matchCount = 0;
      for (const term of searchTerms) {
        if (lineLower.includes(term.toLowerCase())) {
          matchCount++;
          // Higher score for terms in headers
          if (line.startsWith('#')) {
            score += 10;
          } else {
            score += 5;
          }
        }
      }

      if (matchCount > 0) {
        // Get context around this line
        const contextStart = Math.max(0, i - 1);
        const contextEnd = Math.min(lines.length, i + 3);
        const contextLines = lines.slice(contextStart, contextEnd);
        const snippet = contextLines.join(' ').trim();

        if (score > bestScore && snippet.length > 20) {
          bestScore = score;
          bestSnippet = snippet;
        }
      }
    }

    // If no good snippet found, return the description or first meaningful paragraph
    if (!bestSnippet) {
      const metadata = this.extractMetadata(content);
      if (metadata.description) {
        return metadata.description;
      }

      // Find first non-empty paragraph after frontmatter
      const contentWithoutFrontmatter = content.replace(/^---[\s\S]*?---\n*/m, '');
      const paragraphs = contentWithoutFrontmatter.split(/\n\n+/);
      for (const para of paragraphs) {
        const cleaned = para.trim();
        if (cleaned && !cleaned.startsWith('#') && cleaned.length > 30) {
          return this.cleanSnippet(cleaned.substring(0, snippetLength), false, cleaned.length > snippetLength);
        }
      }
    }

    return this.cleanSnippet(bestSnippet.substring(0, snippetLength), false, bestSnippet.length > snippetLength);
  }

  /**
   * Clean and format a snippet for display
   * @param {string} snippet - Raw snippet text
   * @param {boolean} hasStart - Prefix with '...' when true
   * @param {boolean} hasEnd - Suffix with '...' when true
   * @returns {string} Snippet with whitespace collapsed and `**` / backticks removed
   */
  cleanSnippet(snippet, hasStart, hasEnd) {
    // Remove multiple spaces and clean up
    let cleaned = snippet.replace(/\s+/g, ' ').trim();

    // Remove markdown formatting for readability
    cleaned = cleaned.replace(/\*\*/g, '');
    cleaned = cleaned.replace(/`/g, '');

    // Add ellipsis if truncated
    if (hasStart) cleaned = '...' + cleaned;
    if (hasEnd) cleaned = cleaned + '...';

    return cleaned;
  }

  /**
   * Get the terms that matched in this document
   * @param {object} doc - Document to inspect
   * @param {string[]} searchTerms - Candidate terms
   * @returns {string[]} Terms found (case-insensitively) in the title,
   *   description or content
   */
  getMatchedTerms(doc, searchTerms) {
    const matched = [];
    const contentLower = doc.content.toLowerCase();
    // String(): YAML titles/descriptions are not guaranteed to be strings.
    const titleLower = String(doc.metadata?.title || doc.fileName).toLowerCase();
    const descriptionLower = String(doc.metadata?.description || '').toLowerCase();

    for (const term of searchTerms) {
      const termLower = term.toLowerCase();
      if (titleLower.includes(termLower) ||
          descriptionLower.includes(termLower) ||
          contentLower.includes(termLower)) {
        matched.push(term);
      }
    }

    return matched;
  }

  /**
   * @param {object} doc - Document to inspect.
   * @returns {boolean} True when `alwaysApply`, `always_apply` or `always` in
   *   the metadata is `true` or the string `'true'`.
   */
  isAlwaysApplyDoc(doc) {
    const metadata = doc?.metadata;
    if (!metadata) {
      return false;
    }

    const flags = [metadata.alwaysApply, metadata.always_apply, metadata.always];
    return flags.some(value => value === true || value === 'true');
  }

  /** @returns {object[]} Every loaded document flagged as always-apply. */
  getAlwaysApplyDocs() {
    const results = [];
    for (const doc of this.documents.values()) {
      if (this.isAlwaysApplyDoc(doc)) {
        results.push(doc);
      }
    }
    return results;
  }

  /**
   * Find the documents relevant to a file.
   *
   * @param {string} filePath - Path of the file being worked on.
   * @returns {Promise<object[]>} Always-apply documents plus those whose
   *   `filePatterns` (or `applies`) metadata matches `filePath`, without
   *   duplicates.
   */
  async getContextualDocs(filePath) {
    const matchingDocs = [];
    const seen = new Set();
    const addDoc = (doc) => {
      if (!seen.has(doc.fileName)) {
        matchingDocs.push(doc);
        seen.add(doc.fileName);
      }
    };

    // Find documents with matching patterns
    for (const doc of this.documents.values()) {
      if (this.isAlwaysApplyDoc(doc)) {
        addDoc(doc);
        continue;
      }

      // Check if document has file patterns in frontmatter
      const patterns = doc.metadata?.filePatterns || doc.metadata?.applies || [];
      const patternArray = Array.isArray(patterns) ? patterns : [patterns];

      for (const pattern of patternArray) {
        if (pattern && this.matchesPattern(filePath, pattern)) {
          addDoc(doc);
          break; // Don't add the same doc multiple times
        }
      }
    }

    return matchingDocs;
  }

  /**
   * Test `filePath` against a simple glob-like pattern (case-insensitive,
   * unanchored). `*` matches any run of characters, `?` a single character,
   * and `[...]` is rewritten to a `(...)` group.
   *
   * @param {string} filePath - Path to test.
   * @param {string} pattern - Glob-like pattern from document frontmatter.
   * @returns {boolean} True on a match; false for non-string arguments or a
   *   pattern that does not form a valid regular expression.
   */
  matchesPattern(filePath, pattern) {
    // Patterns come from YAML and may be numbers/objects; never throw on them.
    if (typeof filePath !== 'string' || typeof pattern !== 'string') {
      return false;
    }

    // Simple glob-like pattern matching
    const regexPattern = pattern
      .replace(/\./g, '\\.') // A dot in a glob is literal, not "any character"
      .replace(/\*/g, '.*')
      .replace(/\?/g, '.')
      .replace(/\[([^\]]+)\]/g, '($1)');

    try {
      return new RegExp(regexPattern, 'i').test(filePath);
    } catch {
      // e.g. unbalanced parentheses in the pattern: treat as "no match"
      return false;
    }
  }

  /**
   * @param {string} fileName - Path relative to `docsPath`.
   * @returns {object|undefined} The stored document, if any.
   */
  getDocument(fileName) {
    return this.documents.get(fileName);
  }

  /**
   * @param {*} category - Value compared with `metadata.category` using `===`.
   * @returns {object[]} Documents in that category.
   */
  getDocumentsByCategory(category) {
    const results = [];
    for (const doc of this.documents.values()) {
      if (doc.metadata?.category === category) {
        results.push(doc);
      }
    }
    return results;
  }

  /**
   * Build a lightweight listing of every document.
   *
   * @returns {Promise<object[]>} Entries with `title`, `description`,
   *   `fileName` and ISO `lastUpdated`, sorted by title.
   */
  async getDocumentIndex() {
    const index = [];

    for (const doc of this.documents.values()) {
      index.push({
        title: doc.metadata?.title || doc.fileName,
        description: doc.metadata?.description || '',
        fileName: doc.fileName,
        lastUpdated: doc.lastModified.toISOString()
      });
    }

    // Sort by title for consistent ordering. String(): a YAML title may be a
    // number, which has no localeCompare.
    return index.sort((a, b) => String(a.title).localeCompare(String(b.title)));
  }
}

export { DocumentationService };