@boundless-oss/atlas

import { JSONSchema7 } from 'json-schema';
import { randomUUID } from 'crypto';
import { createHash } from 'crypto';
import { createTool, createSuccessResult, createErrorResult } from '../../core/tool-framework.js';
import { ToolRegistration, RequestContext } from '../../core/types.js';
import { CommonSchemas } from '../../core/validation.js';
import { promises as fs } from 'fs';
import path from 'path';

/**
 * RAG Retrieval Tools - 12-Factor MCP Implementation
 *
 * Implements Factor 2: Deterministic Execution with structured outputs
 * Implements Factor 3: Stateless Processes with RequestContext
 * Implements Factor 4: Structured Outputs for LLM consumption
 */

// ---------------------------------------------------------------------------
// Input type interfaces
// ---------------------------------------------------------------------------

interface RAGSearchInput {
  query: string;
  limit?: number;
  threshold?: number;
  collection?: string;
  filters?: Record<string, unknown>;
}

interface RAGIndexDocumentInput {
  path: string;
}

interface RAGIndexDirectoryInput {
  path: string;
}

interface RAGIndexCollectionInput {
  collection: string;
}

interface RAGClearIndexInput {
  collection?: string;
}

/** One slice of a document produced by {@link chunkText}. */
interface TextChunk {
  content: string;
  index: number;
  startOffset: number;
  endOffset: number;
}

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

/** Project id used when the request context does not carry one. */
const DEFAULT_PROJECT_ID = 'default';
const DEFAULT_SEARCH_LIMIT = 10;
const DEFAULT_SEARCH_THRESHOLD = 0.5;
/** Chunk window and overlap, measured in UTF-16 code units of the document text. */
const CHUNK_SIZE = 500;
const CHUNK_OVERLAP = 50;
const MARKDOWN_FILE_PATTERN = /\.(md|markdown)$/i;

// ---------------------------------------------------------------------------
// Small shared helpers
// ---------------------------------------------------------------------------

/** Project id for this request, falling back to the shared default project. */
function projectIdOf(context: RequestContext) {
  return context.projectId || DEFAULT_PROJECT_ID;
}

/** Human-readable message for anything caught in a `catch` clause. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : 'Unknown error';
}

/** Error result for an unexpected exception while performing `action`. */
function executionError(action: string, error: unknown) {
  return createErrorResult({
    code: 'EXECUTION_ERROR',
    message: `Failed to ${action}: ${describeError(error)}`,
    category: 'execution'
  });
}

/** Error result for a database call that reported `success: false`. */
function databaseError(message: string, error: unknown) {
  return createErrorResult({
    code: 'DATABASE_ERROR',
    message,
    details: { error },
    category: 'system'
  });
}

/**
 * Resolve `inputPath` against `projectRoot` and make sure the result does not
 * leave the project tree.
 *
 * Tool inputs are untrusted, so `../../etc/passwd` (or an absolute path
 * elsewhere on disk) must not be readable through the indexing tools.
 *
 * @returns the absolute path, or `null` when it points outside `projectRoot`.
 */
function resolveInsideProject(projectRoot: string, inputPath: string): string | null {
  const resolved = path.resolve(projectRoot, inputPath);
  const relative = path.relative(projectRoot, resolved);
  const escapesRoot =
    relative === '..' ||
    relative.startsWith(`..${path.sep}`) ||
    // On Windows a path on another drive comes back absolute from path.relative.
    path.isAbsolute(relative);
  return escapesRoot ? null : resolved;
}

/**
 * Escape the LIKE wildcards (`%`, `_`) and the escape character itself so the
 * text is matched literally. Must be paired with `ESCAPE '\'` in the SQL.
 */
function escapeLikePattern(text: string): string {
  return text.replace(/[\\%_]/g, '\\$&');
}

/**
 * Decode the JSON stored in a chunk's `metadata` column.
 *
 * A missing value or malformed JSON yields `{}` so that a single bad row does
 * not make an entire search fail.
 */
function parseChunkMetadata(raw: unknown): unknown {
  if (!raw) {
    return {};
  }
  if (typeof raw !== 'string') {
    return raw;
  }
  try {
    return JSON.parse(raw);
  } catch {
    return {};
  }
}

/**
 * Decode the JSON array stored in a collection's `paths` column.
 *
 * @throws Error when the stored value is not valid JSON or not an array of strings.
 */
function parseCollectionPaths(raw: unknown): string[] {
  const parsed: unknown = JSON.parse(typeof raw === 'string' && raw !== '' ? raw : '[]');
  if (!Array.isArray(parsed) || !parsed.every((entry) => typeof entry === 'string')) {
    throw new Error('Collection paths must be a JSON array of strings');
  }
  return parsed as string[];
}

/**
 * Split `content` into overlapping fixed-size windows.
 *
 * Windows that contain only whitespace are skipped. Iteration stops as soon
 * as a window reaches the end of the text: the next window would start inside
 * the overlap and be fully contained in the one just emitted.
 */
function chunkText(
  content: string,
  chunkSize: number = CHUNK_SIZE,
  overlap: number = CHUNK_OVERLAP
): TextChunk[] {
  const step = Math.max(1, chunkSize - overlap);
  const chunks: TextChunk[] = [];

  for (let start = 0; start < content.length; start += step) {
    const end = Math.min(start + chunkSize, content.length);
    const text = content.substring(start, end);
    if (text.trim()) {
      chunks.push({ content: text, index: chunks.length, startOffset: start, endOffset: end });
    }
    if (end >= content.length) {
      break;
    }
  }

  return chunks;
}

/** Record that `documentId` belongs to `collectionId`; re-linking is a no-op. */
async function linkDocumentToCollection(
  context: RequestContext,
  collectionId: string,
  documentId: string
): Promise<void> {
  await context.db.run(
    `INSERT OR IGNORE INTO rag_collection_documents (collection_id, document_id)
     VALUES (?, ?)`,
    [collectionId, documentId]
  );
}

/**
 * Convert embedding vector to/from binary storage
 */

/**
 * View the bytes of `embedding` as a Buffer (no copy; memory is shared).
 *
 * Only the bytes covered by the view are exposed, so a `subarray()` of a
 * larger Float32Array serialises just its own elements.
 */
function embeddingToBuffer(embedding: Float32Array): Buffer {
  return Buffer.from(embedding.buffer, embedding.byteOffset, embedding.byteLength);
}

/**
 * Reinterpret the bytes of `buffer` as 32-bit floats.
 *
 * Float32Array views require a 4-byte aligned offset, which Node's pooled
 * Buffers do not guarantee; unaligned input is copied into fresh storage
 * instead of throwing a RangeError. Trailing bytes that do not make up a
 * whole float are ignored.
 */
function bufferToEmbedding(buffer: Buffer): Float32Array {
  const bytesPerFloat = Float32Array.BYTES_PER_ELEMENT;
  const length = Math.floor(buffer.byteLength / bytesPerFloat);

  if (buffer.byteOffset % bytesPerFloat === 0) {
    return new Float32Array(buffer.buffer, buffer.byteOffset, length);
  }

  const aligned = new Float32Array(length);
  new Uint8Array(aligned.buffer).set(buffer.subarray(0, length * bytesPerFloat));
  return aligned;
}

/**
 * Calculate text hash for caching
 *
 * @returns the SHA-256 digest of `text` as a lowercase hex string.
 */
function calculateTextHash(text: string): string {
  return createHash('sha256').update(text).digest('hex');
}

/**
 * Search through indexed documents using semantic search
 *
 * Current implementation is a placeholder: it performs a literal substring
 * match (SQL LIKE) on chunk content and reports a fixed score. `threshold`
 * and `filters` are accepted and recorded in the search history but are not
 * applied to the results yet.
 *
 * NOTE(review): the tool is flagged `readOnly` although it appends a row to
 * `rag_search_history` — confirm that is intended.
 */
const ragSearchTool = createTool<RAGSearchInput, any>({
  name: 'rag_search',
  description: 'Search through indexed documents using semantic search',
  category: 'rag-retrieval',
  readOnly: true,
  inputSchema: {
    type: 'object',
    properties: {
      query: {
        type: 'string',
        description: 'Search query text',
        minLength: 1,
        maxLength: 1000
      },
      limit: {
        type: 'integer',
        description: 'Maximum number of results (default: 10)',
        minimum: 1,
        maximum: 100,
        default: 10
      },
      threshold: {
        type: 'number',
        description: 'Minimum similarity threshold (0-1)',
        minimum: 0,
        maximum: 1,
        default: 0.5
      },
      collection: {
        type: 'string',
        description: 'Limit search to a specific collection',
        maxLength: 200
      },
      filters: {
        type: 'object',
        description: 'Metadata filters to apply',
        additionalProperties: true
      }
    },
    required: ['query'],
    additionalProperties: false
  } as JSONSchema7,

  async execute(input: RAGSearchInput, context: RequestContext) {
    try {
      const startTime = Date.now();
      const projectId = projectIdOf(context);
      // `??` rather than `||`: a threshold of 0 is valid and must not become 0.5.
      const limit = input.limit ?? DEFAULT_SEARCH_LIMIT;
      const threshold = input.threshold ?? DEFAULT_SEARCH_THRESHOLD;

      // This is a placeholder for the actual embedding model integration.
      // In a real implementation, we would use the embedding model here.
      // For now, we simulate search with text matching.
      let query = `
        SELECT
          c.id,
          c.content,
          c.chunk_index,
          c.chunk_type,
          c.metadata,
          d.id as document_id,
          d.path as document_path,
          d.title as document_title
        FROM rag_chunks c
        JOIN rag_documents d ON c.document_id = d.id
        WHERE c.project_id = ?
      `;
      const params: Array<string | number> = [projectId];

      // Add collection filter if specified
      if (input.collection) {
        query += `
          AND EXISTS (
            SELECT 1
            FROM rag_collection_documents cd
            JOIN rag_collections col ON cd.collection_id = col.id
            WHERE cd.document_id = d.id
              AND col.name = ?
              AND col.project_id = ?
          )
        `;
        params.push(input.collection, projectId);
      }

      // For now, use simple text search.
      // In production, this would use vector similarity.
      // The query text is escaped so `%` and `_` typed by the user match literally.
      query += `
        AND c.content LIKE ? ESCAPE '\\'
        ORDER BY c.id
        LIMIT ?`;
      params.push(`%${escapeLikePattern(input.query)}%`, limit);

      const result = await context.db.query(query, params);
      if (!result.success) {
        return databaseError('Failed to search documents', result.error);
      }

      const chunks = result.data || [];
      const executionTime = Date.now() - startTime;

      // Log search to history
      await context.db.run(
        `INSERT INTO rag_search_history
           (id, project_id, query, limit_count, threshold, filters, result_count, execution_time)
         VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,
        [
          randomUUID(),
          projectId,
          input.query,
          limit,
          threshold,
          JSON.stringify(input.filters || {}),
          chunks.length,
          executionTime
        ]
      );

      return createSuccessResult({
        results: chunks.map((chunk: any) => ({
          chunk: {
            id: chunk.id,
            content: chunk.content,
            index: chunk.chunk_index,
            type: chunk.chunk_type,
            metadata: parseChunkMetadata(chunk.metadata)
          },
          document: {
            id: chunk.document_id,
            path: chunk.document_path,
            title: chunk.document_title
          },
          score: 0.8 // Placeholder score
        })),
        query: input.query,
        executionTime,
        message: `Found ${chunks.length} relevant chunk${chunks.length !== 1 ? 's' : ''}`
      });
    } catch (error) {
      return executionError('search', error);
    }
  }
});

/**
 * Index a single document for RAG
 *
 * Reads the file, upserts its `rag_documents` row (keyed by project id and
 * the path exactly as supplied), replaces its chunks, and finally marks the
 * document as `completed`. Paths that resolve outside the project root
 * (process working directory) are rejected.
 */
const ragIndexDocumentTool = createTool<RAGIndexDocumentInput, any>({
  name: 'rag_index_document',
  description: 'Index a single document for RAG',
  category: 'rag-retrieval',
  inputSchema: {
    type: 'object',
    properties: {
      path: {
        type: 'string',
        description: 'Path to the document to index (relative to project root)',
        minLength: 1,
        maxLength: 500
      }
    },
    required: ['path'],
    additionalProperties: false
  } as JSONSchema7,

  async execute(input: RAGIndexDocumentInput, context: RequestContext) {
    try {
      const projectId = projectIdOf(context);
      const documentPath = resolveInsideProject(process.cwd(), input.path);
      if (documentPath === null) {
        return createErrorResult({
          code: 'PATH_OUTSIDE_PROJECT',
          message: `Path is outside the project root: ${input.path}`,
          category: 'validation'
        });
      }

      // Check that the path exists and is a regular file
      const stats = await fs.stat(documentPath).catch(() => null);
      if (stats === null) {
        return createErrorResult({
          code: 'FILE_NOT_FOUND',
          message: `Document not found: ${input.path}`,
          category: 'validation'
        });
      }
      if (!stats.isFile()) {
        return createErrorResult({
          code: 'NOT_A_FILE',
          message: `Not a file: ${input.path}`,
          category: 'validation'
        });
      }

      // Read file content and derive metadata
      const content = await fs.readFile(documentPath, 'utf-8');
      const title = path.basename(documentPath, path.extname(documentPath));

      // Check if document already exists; reuse its id so chunks stay attached
      const existingDoc = await context.db.get(
        'SELECT id FROM rag_documents WHERE project_id = ? AND path = ?',
        [projectId, input.path]
      );
      const alreadyIndexed = Boolean(existingDoc.success && existingDoc.data);
      const documentId = alreadyIndexed ? existingDoc.data.id : randomUUID();

      if (alreadyIndexed) {
        // Update existing document
        const updated = await context.db.run(
          `UPDATE rag_documents
           SET content = ?, size = ?, last_modified = ?, updated_at = ?, embedding_status = ?
           WHERE id = ?`,
          [content, stats.size, stats.mtimeMs, Date.now(), 'pending', documentId]
        );
        if (!updated.success) {
          return databaseError('Failed to index document', updated.error);
        }

        // Delete old chunks
        const removed = await context.db.run(
          'DELETE FROM rag_chunks WHERE document_id = ?',
          [documentId]
        );
        if (!removed.success) {
          return databaseError('Failed to index document', removed.error);
        }
      } else {
        // Insert new document
        const inserted = await context.db.run(
          `INSERT INTO rag_documents
             (id, project_id, path, content, title, size, last_modified, embedding_status)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,
          [documentId, projectId, input.path, content, title, stats.size, stats.mtimeMs, 'pending']
        );
        if (!inserted.success) {
          return databaseError('Failed to index document', inserted.error);
        }
      }

      // Create chunks (simplified for now: fixed windows, all typed 'paragraph')
      const chunks = chunkText(content);

      // Insert chunks
      for (const chunk of chunks) {
        const stored = await context.db.run(
          `INSERT INTO rag_chunks
             (id, project_id, document_id, content, chunk_index, start_offset, end_offset, chunk_type)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,
          [
            randomUUID(),
            projectId,
            documentId,
            chunk.content,
            chunk.index,
            chunk.startOffset,
            chunk.endOffset,
            'paragraph'
          ]
        );
        if (!stored.success) {
          // The document stays in 'pending' so a partial index is not reported as complete.
          return databaseError('Failed to store document chunks', stored.error);
        }
      }

      // Update chunk count
      await context.db.run(
        'UPDATE rag_documents SET chunk_count = ?, embedding_status = ? WHERE id = ?',
        [chunks.length, 'completed', documentId]
      );

      return createSuccessResult({
        document: {
          id: documentId,
          path: input.path,
          title,
          chunkCount: chunks.length
        },
        message: `Successfully indexed document: ${input.path}`,
        details: `Created ${chunks.length} chunks from ${stats.size} bytes`
      });
    } catch (error) {
      return executionError('index document', error);
    }
  }
});

/**
 * Index all markdown documents in a directory
 *
 * Walks the directory recursively (see `findMarkdownFiles`) and indexes every
 * file through `rag_index_document`. Per-file failures are collected in
 * `errors` rather than aborting the run. The result also lists the ids of the
 * documents that were indexed so callers can attach them to a collection.
 */
const ragIndexDirectoryTool = createTool<RAGIndexDirectoryInput, any>({
  name: 'rag_index_directory',
  description: 'Index all markdown documents in a directory',
  category: 'rag-retrieval',
  inputSchema: {
    type: 'object',
    properties: {
      path: {
        type: 'string',
        description: 'Path to the directory to index (relative to project root)',
        minLength: 1,
        maxLength: 500
      }
    },
    required: ['path'],
    additionalProperties: false
  } as JSONSchema7,

  async execute(input: RAGIndexDirectoryInput, context: RequestContext) {
    try {
      const projectPath = process.cwd();
      const directoryPath = resolveInsideProject(projectPath, input.path);
      if (directoryPath === null) {
        return createErrorResult({
          code: 'PATH_OUTSIDE_PROJECT',
          message: `Path is outside the project root: ${input.path}`,
          category: 'validation'
        });
      }

      // Check if directory exists
      const stats = await fs.stat(directoryPath).catch(() => null);
      if (stats === null) {
        return createErrorResult({
          code: 'DIRECTORY_NOT_FOUND',
          message: `Directory not found: ${input.path}`,
          category: 'validation'
        });
      }
      if (!stats.isDirectory()) {
        return createErrorResult({
          code: 'NOT_A_DIRECTORY',
          message: `Not a directory: ${input.path}`,
          category: 'validation'
        });
      }

      // Find all markdown files
      const files = await findMarkdownFiles(directoryPath);

      const errors: string[] = [];
      const documents: Array<{ id: string; path: string }> = [];
      let failed = 0;

      // Index each file
      for (const file of files) {
        const relativePath = path.relative(projectPath, file);
        const indexResult = await ragIndexDocumentTool.execute({ path: relativePath }, context);

        if (indexResult.success) {
          documents.push({ id: indexResult.data.document.id, path: relativePath });
        } else {
          failed++;
          errors.push(`${relativePath}: ${indexResult.error?.message}`);
        }
      }

      const indexed = documents.length;
      return createSuccessResult({
        summary: {
          totalFiles: files.length,
          indexed,
          failed
        },
        documents,
        errors,
        message: `Indexed ${indexed} document${indexed !== 1 ? 's' : ''} from ${input.path}`
      });
    } catch (error) {
      return executionError('index directory', error);
    }
  }
});

/**
 * Index a predefined collection of documents
 *
 * Looks the collection up by name; the built-in names `docs` and `readme`
 * are created on first use. Every path of the collection is indexed (as a
 * directory or a single file) and every successfully indexed document is
 * linked to the collection so that collection-filtered search can find it.
 */
const ragIndexCollectionTool = createTool<RAGIndexCollectionInput, any>({
  name: 'rag_index_collection',
  description: 'Index a predefined collection of documents',
  category: 'rag-retrieval',
  inputSchema: {
    type: 'object',
    properties: {
      collection: {
        type: 'string',
        description: 'Name of the collection to index',
        minLength: 1,
        maxLength: 200
      }
    },
    required: ['collection'],
    additionalProperties: false
  } as JSONSchema7,

  async execute(input: RAGIndexCollectionInput, context: RequestContext) {
    try {
      const projectId = projectIdOf(context);

      // Get collection
      const collectionResult = await context.db.get(
        'SELECT * FROM rag_collections WHERE project_id = ? AND name = ?',
        [projectId, input.collection]
      );
      let collection = collectionResult.success ? collectionResult.data : undefined;

      // Create default collections if needed
      if (!collection && (input.collection === 'docs' || input.collection === 'readme')) {
        const isDocs = input.collection === 'docs';
        const collectionId = randomUUID();
        const defaultPaths = isDocs ? ['./docs'] : ['./README.md', './docs/README.md'];

        await context.db.run(
          `INSERT INTO rag_collections (id, project_id, name, description, paths)
           VALUES (?, ?, ?, ?, ?)`,
          [
            collectionId,
            projectId,
            input.collection,
            isDocs ? 'Documentation files' : 'README files',
            JSON.stringify(defaultPaths)
          ]
        );

        // Re-fetch the collection so we work with the stored row
        const created = await context.db.get(
          'SELECT * FROM rag_collections WHERE id = ?',
          [collectionId]
        );
        collection = created.data;
      }

      if (!collection) {
        return createErrorResult({
          code: 'COLLECTION_NOT_FOUND',
          message: `Collection not found: ${input.collection}`,
          category: 'validation'
        });
      }

      const paths = parseCollectionPaths(collection.paths);
      const results = {
        indexed: 0,
        failed: 0,
        errors: [] as string[]
      };

      // Index each path in the collection
      for (const collectionPath of paths) {
        try {
          const stats = await fs.stat(path.resolve(process.cwd(), collectionPath));

          if (stats.isDirectory()) {
            // Index directory
            const dirResult = await ragIndexDirectoryTool.execute({ path: collectionPath }, context);

            if (dirResult.success) {
              results.indexed += dirResult.data.summary.indexed;
              results.failed += dirResult.data.summary.failed;
              results.errors.push(...dirResult.data.errors);

              // Add every indexed file to the collection
              for (const doc of dirResult.data.documents ?? []) {
                await linkDocumentToCollection(context, collection.id, doc.id);
              }
            } else {
              results.failed++;
              results.errors.push(`${collectionPath}: ${dirResult.error?.message}`);
            }
          } else {
            // Index single file
            const fileResult = await ragIndexDocumentTool.execute({ path: collectionPath }, context);

            if (fileResult.success) {
              results.indexed++;
              // Add to collection
              await linkDocumentToCollection(context, collection.id, fileResult.data.document.id);
            } else {
              results.failed++;
              results.errors.push(`${collectionPath}: ${fileResult.error?.message}`);
            }
          }
        } catch (error) {
          // A missing or unreadable path counts as one failure; keep going with the rest.
          results.failed++;
          results.errors.push(`${collectionPath}: ${describeError(error)}`);
        }
      }

      // Update collection stats
      const now = Date.now();
      await context.db.run(
        `UPDATE rag_collections
         SET document_count = ?, last_indexed = ?, updated_at = ?
         WHERE id = ?`,
        [results.indexed, now, now, collection.id]
      );

      return createSuccessResult({
        collection: {
          name: input.collection,
          documentsIndexed: results.indexed,
          failed: results.failed
        },
        errors: results.errors,
        message: `Indexed collection "${input.collection}": ${results.indexed} documents`
      });
    } catch (error) {
      return executionError('index collection', error);
    }
  }
});

/**
 * Get statistics about the RAG index
 *
 * Aggregates document, chunk and size totals for the current project plus a
 * per-collection breakdown. `lastIndexed` is the ISO timestamp of the newest
 * `created_at` in `rag_documents`, or the string 'Never' when there is none.
 */
const ragGetStatsTool = createTool<{}, any>({
  name: 'rag_get_stats',
  description: 'Get statistics about the RAG index',
  category: 'rag-retrieval',
  readOnly: true,
  inputSchema: {
    type: 'object',
    properties: {},
    additionalProperties: false
  } as JSONSchema7,

  async execute(input: {}, context: RequestContext) {
    try {
      const projectId = projectIdOf(context);

      // Get document stats
      const docStats = await context.db.get(
        `SELECT
           COUNT(*) as total_documents,
           SUM(chunk_count) as total_chunks,
           SUM(size) as total_size
         FROM rag_documents
         WHERE project_id = ?`,
        [projectId]
      );

      // Get collection stats
      const collStats = await context.db.query(
        `SELECT COUNT(*) as total_collections
         FROM rag_collections
         WHERE project_id = ?`,
        [projectId]
      );

      // Get last indexed time
      const lastIndexed = await context.db.get(
        `SELECT MAX(created_at) as last_indexed
         FROM rag_documents
         WHERE project_id = ?`,
        [projectId]
      );

      // Get collections details
      const collections = await context.db.query(
        `SELECT name, document_count, chunk_count
         FROM rag_collections
         WHERE project_id = ?`,
        [projectId]
      );

      const perCollection: Record<string, unknown> = {};
      for (const col of (collections.data || []) as any[]) {
        perCollection[col.name] = {
          documentCount: col.document_count || 0,
          chunkCount: col.chunk_count || 0,
          sizeBytes: 0 // Would need to calculate from documents
        };
      }

      const lastIndexedAt = lastIndexed.data?.last_indexed;
      const stats = {
        // SUM() yields NULL on an empty table, hence the numeric fallbacks.
        totalDocuments: docStats.data?.total_documents ?? 0,
        totalChunks: docStats.data?.total_chunks ?? 0,
        totalCollections: collStats.data?.[0]?.total_collections ?? 0,
        indexSize: docStats.data?.total_size ?? 0,
        lastIndexed: lastIndexedAt ? new Date(lastIndexedAt).toISOString() : 'Never',
        collections: perCollection
      };

      return createSuccessResult({
        stats,
        message: 'RAG index statistics retrieved successfully'
      });
    } catch (error) {
      return executionError('get statistics', error);
    }
  }
});

/**
 * Clear all indexed documents
 *
 * With `collection`, only the documents linked to that collection are
 * removed; otherwise everything indexed for the current project is removed.
 * Chunks and collection-membership rows are deleted explicitly alongside the
 * documents. The schema is not visible from this file, so this does not rely
 * on `ON DELETE CASCADE`; if cascades exist the extra statements are no-ops.
 */
const ragClearIndexTool = createTool<RAGClearIndexInput, any>({
  name: 'rag_clear_index',
  description: 'Clear all indexed documents',
  category: 'rag-retrieval',
  inputSchema: {
    type: 'object',
    properties: {
      collection: {
        type: 'string',
        description: 'Clear only documents in this collection',
        maxLength: 200
      }
    },
    additionalProperties: false
  } as JSONSchema7,

  async execute(input: RAGClearIndexInput, context: RequestContext) {
    try {
      const projectId = projectIdOf(context);
      let steps: Array<[string, Array<string | number>]>;

      if (input.collection) {
        // Clear specific collection
        const collection = await context.db.get(
          'SELECT id FROM rag_collections WHERE project_id = ? AND name = ?',
          [projectId, input.collection]
        );

        if (!collection.success || !collection.data) {
          return createErrorResult({
            code: 'COLLECTION_NOT_FOUND',
            message: `Collection not found: ${input.collection}`,
            category: 'validation'
          });
        }

        const collectionId = collection.data.id;
        // Order matters: the membership rows select which chunks/documents to
        // delete, so they are removed last.
        steps = [
          [
            `DELETE FROM rag_chunks
             WHERE document_id IN (
               SELECT document_id FROM rag_collection_documents WHERE collection_id = ?
             )`,
            [collectionId]
          ],
          [
            `DELETE FROM rag_documents
             WHERE id IN (
               SELECT document_id FROM rag_collection_documents WHERE collection_id = ?
             )`,
            [collectionId]
          ],
          ['DELETE FROM rag_collection_documents WHERE collection_id = ?', [collectionId]],
          // Clear collection stats
          [
            'UPDATE rag_collections SET document_count = 0, chunk_count = 0 WHERE id = ?',
            [collectionId]
          ]
        ];
      } else {
        // Clear all documents
        steps = [
          ['DELETE FROM rag_chunks WHERE project_id = ?', [projectId]],
          ['DELETE FROM rag_documents WHERE project_id = ?', [projectId]],
          [
            `DELETE FROM rag_collection_documents
             WHERE collection_id IN (
               SELECT id FROM rag_collections WHERE project_id = ?
             )`,
            [projectId]
          ],
          // Clear all collection stats
          [
            'UPDATE rag_collections SET document_count = 0, chunk_count = 0 WHERE project_id = ?',
            [projectId]
          ]
        ];
      }

      for (const [sql, params] of steps) {
        const outcome = await context.db.run(sql, params);
        if (!outcome.success) {
          // Do not report "cleared" when a statement failed.
          return databaseError('Failed to clear index', outcome.error);
        }
      }

      if (input.collection) {
        return createSuccessResult({
          message: `Cleared collection "${input.collection}"`,
          collection: input.collection
        });
      }
      return createSuccessResult({
        message: 'Cleared all indexed documents'
      });
    } catch (error) {
      return executionError('clear index', error);
    }
  }
});

/**
 * Helper function to find markdown files recursively
 *
 * Returns absolute-or-relative paths built from `dir` for every `.md` /
 * `.markdown` file (case-insensitive) below it. Directories whose name starts
 * with a dot are skipped. Entries are visited in name order so the result
 * does not depend on the order the file system happens to return them in.
 */
async function findMarkdownFiles(dir: string): Promise<string[]> {
  const entries = await fs.readdir(dir, { withFileTypes: true });
  entries.sort((a, b) => (a.name < b.name ? -1 : a.name > b.name ? 1 : 0));

  const files: string[] = [];
  for (const entry of entries) {
    const fullPath = path.join(dir, entry.name);

    if (entry.isDirectory()) {
      if (!entry.name.startsWith('.')) {
        files.push(...(await findMarkdownFiles(fullPath)));
      }
    } else if (entry.isFile() && MARKDOWN_FILE_PATTERN.test(entry.name)) {
      files.push(fullPath);
    }
  }

  return files;
}

/**
 * Setup RAG retrieval tools
 *
 * @returns the registration record for the `rag-retrieval` module with all
 * six tools defined in this file.
 */
export async function setupRAGRetrievalTools(): Promise<ToolRegistration> {
  return {
    module: 'rag-retrieval',
    tools: [
      ragSearchTool,
      ragIndexDocumentTool,
      ragIndexDirectoryTool,
      ragIndexCollectionTool,
      ragGetStatsTool,
      ragClearIndexTool
    ]
  };
}