// @boundless-oss/atlas

import { describe, it, expect, beforeEach, vi } from 'vitest';
import { promises as fs } from 'fs';
import { RAGPipeline } from '../rag-pipeline.js';
import { LocalEmbeddingModel } from '../embeddings.js';
import { InMemoryVectorStore } from '../vector-store.js';
import { MarkdownDocumentProcessor } from '../document-processor.js';
import type { RAGConfig, RAGSearchQuery, RAGDocument } from '../types.js';

// Replace the promise-based fs API with bare mocks so no test touches the real disk.
vi.mock('fs', () => ({
  promises: {
    readFile: vi.fn(),
    readdir: vi.fn(),
    stat: vi.fn(),
    mkdir: vi.fn(),
    writeFile: vi.fn(),
    access: vi.fn(),
  },
}));

const mockFs = vi.mocked(fs);

/** Persistence directory configured for the pipeline under test. */
const PERSISTENCE_PATH = '.atlas/rag';

/** File the tests expect the vector store to be saved to / loaded from. */
const VECTOR_STORE_FILE = '.atlas/rag/vector-store.json';

/**
 * Builds the minimal stat result the tests need (only `isFile` / `isDirectory`).
 * Typed as `any` because it is a deliberately partial stand-in for the fs stats object.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const statStub = (kind: 'file' | 'directory'): any => ({
  isFile: () => kind === 'file',
  isDirectory: () => kind === 'directory',
});

/**
 * Wraps plain file names as a `readdir` result.
 * Typed as `any` because the mocked `readdir` signature does not accept `string[]` directly.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const dirEntries = (...names: string[]): any => names;

/** Single chunk used by the stats and clear-index tests. */
const sampleChunk = (): Parameters<InMemoryVectorStore['addChunk']>[0] => ({
  id: 'test-chunk',
  documentId: 'test-doc',
  content: 'Test content',
  index: 0,
  metadata: { startOffset: 0, endOffset: 12, type: 'text' },
});

describe('RAGPipeline', () => {
  let pipeline: RAGPipeline;
  let config: RAGConfig;
  let embeddingModel: LocalEmbeddingModel;
  let vectorStore: InMemoryVectorStore;
  let documentProcessor: MarkdownDocumentProcessor;

  beforeEach(() => {
    // resetAllMocks (not clearAllMocks) also drops stubbed implementations and
    // queued `...Once` values, so fs stubs from one test cannot leak into the next.
    vi.resetAllMocks();

    config = {
      enabled: true,
      indexPaths: ['/docs'],
      chunkSize: 500,
      chunkOverlap: 50,
      embeddingModel: 'test-model',
      collections: {
        default: {
          name: 'default',
          paths: ['/docs'],
          description: 'Default collection',
        },
      },
      persistencePath: PERSISTENCE_PATH,
      autoIndex: false,
      cacheEmbeddings: true,
    };

    embeddingModel = new LocalEmbeddingModel();
    vectorStore = new InMemoryVectorStore(embeddingModel);
    documentProcessor = new MarkdownDocumentProcessor();
    pipeline = new RAGPipeline(config, embeddingModel, vectorStore, documentProcessor);
  });

  describe('initialization', () => {
    it('should initialize with provided components', async () => {
      await pipeline.initialize();

      expect(pipeline).toBeDefined();
    });

    it('should create persistence directory on initialization', async () => {
      // A rejected access() stands in for "directory does not exist yet".
      mockFs.access.mockRejectedValueOnce(new Error('Not found'));
      mockFs.mkdir.mockResolvedValueOnce(undefined);

      await pipeline.initialize();

      expect(mockFs.mkdir).toHaveBeenCalledWith(PERSISTENCE_PATH, { recursive: true });
    });

    it('should load existing index if available', async () => {
      mockFs.access.mockResolvedValueOnce(undefined);
      const mockLoadSpy = vi.spyOn(vectorStore, 'load').mockResolvedValueOnce(undefined);

      await pipeline.initialize();

      expect(mockLoadSpy).toHaveBeenCalledWith(VECTOR_STORE_FILE);
    });
  });

  describe('indexDocument', () => {
    it('should index a single document', async () => {
      const documentPath = '/docs/test.md';
      const content = '# Test Document\n\nThis is test content.';
      mockFs.readFile.mockResolvedValueOnce(content);

      const document = await pipeline.indexDocument(documentPath);

      expect(document).toBeDefined();
      expect(document.path).toBe(documentPath);
      expect(document.content).toBe(content);
      expect(document.chunks).toBeDefined();
      expect(document.chunks!.length).toBeGreaterThan(0);
    });

    it('should process and store chunks', async () => {
      const documentPath = '/docs/test.md';
      const content = '# Test Document\n\nParagraph 1.\n\nParagraph 2.';
      mockFs.readFile.mockResolvedValueOnce(content);
      const addChunksSpy = vi.spyOn(vectorStore, 'addChunks');

      await pipeline.indexDocument(documentPath);

      expect(addChunksSpy).toHaveBeenCalled();
      const chunks = addChunksSpy.mock.calls[0][0];
      expect(chunks.length).toBeGreaterThan(0);
      expect(chunks[0].documentId).toBeDefined();
    });

    it('should handle document processing errors', async () => {
      const documentPath = '/docs/test.md';
      mockFs.readFile.mockRejectedValueOnce(new Error('File not found'));

      await expect(pipeline.indexDocument(documentPath)).rejects.toThrow('File not found');
    });
  });

  describe('indexDirectory', () => {
    it('should index all markdown files in directory', async () => {
      mockFs.readdir.mockResolvedValueOnce(dirEntries('file1.md', 'file2.md', 'other.txt'));
      // Every entry (including other.txt) is reported as a regular file; only the
      // two .md entries are expected to end up indexed.
      mockFs.stat.mockResolvedValue(statStub('file'));
      mockFs.readFile.mockResolvedValue('# Test Content');

      const results = await pipeline.indexDirectory('/docs');

      expect(results.indexed).toBe(2);
      expect(results.failed).toBe(0);
      expect(results.documents).toHaveLength(2);
    });

    it('should recursively index subdirectories', async () => {
      // First listing is the root, second is the nested directory.
      mockFs.readdir
        .mockResolvedValueOnce(dirEntries('subdir', 'file.md'))
        .mockResolvedValueOnce(dirEntries('nested.md'));
      // The path argument is PathLike, so coerce to string before inspecting it.
      mockFs.stat.mockImplementation((path) =>
        Promise.resolve(
          String(path).endsWith('subdir') ? statStub('directory') : statStub('file'),
        ),
      );
      mockFs.readFile.mockResolvedValue('# Content');

      const results = await pipeline.indexDirectory('/docs');

      expect(results.indexed).toBe(2);
      expect(mockFs.readdir).toHaveBeenCalledTimes(2);
    });

    it('should handle indexing errors gracefully', async () => {
      mockFs.readdir.mockResolvedValueOnce(dirEntries('file1.md', 'file2.md'));
      mockFs.stat.mockResolvedValue(statStub('file'));
      // First read fails, second succeeds.
      mockFs.readFile
        .mockRejectedValueOnce(new Error('Read error'))
        .mockResolvedValueOnce('# Good content');

      const results = await pipeline.indexDirectory('/docs');

      expect(results.indexed).toBe(1);
      expect(results.failed).toBe(1);
      expect(results.errors).toHaveLength(1);
      expect(results.errors[0]).toContain('Read error');
    });

    it('should call progress callback', async () => {
      mockFs.readdir.mockResolvedValueOnce(dirEntries('file1.md', 'file2.md'));
      mockFs.stat.mockResolvedValue(statStub('file'));
      mockFs.readFile.mockResolvedValue('# Content');
      const progressCallback = vi.fn();

      await pipeline.indexDirectory('/docs', progressCallback);

      expect(progressCallback).toHaveBeenCalledWith(1, 2);
      expect(progressCallback).toHaveBeenCalledWith(2, 2);
    });
  });

  describe('search', () => {
    beforeEach(async () => {
      // Pre-populate the vector store with test data
      const doc1: RAGDocument = {
        id: 'doc1',
        path: '/docs/ml.md',
        content: 'Machine learning content',
        metadata: {
          title: 'ML Guide',
          created: new Date().toISOString(),
          modified: new Date().toISOString(),
          fileSize: 100,
        },
        chunks: [
          {
            id: 'chunk1',
            documentId: 'doc1',
            content: 'Machine learning is a subset of AI',
            index: 0,
            metadata: { startOffset: 0, endOffset: 35, type: 'text' },
          },
        ],
      };

      const doc2: RAGDocument = {
        id: 'doc2',
        path: '/docs/ai.md',
        content: 'Artificial intelligence content',
        metadata: {
          title: 'AI Guide',
          created: new Date().toISOString(),
          modified: new Date().toISOString(),
          fileSize: 150,
        },
        chunks: [
          {
            id: 'chunk2',
            documentId: 'doc2',
            content: 'AI encompasses machine learning and more',
            index: 0,
            metadata: { startOffset: 0, endOffset: 40, type: 'text' },
          },
        ],
      };

      await vectorStore.addChunks([...(doc1.chunks ?? []), ...(doc2.chunks ?? [])]);
    });

    it('should search across all documents', async () => {
      const query: RAGSearchQuery = { query: 'machine learning', limit: 5 };

      const results = await pipeline.search(query);

      expect(results).toBeDefined();
      expect(results.length).toBeGreaterThan(0);
      expect(results[0].chunk.content).toContain('learning');
    });

    it('should respect search limit', async () => {
      const query: RAGSearchQuery = { query: 'AI', limit: 1 };

      const results = await pipeline.search(query);

      expect(results).toHaveLength(1);
    });

    it('should apply metadata filters', async () => {
      const query: RAGSearchQuery = {
        query: 'learning',
        limit: 5,
        filters: { documentId: 'doc1' },
      };

      const results = await pipeline.search(query);

      // Note: every() also holds for an empty result list.
      expect(results.every((r) => r.chunk.documentId === 'doc1')).toBe(true);
    });

    it('should apply similarity threshold', async () => {
      const query: RAGSearchQuery = {
        query: 'completely unrelated query',
        limit: 5,
        threshold: 0.8,
      };

      const results = await pipeline.search(query);

      // Note: every() also holds for an empty result list, so this only checks
      // that nothing scoring below the threshold is returned.
      expect(results.every((r) => r.score >= 0.8)).toBe(true);
    });
  });

  describe('getStats', () => {
    it('should return pipeline statistics', async () => {
      // Add some test data
      await vectorStore.addChunk(sampleChunk());

      const stats = await pipeline.getStats();

      expect(stats).toBeDefined();
      expect(stats.totalDocuments).toBe(1);
      expect(stats.totalChunks).toBe(1);
      expect(stats.indexSize).toBeGreaterThan(0);
    });
  });

  describe('clearIndex', () => {
    it('should clear all indexed data', async () => {
      await vectorStore.addChunk(sampleChunk());
      expect(vectorStore.size()).toBe(1);

      await pipeline.clearIndex();

      expect(vectorStore.size()).toBe(0);
    });
  });

  describe('persistence', () => {
    it('should save index to disk', async () => {
      mockFs.writeFile.mockResolvedValueOnce(undefined);
      const saveSpy = vi.spyOn(vectorStore, 'save');

      await pipeline.saveIndex();

      expect(saveSpy).toHaveBeenCalledWith(VECTOR_STORE_FILE);
    });

    it('should load index from disk', async () => {
      const loadSpy = vi.spyOn(vectorStore, 'load').mockResolvedValueOnce(undefined);

      await pipeline.loadIndex();

      expect(loadSpy).toHaveBeenCalledWith(VECTOR_STORE_FILE);
    });
  });

  describe('collection management', () => {
    it('should index specific collection', async () => {
      mockFs.readdir.mockResolvedValueOnce(dirEntries('file.md'));
      mockFs.stat.mockResolvedValue(statStub('file'));
      mockFs.readFile.mockResolvedValue('# Content');

      const results = await pipeline.indexCollection('default');

      expect(results.indexed).toBe(1);
      expect(mockFs.readdir).toHaveBeenCalledWith('/docs');
    });

    it('should throw error for non-existent collection', async () => {
      await expect(pipeline.indexCollection('non-existent')).rejects.toThrow(
        'Collection not found',
      );
    });
  });
});