/*
 * @boundless-oss/atlas
 * Version:
 * Atlas - MCP Server for comprehensive startup project management
 * 369 lines (299 loc) • 11.4 kB
 * text/typescript
 */
import { describe, it, expect, beforeEach, vi } from 'vitest';
import { promises as fs } from 'fs';
import { RAGPipeline } from '../rag-pipeline.js';
import { LocalEmbeddingModel } from '../embeddings.js';
import { InMemoryVectorStore } from '../vector-store.js';
import { MarkdownDocumentProcessor } from '../document-processor.js';
import type { RAGConfig, RAGSearchQuery, RAGDocument } from '../types.js';
// Replace the 'fs' module with a stub that exposes only a `promises` object
// made of bare vi.fn() mocks. Exactly six functions are stubbed (readFile,
// readdir, stat, mkdir, writeFile, access); any other member of `fs` or
// `fs.promises` is absent under this mock.
// NOTE(review): this presumably relies on the modules under test importing
// `promises` from the 'fs' specifier — confirm they do not use 'fs/promises'.
vi.mock('fs', () => ({
promises: {
readFile: vi.fn(),
readdir: vi.fn(),
stat: vi.fn(),
mkdir: vi.fn(),
writeFile: vi.fn(),
access: vi.fn()
}
}));
// Typed view of the mocked `fs.promises`, used by the tests to program
// per-call results (mockResolvedValue*, mockRejectedValue*, mockImplementation).
const mockFs = vi.mocked(fs);
describe('RAGPipeline', () => {
// Shared fixtures. Each one is rebuilt from scratch before every test so that
// no indexed data or spy installed on an instance survives between tests.
let pipeline: RAGPipeline;
let config: RAGConfig;
let embeddingModel: LocalEmbeddingModel;
let vectorStore: InMemoryVectorStore;
let documentProcessor: MarkdownDocumentProcessor;

beforeEach(() => {
  // Use resetAllMocks rather than clearAllMocks: clearing only wipes call
  // history, whereas resetting also drops programmed implementations and
  // return values. Several tests install persistent stubs on the shared fs
  // mock (`mockResolvedValue`, `mockImplementation`); without a reset those
  // would leak into every later test and make results order-dependent.
  vi.resetAllMocks();

  config = {
    enabled: true,
    indexPaths: ['/docs'],
    chunkSize: 500,
    chunkOverlap: 50,
    embeddingModel: 'test-model',
    collections: {
      // The only collection; the collection tests look it up by the key 'default'.
      default: {
        name: 'default',
        paths: ['/docs'],
        description: 'Default collection'
      }
    },
    // The initialization and persistence tests assert on paths under this directory.
    persistencePath: '.atlas/rag',
    autoIndex: false,
    cacheEmbeddings: true
  };

  // Real (non-mocked) collaborators; only the file system is stubbed.
  embeddingModel = new LocalEmbeddingModel();
  vectorStore = new InMemoryVectorStore(embeddingModel);
  documentProcessor = new MarkdownDocumentProcessor();
  pipeline = new RAGPipeline(config, embeddingModel, vectorStore, documentProcessor);
});
describe('initialization', () => {
  // Directory configured as `persistencePath` in the shared config.
  const persistenceDir = '.atlas/rag';

  // Smoke test: initialize() must resolve when the fs mocks are left unprogrammed.
  it('should initialize with provided components', async () => {
    await pipeline.initialize();

    expect(pipeline).toBeDefined();
  });

  it('should create persistence directory on initialization', async () => {
    // A rejecting access() stands in for a persistence directory that does not exist yet.
    mockFs.mkdir.mockResolvedValueOnce(undefined);
    mockFs.access.mockRejectedValueOnce(new Error('Not found'));

    await pipeline.initialize();

    expect(mockFs.mkdir).toHaveBeenCalledWith(persistenceDir, { recursive: true });
  });

  it('should load existing index if available', async () => {
    // Stub load() so no real deserialization is attempted; access() resolving
    // stands in for an index that is already present.
    const loadStub = vi.spyOn(vectorStore, 'load').mockResolvedValueOnce(undefined);
    mockFs.access.mockResolvedValueOnce(undefined);

    await pipeline.initialize();

    expect(loadStub).toHaveBeenCalledWith('.atlas/rag/vector-store.json');
  });
});
describe('indexDocument', () => {
  // Every test in this group indexes the same (mocked) file.
  const samplePath = '/docs/test.md';

  it('should index a single document', async () => {
    const markdown = '# Test Document\n\nThis is test content.';
    mockFs.readFile.mockResolvedValueOnce(markdown);

    const indexed = await pipeline.indexDocument(samplePath);

    // The returned document echoes the path and raw content and carries chunks.
    expect(indexed).toBeDefined();
    expect(indexed.path).toBe(samplePath);
    expect(indexed.content).toBe(markdown);
    expect(indexed.chunks).toBeDefined();
    expect(indexed.chunks!.length).toBeGreaterThan(0);
  });

  it('should process and store chunks', async () => {
    // Pass-through spy: the real addChunks still runs, we only observe its arguments.
    const addChunksSpy = vi.spyOn(vectorStore, 'addChunks');
    const markdown = '# Test Document\n\nParagraph 1.\n\nParagraph 2.';
    mockFs.readFile.mockResolvedValueOnce(markdown);

    await pipeline.indexDocument(samplePath);

    expect(addChunksSpy).toHaveBeenCalled();
    // First argument of the first call is the chunk list handed to the store.
    const [storedChunks] = addChunksSpy.mock.calls[0];
    expect(storedChunks.length).toBeGreaterThan(0);
    expect(storedChunks[0].documentId).toBeDefined();
  });

  it('should handle document processing errors', async () => {
    // A failed read must surface to the caller as a rejection with the same message.
    const readFailure = new Error('File not found');
    mockFs.readFile.mockRejectedValueOnce(readFailure);

    await expect(pipeline.indexDocument(samplePath)).rejects.toThrow('File not found');
  });
});
describe('indexDirectory', () => {
  // Minimal stand-ins for fs.Stats: only the two predicates these tests set up
  // are provided. `as any` is confined to these helpers because the objects
  // deliberately implement just a fraction of the Stats interface.
  const fileStat = () => ({ isFile: () => true, isDirectory: () => false }) as any;
  const dirStat = () => ({ isFile: () => false, isDirectory: () => true }) as any;

  it('should index all markdown files in directory', async () => {
    // Three entries are listed, but only the two .md files are expected to be indexed.
    mockFs.readdir.mockResolvedValueOnce(['file1.md', 'file2.md', 'other.txt'] as any);
    // Every entry is a regular file, so a single resolved value is enough
    // (no per-path branching is needed).
    mockFs.stat.mockResolvedValue(fileStat());
    mockFs.readFile.mockResolvedValue('# Test Content');

    const results = await pipeline.indexDirectory('/docs');

    expect(results.indexed).toBe(2);
    expect(results.failed).toBe(0);
    expect(results.documents).toHaveLength(2);
  });

  it('should recursively index subdirectories', async () => {
    // First listing is the root, second listing is the nested 'subdir'.
    mockFs.readdir
      .mockResolvedValueOnce(['subdir', 'file.md'] as any)
      .mockResolvedValueOnce(['nested.md'] as any);
    // stat() accepts a PathLike (string | Buffer | URL); normalise to a string
    // before inspecting the suffix so the check is type-safe.
    mockFs.stat.mockImplementation(async (path) =>
      String(path).endsWith('subdir') ? dirStat() : fileStat()
    );
    mockFs.readFile.mockResolvedValue('# Content');

    const results = await pipeline.indexDirectory('/docs');

    expect(results.indexed).toBe(2);
    expect(mockFs.readdir).toHaveBeenCalledTimes(2);
  });

  it('should handle indexing errors gracefully', async () => {
    mockFs.readdir.mockResolvedValueOnce(['file1.md', 'file2.md'] as any);
    mockFs.stat.mockResolvedValue(fileStat());
    // The first read fails and the second succeeds: one failure must not abort the run.
    mockFs.readFile
      .mockRejectedValueOnce(new Error('Read error'))
      .mockResolvedValueOnce('# Good content');

    const results = await pipeline.indexDirectory('/docs');

    expect(results.indexed).toBe(1);
    expect(results.failed).toBe(1);
    expect(results.errors).toHaveLength(1);
    expect(results.errors[0]).toContain('Read error');
  });

  it('should call progress callback', async () => {
    mockFs.readdir.mockResolvedValueOnce(['file1.md', 'file2.md'] as any);
    mockFs.stat.mockResolvedValue(fileStat());
    mockFs.readFile.mockResolvedValue('# Content');
    const progressCallback = vi.fn();

    await pipeline.indexDirectory('/docs', progressCallback);

    // The callback is expected to have been invoked with (1, 2) and with (2, 2).
    expect(progressCallback).toHaveBeenCalledWith(1, 2);
    expect(progressCallback).toHaveBeenCalledWith(2, 2);
  });
});
describe('search', () => {
  // Element type of RAGDocument.chunks, derived from the imported document type.
  type Chunk = NonNullable<RAGDocument['chunks']>[number];

  /**
   * Builds a single text chunk (index 0) covering its whole content.
   * Offsets are derived from the content so they cannot drift out of sync
   * with the text (a hand-written end offset was previously off by one).
   */
  const textChunk = (id: string, documentId: string, content: string): Chunk => ({
    id,
    documentId,
    content,
    index: 0,
    metadata: {
      startOffset: 0,
      endOffset: content.length,
      type: 'text'
    }
  });

  beforeEach(async () => {
    // Pre-populate the vector store with test data: two documents, one chunk each.
    const now = new Date().toISOString();

    const doc1: RAGDocument = {
      id: 'doc1',
      path: '/docs/ml.md',
      content: 'Machine learning content',
      metadata: {
        title: 'ML Guide',
        created: now,
        modified: now,
        fileSize: 100
      },
      chunks: [textChunk('chunk1', 'doc1', 'Machine learning is a subset of AI')]
    };

    const doc2: RAGDocument = {
      id: 'doc2',
      path: '/docs/ai.md',
      content: 'Artificial intelligence content',
      metadata: {
        title: 'AI Guide',
        created: now,
        modified: now,
        fileSize: 150
      },
      chunks: [textChunk('chunk2', 'doc2', 'AI encompasses machine learning and more')]
    };

    // Only the chunks are stored; the store is seeded directly, bypassing the pipeline.
    await vectorStore.addChunks([...doc1.chunks!, ...doc2.chunks!]);
  });

  it('should search across all documents', async () => {
    const query: RAGSearchQuery = {
      query: 'machine learning',
      limit: 5
    };

    const results = await pipeline.search(query);

    expect(results).toBeDefined();
    expect(results.length).toBeGreaterThan(0);
    expect(results[0].chunk.content).toContain('learning');
  });

  it('should respect search limit', async () => {
    // Two chunks are stored; a limit of 1 must return exactly one result.
    const query: RAGSearchQuery = {
      query: 'AI',
      limit: 1
    };

    const results = await pipeline.search(query);

    expect(results).toHaveLength(1);
  });

  it('should apply metadata filters', async () => {
    const query: RAGSearchQuery = {
      query: 'learning',
      limit: 5,
      filters: { documentId: 'doc1' }
    };

    const results = await pipeline.search(query);

    // NOTE: every() is vacuously true for an empty result list, so this only
    // proves that no result from another document slipped through.
    expect(results.every(r => r.chunk.documentId === 'doc1')).toBe(true);
  });

  it('should apply similarity threshold', async () => {
    const query: RAGSearchQuery = {
      query: 'completely unrelated query',
      limit: 5,
      threshold: 0.8
    };

    const results = await pipeline.search(query);

    // NOTE: also passes when nothing is returned; it asserts only that no
    // result scores below the requested threshold.
    expect(results.every(r => r.score >= 0.8)).toBe(true);
  });
});
describe('getStats', () => {
  it('should return pipeline statistics', async () => {
    // Seed the store directly with a single chunk that belongs to a single document.
    await vectorStore.addChunk({
      id: 'test-chunk',
      documentId: 'test-doc',
      content: 'Test content',
      index: 0,
      metadata: { startOffset: 0, endOffset: 12, type: 'text' }
    });

    const stats = await pipeline.getStats();
    expect(stats).toBeDefined();

    // One chunk from one document, and a non-zero reported index size.
    const { totalDocuments, totalChunks, indexSize } = stats;
    expect(totalDocuments).toBe(1);
    expect(totalChunks).toBe(1);
    expect(indexSize).toBeGreaterThan(0);
  });
});
describe('clearIndex', () => {
  it('should clear all indexed data', async () => {
    // Put exactly one chunk in the store so there is something to clear.
    await vectorStore.addChunk({
      id: 'test-chunk',
      documentId: 'test-doc',
      content: 'Test content',
      index: 0,
      metadata: { startOffset: 0, endOffset: 12, type: 'text' }
    });
    const sizeBeforeClear = vectorStore.size();
    expect(sizeBeforeClear).toBe(1);

    await pipeline.clearIndex();

    // Clearing through the pipeline must empty the underlying store.
    const sizeAfterClear = vectorStore.size();
    expect(sizeAfterClear).toBe(0);
  });
});
describe('persistence', () => {
  // File the pipeline is expected to pass to the vector store for save and load.
  const storeFile = '.atlas/rag/vector-store.json';

  it('should save index to disk', async () => {
    // Pass-through spy: the store's real save() still runs against the mocked fs.
    const saveCall = vi.spyOn(vectorStore, 'save');
    mockFs.writeFile.mockResolvedValueOnce(undefined);

    await pipeline.saveIndex();

    expect(saveCall).toHaveBeenCalledWith(storeFile);
  });

  it('should load index from disk', async () => {
    // load() is stubbed out, so nothing is actually read or parsed.
    const loadCall = vi.spyOn(vectorStore, 'load').mockResolvedValueOnce(undefined);

    await pipeline.loadIndex();

    expect(loadCall).toHaveBeenCalledWith(storeFile);
  });
});
describe('collection management', () => {
  it('should index specific collection', async () => {
    // The configured 'default' collection lists '/docs' as its only path.
    const regularFile = { isFile: () => true, isDirectory: () => false } as any;
    mockFs.readFile.mockResolvedValue('# Content');
    mockFs.stat.mockResolvedValue(regularFile);
    mockFs.readdir.mockResolvedValueOnce(['file.md'] as any);

    const outcome = await pipeline.indexCollection('default');

    expect(outcome.indexed).toBe(1);
    expect(mockFs.readdir).toHaveBeenCalledWith('/docs');
  });

  it('should throw error for non-existent collection', async () => {
    // An unknown collection name must reject with a 'Collection not found' error.
    const attempt = pipeline.indexCollection('non-existent');

    await expect(attempt).rejects.toThrow('Collection not found');
  });
});
});