UNPKG

@boundless-oss/atlas

Version:

Atlas - MCP Server for comprehensive startup project management

233 lines (191 loc) 6.92 kB
import { promises as fs } from 'fs';
import path from 'path';
import type {
  RAGVectorStore,
  RAGChunk,
  RAGSearchResult,
  EmbeddingModel,
  EmbeddingVector,
  VectorStoreStats
} from './types.js';

/** On-disk JSON layout written by `save` and read back by `load`. */
interface SerializedVectorStore {
  chunks: Array<{ id: string; chunk: RAGChunk; embedding: number[] }>;
  documentChunks: Array<{ docId: string; chunkIds: string[] }>;
}

/**
 * Vector store that keeps every chunk and its embedding in process memory.
 *
 * Searches are brute force: the query is embedded once and compared against
 * every stored embedding with the model's `cosineSimilarity`. State can be
 * written to / restored from a JSON file with `save` / `load`.
 */
export class InMemoryVectorStore implements RAGVectorStore {
  /** Chunk id -> chunk. */
  private chunks: Map<string, RAGChunk> = new Map();
  /** Chunk id -> embedding for that chunk. */
  private embeddings: Map<string, EmbeddingVector> = new Map();
  /** Document id -> ids of the chunks that belong to it. */
  private documentChunks: Map<string, Set<string>> = new Map();

  /**
   * @param embeddingModel Model used to embed chunk content and queries and
   *   to score similarity between two embeddings.
   */
  constructor(private embeddingModel: EmbeddingModel) {}

  /**
   * Adds (or replaces) a single chunk.
   *
   * When the chunk carries no embedding, one is generated from
   * `chunk.content` with the embedding model.
   */
  async addChunk(chunk: RAGChunk): Promise<void> {
    const embedding =
      chunk.embedding ?? (await this.embeddingModel.embedSingle(chunk.content));
    this.storeChunk(chunk, embedding);
  }

  /**
   * Adds (or replaces) many chunks, embedding all chunks that lack an
   * embedding in one batched model call.
   *
   * Nothing is stored until every embedding is available, so a failing or
   * short embedding batch leaves the store untouched.
   *
   * @throws Error if the model returns a different number of embeddings
   *   than the number of texts it was given.
   */
  async addChunks(chunks: RAGChunk[]): Promise<void> {
    if (chunks.length === 0) return;

    // Chunks that already have an embedding are stored first, then the
    // freshly embedded ones; this fixes the iteration order of the store.
    const ready: Array<[RAGChunk, EmbeddingVector]> = [];
    const pending: RAGChunk[] = [];
    for (const chunk of chunks) {
      if (chunk.embedding) {
        ready.push([chunk, chunk.embedding]);
      } else {
        pending.push(chunk);
      }
    }

    if (pending.length > 0) {
      const generated = await this.embeddingModel.embed(pending.map(c => c.content));
      if (generated.length !== pending.length) {
        throw new Error(
          `Embedding model returned ${generated.length} embeddings for ${pending.length} chunks`
        );
      }
      for (const [index, chunk] of pending.entries()) {
        const embedding = generated[index];
        if (embedding === undefined) {
          throw new Error(`Embedding model returned no embedding for chunk "${chunk.id}"`);
        }
        ready.push([chunk, embedding]);
      }
    }

    for (const [chunk, embedding] of ready) {
      this.storeChunk(chunk, embedding);
    }
  }

  /**
   * Returns the `k` stored chunks most similar to `query`, best first.
   *
   * @returns An empty array when the store is empty or `k` is not positive.
   */
  async search(query: string, k: number): Promise<RAGSearchResult[]> {
    return this.rankChunks(query, k);
  }

  /**
   * Like `search`, but only considers chunks whose `metadata` strictly
   * equals (`===`) every key/value pair in `filters`. Chunks without
   * metadata never match a non-empty filter.
   */
  async searchWithFilters(
    query: string,
    k: number,
    filters: Record<string, any>
  ): Promise<RAGSearchResult[]> {
    const criteria = Object.entries(filters);
    return this.rankChunks(query, k, chunk =>
      criteria.every(([key, value]) => chunk.metadata?.[key] === value)
    );
  }

  /** Removes every chunk (and embedding) that belongs to `documentId`. */
  async removeDocument(documentId: string): Promise<void> {
    const chunkIds = this.documentChunks.get(documentId);
    if (!chunkIds) return;

    for (const chunkId of chunkIds) {
      this.chunks.delete(chunkId);
      this.embeddings.delete(chunkId);
    }
    this.documentChunks.delete(documentId);
  }

  /** Removes all chunks, embeddings and document tracking. */
  async clear(): Promise<void> {
    this.chunks.clear();
    this.embeddings.clear();
    this.documentChunks.clear();
  }

  /** Number of chunks currently stored. */
  size(): number {
    return this.chunks.size;
  }

  /**
   * Reports chunk/document counts, the model's embedding dimension and a
   * rough memory estimate.
   */
  getStats(): VectorStoreStats {
    const totalChunks = this.chunks.size;
    const totalDocuments = this.documentChunks.size;
    const embeddingDimension = this.embeddingModel.dimension;

    // Rough estimate only: a flat 1000 bytes per chunk plus 4 bytes per
    // embedding component (one 32-bit float each).
    const chunkMemory = totalChunks * 1000;
    const embeddingMemory = totalChunks * embeddingDimension * 4;

    return {
      totalChunks,
      totalDocuments,
      embeddingDimension,
      memoryUsage: chunkMemory + embeddingMemory
    };
  }

  /**
   * Writes the whole store to `filePath` as pretty-printed JSON, creating
   * the parent directory if needed.
   */
  async save(filePath: string): Promise<void> {
    const data: SerializedVectorStore = {
      chunks: Array.from(this.chunks, ([id, chunk]) => ({
        id,
        chunk,
        // Typed arrays do not serialise as JSON arrays; copy to a plain one.
        embedding: Array.from(this.embeddings.get(id) ?? [])
      })),
      documentChunks: Array.from(this.documentChunks, ([docId, chunkIds]) => ({
        docId,
        chunkIds: Array.from(chunkIds)
      }))
    };

    await fs.mkdir(path.dirname(filePath), { recursive: true });
    await fs.writeFile(filePath, JSON.stringify(data, null, 2));
  }

  /**
   * Replaces the store's contents with the snapshot in `filePath`.
   *
   * The file is read, parsed and rebuilt into fresh maps before the current
   * state is touched, so a missing or malformed file leaves the store as it
   * was.
   *
   * @throws TypeError if the file is valid JSON but lacks the `chunks` /
   *   `documentChunks` arrays written by `save`.
   */
  async load(filePath: string): Promise<void> {
    const content = await fs.readFile(filePath, 'utf-8');
    const data = JSON.parse(content) as Partial<SerializedVectorStore> | null;

    if (!data || !Array.isArray(data.chunks) || !Array.isArray(data.documentChunks)) {
      throw new TypeError(
        `Invalid vector store file "${filePath}": expected "chunks" and "documentChunks" arrays`
      );
    }

    const chunks = new Map<string, RAGChunk>();
    const embeddings = new Map<string, EmbeddingVector>();
    for (const item of data.chunks) {
      chunks.set(item.id, item.chunk);
      embeddings.set(item.id, new Float32Array(item.embedding));
    }

    const documentChunks = new Map<string, Set<string>>();
    for (const item of data.documentChunks) {
      documentChunks.set(item.docId, new Set(item.chunkIds));
    }

    // Swap in the new state only once everything above has succeeded.
    this.chunks = chunks;
    this.embeddings = embeddings;
    this.documentChunks = documentChunks;
  }

  /** Stores one chunk with its embedding and keeps the document index in sync. */
  private storeChunk(chunk: RAGChunk, embedding: EmbeddingVector): void {
    // If this id was previously filed under another document, unlink it
    // there first; otherwise removing the old document would later delete a
    // chunk that now belongs to a different one.
    const previousOwner = this.chunks.get(chunk.id)?.documentId;
    if (previousOwner !== undefined && previousOwner !== chunk.documentId) {
      const siblings = this.documentChunks.get(previousOwner);
      if (siblings) {
        siblings.delete(chunk.id);
        if (siblings.size === 0) this.documentChunks.delete(previousOwner);
      }
    }

    this.chunks.set(chunk.id, chunk);
    this.embeddings.set(chunk.id, embedding);

    let chunkIds = this.documentChunks.get(chunk.documentId);
    if (!chunkIds) {
      chunkIds = new Set();
      this.documentChunks.set(chunk.documentId, chunkIds);
    }
    chunkIds.add(chunk.id);
  }

  /** Scores every chunk accepted by `accepts` against `query` and returns the top `k`. */
  private async rankChunks(
    query: string,
    k: number,
    accepts?: (chunk: RAGChunk) => boolean
  ): Promise<RAGSearchResult[]> {
    // `!(k > 0)` also rejects NaN. Without this guard a negative k would
    // make `slice(0, k)` drop results from the end instead of returning none.
    if (this.chunks.size === 0 || !(k > 0)) return [];

    const queryEmbedding = await this.embeddingModel.embedSingle(query);

    const results: RAGSearchResult[] = [];
    for (const [chunkId, chunk] of this.chunks) {
      if (accepts && !accepts(chunk)) continue;

      const chunkEmbedding = this.embeddings.get(chunkId);
      if (!chunkEmbedding) continue;

      results.push({
        chunk,
        score: this.embeddingModel.cosineSimilarity(queryEmbedding, chunkEmbedding),
        document: {
          id: chunk.documentId,
          path: chunk.documentId // Using documentId as path for now
        }
      });
    }

    results.sort((a, b) => b.score - a.score);
    return results.slice(0, k);
  }
}