/*
 * @boundless-oss/atlas
 * Version: (not specified)
 * Atlas - MCP Server for comprehensive startup project management
 * 233 lines (191 loc) • 6.92 kB
 * text/typescript
 */
import { promises as fs } from 'fs';
import path from 'path';
import type {
RAGVectorStore,
RAGChunk,
RAGSearchResult,
EmbeddingModel,
EmbeddingVector,
VectorStoreStats
} from './types.js';
/**
 * Vector store that keeps every chunk and its embedding in process memory.
 *
 * Three maps are kept in step with each other:
 * - `chunks`: chunk id -> chunk
 * - `embeddings`: chunk id -> embedding vector
 * - `documentChunks`: document id -> ids of the chunks stored for that document
 *
 * Searching is a linear scan: every stored chunk is scored against the query
 * with the embedding model's `cosineSimilarity`.
 */
export class InMemoryVectorStore implements RAGVectorStore {
  private chunks: Map<string, RAGChunk> = new Map();
  private embeddings: Map<string, EmbeddingVector> = new Map();
  private documentChunks: Map<string, Set<string>> = new Map();

  /**
   * @param embeddingModel Model used to embed chunk contents and queries and
   *   to score similarity between two embeddings.
   */
  constructor(private embeddingModel: EmbeddingModel) {}

  /**
   * Stores a single chunk, embedding its content first when the chunk does
   * not already carry an embedding.
   *
   * @param chunk Chunk to store; an existing chunk with the same id is replaced.
   */
  async addChunk(chunk: RAGChunk): Promise<void> {
    const embedding =
      chunk.embedding || (await this.embeddingModel.embedSingle(chunk.content));
    this.storeChunk(chunk, embedding);
  }

  /**
   * Stores many chunks, embedding all chunks that lack an embedding in one
   * batched `embed` call.
   *
   * Nothing is stored unless every chunk ends up with an embedding, so a
   * failing or short embedding response leaves the store untouched.
   *
   * @param chunks Chunks to store; an empty array is a no-op.
   * @throws Error when the model returns a different number of embeddings
   *   than the number of texts it was given.
   */
  async addChunks(chunks: RAGChunk[]): Promise<void> {
    if (chunks.length === 0) return;

    const pending = chunks.filter((chunk) => !chunk.embedding);
    const generated: EmbeddingVector[] =
      pending.length > 0
        ? await this.embeddingModel.embed(pending.map((chunk) => chunk.content))
        : [];

    if (generated.length !== pending.length) {
      throw new Error(
        `Embedding model returned ${generated.length} embeddings for ${pending.length} chunks`
      );
    }

    // Pair every chunk with its embedding before touching the maps so that a
    // missing embedding cannot leave the store half-updated. Chunks that came
    // with an embedding are stored first, then the freshly embedded ones.
    const ready: Array<[RAGChunk, EmbeddingVector]> = [];
    for (const chunk of chunks) {
      if (chunk.embedding) ready.push([chunk, chunk.embedding]);
    }
    for (const [index, chunk] of pending.entries()) {
      const embedding = generated[index];
      if (!embedding) {
        throw new Error(`Embedding model returned no embedding for chunk "${chunk.id}"`);
      }
      ready.push([chunk, embedding]);
    }

    for (const [chunk, embedding] of ready) {
      this.storeChunk(chunk, embedding);
    }
  }

  /**
   * Returns the `k` stored chunks most similar to `query`, best first.
   *
   * @param query Text to embed and compare against every stored chunk.
   * @param k Maximum number of results; a non-positive or NaN value yields `[]`.
   */
  async search(query: string, k: number): Promise<RAGSearchResult[]> {
    return this.rankChunks(query, k);
  }

  /**
   * Like {@link search}, but only considers chunks whose metadata strictly
   * equals (`===`) every key/value pair in `filters`.
   *
   * @param query Text to embed and compare against the matching chunks.
   * @param k Maximum number of results; a non-positive or NaN value yields `[]`.
   * @param filters Metadata key/value pairs a chunk must match; an empty
   *   object matches every chunk.
   */
  async searchWithFilters(
    query: string,
    k: number,
    filters: Record<string, unknown>
  ): Promise<RAGSearchResult[]> {
    const wanted = Object.entries(filters);
    return this.rankChunks(query, k, (chunk) =>
      wanted.every(([key, value]) => chunk.metadata[key] === value)
    );
  }

  /**
   * Removes every chunk (and its embedding) tracked for `documentId`.
   * Unknown document ids are ignored.
   */
  async removeDocument(documentId: string): Promise<void> {
    const chunkIds = this.documentChunks.get(documentId);
    if (!chunkIds) return;

    for (const chunkId of chunkIds) {
      this.chunks.delete(chunkId);
      this.embeddings.delete(chunkId);
    }
    this.documentChunks.delete(documentId);
  }

  /** Empties the store. */
  async clear(): Promise<void> {
    this.chunks.clear();
    this.embeddings.clear();
    this.documentChunks.clear();
  }

  /** Number of chunks currently stored. */
  size(): number {
    return this.chunks.size;
  }

  /**
   * Reports chunk/document counts, the model's embedding dimension and a
   * rough memory estimate: a flat 1000 bytes per chunk plus 4 bytes per
   * embedding component. The estimate is not measured from actual data.
   */
  getStats(): VectorStoreStats {
    const totalChunks = this.chunks.size;
    const totalDocuments = this.documentChunks.size;
    const embeddingDimension = this.embeddingModel.dimension;

    const chunkMemory = totalChunks * 1000;
    const embeddingMemory = totalChunks * embeddingDimension * 4;

    return {
      totalChunks,
      totalDocuments,
      embeddingDimension,
      memoryUsage: chunkMemory + embeddingMemory
    };
  }

  /**
   * Writes the store to `filePath` as pretty-printed JSON, creating parent
   * directories as needed. Embeddings are serialized as plain number arrays.
   */
  async save(filePath: string): Promise<void> {
    const data = {
      chunks: Array.from(this.chunks.entries()).map(([id, chunk]) => ({
        id,
        chunk,
        embedding: Array.from(this.embeddings.get(id) ?? [])
      })),
      documentChunks: Array.from(this.documentChunks.entries()).map(([docId, chunkIds]) => ({
        docId,
        chunkIds: Array.from(chunkIds)
      }))
    };

    await fs.mkdir(path.dirname(filePath), { recursive: true });
    await fs.writeFile(filePath, JSON.stringify(data, null, 2));
  }

  /**
   * Replaces the store's contents with a snapshot previously written by
   * {@link save}. Embeddings are restored as `Float32Array`s.
   *
   * The snapshot is read and decoded completely before the current contents
   * are cleared, so an unreadable or malformed file leaves the store as it was.
   *
   * @throws Error when the file does not contain `chunks` and
   *   `documentChunks` arrays; read and JSON parse errors propagate unchanged.
   */
  async load(filePath: string): Promise<void> {
    const content = await fs.readFile(filePath, 'utf-8');
    const parsed: unknown = JSON.parse(content);
    const data = parsed as { chunks?: unknown; documentChunks?: unknown } | null;

    if (
      typeof data !== 'object' ||
      data === null ||
      !Array.isArray(data.chunks) ||
      !Array.isArray(data.documentChunks)
    ) {
      throw new Error(`Invalid vector store file: ${filePath}`);
    }

    // Decode into temporaries first; commit only once everything succeeded.
    const restoredChunks = new Map<string, RAGChunk>();
    const restoredEmbeddings = new Map<string, EmbeddingVector>();
    const restoredDocuments = new Map<string, Set<string>>();

    for (const item of data.chunks) {
      restoredChunks.set(item.id, item.chunk);
      restoredEmbeddings.set(item.id, new Float32Array(item.embedding));
    }
    for (const item of data.documentChunks) {
      restoredDocuments.set(item.docId, new Set<string>(item.chunkIds));
    }

    await this.clear();
    for (const [id, chunk] of restoredChunks) this.chunks.set(id, chunk);
    for (const [id, embedding] of restoredEmbeddings) this.embeddings.set(id, embedding);
    for (const [docId, chunkIds] of restoredDocuments) this.documentChunks.set(docId, chunkIds);
  }

  /**
   * Records `chunk` and `embedding` and links the chunk to its document.
   * If the id was previously stored under a different document, the old link
   * is dropped so that document no longer owns (or can delete) the chunk.
   */
  private storeChunk(chunk: RAGChunk, embedding: EmbeddingVector): void {
    const previous = this.chunks.get(chunk.id);
    if (previous && previous.documentId !== chunk.documentId) {
      const formerOwner = this.documentChunks.get(previous.documentId);
      if (formerOwner) {
        formerOwner.delete(chunk.id);
        if (formerOwner.size === 0) this.documentChunks.delete(previous.documentId);
      }
    }

    this.chunks.set(chunk.id, chunk);
    this.embeddings.set(chunk.id, embedding);

    let owned = this.documentChunks.get(chunk.documentId);
    if (!owned) {
      owned = new Set();
      this.documentChunks.set(chunk.documentId, owned);
    }
    owned.add(chunk.id);
  }

  /**
   * Scores every stored chunk accepted by `accepts` against `query` and
   * returns the top `k` by descending score.
   */
  private async rankChunks(
    query: string,
    k: number,
    accepts?: (chunk: RAGChunk) => boolean
  ): Promise<RAGSearchResult[]> {
    // `!(k > 0)` also rejects NaN; a negative k would otherwise make
    // `slice(0, k)` return "all but the last |k|" results.
    if (this.chunks.size === 0 || !(k > 0)) return [];

    const queryEmbedding = await this.embeddingModel.embedSingle(query);

    const results: RAGSearchResult[] = [];
    for (const [chunkId, chunk] of this.chunks) {
      if (accepts && !accepts(chunk)) continue;

      const chunkEmbedding = this.embeddings.get(chunkId);
      if (!chunkEmbedding) continue;

      results.push({
        chunk,
        score: this.embeddingModel.cosineSimilarity(queryEmbedding, chunkEmbedding),
        document: {
          id: chunk.documentId,
          path: chunk.documentId // Using documentId as path for now
        }
      });
    }

    results.sort((a, b) => b.score - a.score);
    return results.slice(0, k);
  }
}