UNPKG

@escher-dbai/rag-module

Version:

Enterprise RAG module with chat context storage, vector search, and session management. Complete chat history retrieval and streaming content extraction for Electron apps.

477 lines (384 loc) 13.3 kB
const fs = require('fs-extra');
const path = require('path');
const { EventEmitter } = require('events');
const { v4: uuidv4 } = require('uuid');

/**
 * Metadata fields that are mirrored into the on-disk search index, paired
 * with the name of the index map that holds them.
 */
const INDEXED_FIELDS = Object.freeze([
  { bucket: 'byService', field: 'service' },
  { bucket: 'byRegion', field: 'region' },
  { bucket: 'byCloud', field: 'cloud' }
]);

/**
 * Build a fresh, empty search index structure.
 * @returns {object} Index with empty bucket maps and a current timestamp.
 */
function createEmptySearchIndex() {
  return {
    byService: {},
    byRegion: {},
    byCloud: {},
    byKeywords: {},
    lastUpdated: new Date().toISOString()
  };
}

/**
 * Make sure a search index read from disk has every bucket map the class
 * dereferences, so a partially written or hand-edited file cannot crash
 * later lookups.
 * @param {*} raw - Parsed JSON content of the index file.
 * @returns {object} Index object with all bucket maps present.
 */
function normalizeSearchIndex(raw) {
  const source = raw !== null && typeof raw === 'object' ? raw : {};
  const index = { ...createEmptySearchIndex(), ...source };
  for (const { bucket } of INDEXED_FIELDS) {
    if (index[bucket] === null || typeof index[bucket] !== 'object') {
      index[bucket] = {};
    }
  }
  return index;
}

/**
 * Remove a document id from every bucket of one index map, optionally
 * sparing a single bucket key.
 * @param {object} buckets - Map of bucket key -> array of document ids.
 * @param {*} documentId - Id to remove.
 * @param {?string} [keepKey=null] - Bucket key that must keep the id.
 */
function dropFromBuckets(buckets, documentId, keepKey = null) {
  for (const key of Object.keys(buckets)) {
    if (key === keepKey || !Array.isArray(buckets[key])) continue;
    const position = buckets[key].indexOf(documentId);
    if (position !== -1) {
      buckets[key].splice(position, 1);
    }
  }
}

/**
 * Local File-Based Vector Store
 *
 * Pure JavaScript implementation (no native vector database): documents and
 * their embeddings are kept in a JSON file under `<basePath>/data`, next to a
 * small JSON search index keyed by the `service`, `region` and `cloud`
 * metadata fields. Similarity search is a brute-force cosine scan.
 *
 * Events emitted: 'initialized', 'document-inserted', 'document-updated',
 * 'document-deleted' and 'error' (payload is a message string). 'error' is
 * only emitted when at least one listener is registered, because emitting
 * 'error' on an EventEmitter without listeners throws.
 *
 * NOTE(review): every mutation is a read-modify-write of the whole documents
 * file with no locking, so overlapping calls can presumably lose updates -
 * confirm that callers serialize writes.
 */
class LocalFileVectorStore extends EventEmitter {
  /**
   * @param {string} basePath - Directory under which the `data` folder lives.
   * @param {*} configManager - Stored on the instance; not used by this class.
   * @param {*} encryptionService - Stored on the instance; not used by this class.
   */
  constructor(basePath, configManager, encryptionService) {
    super();
    this.basePath = basePath;
    this.configManager = configManager;
    this.encryptionService = encryptionService;

    this.dataPath = path.join(basePath, 'data');
    this.documentsFile = path.join(this.dataPath, 'documents.json');
    this.searchIndexFile = path.join(this.dataPath, 'search-index.json');

    this.dimensions = null;
    this.initialized = false;

    // In-memory cache for performance
    this.documentCache = new Map();
    this.searchIndex = null;
    this.maxCacheSize = 1000;
  }

  /**
   * Record the embedding dimensionality reported in stats and written into a
   * newly created documents file.
   * @param {number} dimensions
   */
  setDimensions(dimensions) {
    this.dimensions = dimensions;
  }

  /**
   * Create the data directory and both JSON files when missing, then load the
   * search index into memory.
   * @returns {Promise<boolean>} Resolves to true on success.
   * @throws Rethrows any file-system error after reporting it.
   */
  async initialize() {
    try {
      await fs.ensureDir(this.dataPath);

      // Initialize documents file if it doesn't exist
      if (!await fs.pathExists(this.documentsFile)) {
        await this._saveDocuments(this._emptyDocumentStore());
      }

      // Initialize search index if it doesn't exist
      if (!await fs.pathExists(this.searchIndexFile)) {
        await this._saveSearchIndex(createEmptySearchIndex());
      }

      // Load search index into memory
      this.searchIndex = await this._loadSearchIndex();

      this.initialized = true;
      this.emit('initialized', 'Local file vector store initialized');
      return true;
    } catch (error) {
      this._emitError(`Failed to initialize local file vector store: ${error.message}`);
      throw error;
    }
  }

  /**
   * Insert a document, or replace the stored document with the same id.
   * `created_at` is preserved on replacement; `updated_at` is always refreshed.
   * @param {{id: *, content: *, embedding: number[], metadata: (object|undefined)}} document
   * @returns {Promise<string>} A freshly generated UUID. It is not persisted
   *   and is unrelated to `document.id`.
   * @throws Rethrows any load/save error after reporting it.
   */
  async insertDocument(document) {
    this._ensureInitialized();

    try {
      const data = await this._loadDocuments();

      // Check if document already exists (update case)
      const existingIndex = data.documents.findIndex(doc => doc.id === document.id);
      const previousMetadata = existingIndex === -1
        ? null
        : data.documents[existingIndex].metadata;
      const now = new Date().toISOString();

      const newDocument = {
        id: document.id,
        content: document.content,
        embedding: document.embedding,
        metadata: {
          ...document.metadata,
          created_at: previousMetadata && previousMetadata.created_at
            ? previousMetadata.created_at
            : now,
          updated_at: now
        }
      };

      if (existingIndex !== -1) {
        data.documents[existingIndex] = newDocument;
      } else {
        data.documents.push(newDocument);
      }

      // The index is written first: if the document save then fails, the
      // leftover index entry is harmless because filtered reads re-check
      // each candidate against the document's own metadata.
      await this._updateSearchIndex(newDocument);

      data.count = data.documents.length;
      data.lastModified = new Date().toISOString();
      await this._saveDocuments(data);

      this._updateCache(newDocument);

      this.emit('document-inserted', { id: document.id });
      return uuidv4(); // Return a unique vector ID
    } catch (error) {
      this._emitError(`Failed to insert document: ${error.message}`);
      throw error;
    }
  }

  /**
   * Brute-force cosine-similarity search over the (optionally filtered)
   * documents.
   * @param {number[]} queryEmbedding - Query vector.
   * @param {object} [options]
   * @param {number} [options.limit=10] - Maximum number of results.
   * @param {number} [options.scoreThreshold=0.5] - Minimum similarity to keep.
   * @param {object} [options.filter] - Optional `service`/`region`/`cloud` filter.
   * @returns {Promise<Array<{id: *, content: *, metadata: object, score: number}>>}
   *   Matches sorted by descending score.
   * @throws When a stored embedding's length differs from the query's, or on
   *   load errors.
   */
  async search(queryEmbedding, options = {}) {
    this._ensureInitialized();

    try {
      const { limit = 10, scoreThreshold = 0.5, filter } = options;

      const documents = await this._getFilteredDocuments(filter);

      const results = [];
      for (const doc of documents) {
        const similarity = this._cosineSimilarity(queryEmbedding, doc.embedding);
        if (similarity >= scoreThreshold) {
          results.push({
            id: doc.id,
            content: doc.content,
            metadata: doc.metadata,
            score: similarity
          });
        }
      }

      // Sort by similarity (highest first) and limit
      return results
        .sort((a, b) => b.score - a.score)
        .slice(0, limit);
    } catch (error) {
      this._emitError(`Search failed: ${error.message}`);
      throw error;
    }
  }

  /**
   * Fetch a single document by id, using the in-memory cache when possible.
   * @param {*} id
   * @returns {Promise<?object>} The stored document, or null when it is
   *   missing or loading failed.
   */
  async getById(id) {
    this._ensureInitialized();

    try {
      if (this.documentCache.has(id)) {
        const cached = this.documentCache.get(id);
        // Re-insert so a cache hit counts as "recently used".
        this._updateCache(cached);
        return cached;
      }

      const data = await this._loadDocuments();
      const document = data.documents.find(doc => doc.id === id);

      if (document) {
        this._updateCache(document);
        return document;
      }

      return null;
    } catch (error) {
      this._emitError(`Failed to get document by ID: ${error.message}`);
      return null;
    }
  }

  /**
   * List documents without their embeddings.
   * @param {object} [options]
   * @param {number} [options.limit=50] - Maximum number of documents.
   * @param {object} [options.filter] - Optional `service`/`region`/`cloud` filter.
   * @returns {Promise<Array<{id: *, content: *, metadata: object}>>}
   * @throws Rethrows load errors after reporting them.
   */
  async listDocuments(options = {}) {
    this._ensureInitialized();

    try {
      const { limit = 50, filter } = options;
      const documents = await this._getFilteredDocuments(filter);

      return documents
        .slice(0, limit)
        .map(doc => ({
          id: doc.id,
          content: doc.content,
          metadata: doc.metadata
        }));
    } catch (error) {
      this._emitError(`Failed to list documents: ${error.message}`);
      throw error;
    }
  }

  /**
   * Shallow-merge `updates` into a stored document; `updates.metadata` is
   * merged key by key into the existing metadata.
   * @param {*} id - Id of the document to update.
   * @param {object} updates - Fields to overwrite. An `id` field is ignored.
   * @returns {Promise<boolean>} true when updated; false when the document
   *   does not exist or an error occurred.
   */
  async updateDocument(id, updates) {
    this._ensureInitialized();

    try {
      const data = await this._loadDocuments();
      const docIndex = data.documents.findIndex(doc => doc.id === id);

      if (docIndex === -1) {
        return false;
      }

      const current = data.documents[docIndex];
      const updated = {
        ...current,
        ...updates,
        // The id is the key used by the cache and the search index, so it
        // must not be changed through an update payload.
        id: current.id,
        metadata: {
          ...current.metadata,
          ...updates.metadata,
          updated_at: new Date().toISOString()
        }
      };
      data.documents[docIndex] = updated;

      // Update search index if metadata changed
      if (updates.metadata) {
        await this._updateSearchIndex(updated);
      }

      data.lastModified = new Date().toISOString();
      await this._saveDocuments(data);

      this._updateCache(updated);

      this.emit('document-updated', { id });
      return true;
    } catch (error) {
      this._emitError(`Failed to update document: ${error.message}`);
      return false;
    }
  }

  /**
   * Delete a document and its index/cache entries.
   * @param {*} id
   * @returns {Promise<boolean>} true when deleted; false when the document
   *   does not exist or an error occurred.
   */
  async deleteDocument(id) {
    this._ensureInitialized();

    try {
      const data = await this._loadDocuments();
      const originalLength = data.documents.length;

      data.documents = data.documents.filter(doc => doc.id !== id);

      if (data.documents.length === originalLength) {
        return false; // Document not found
      }

      await this._removeFromSearchIndex(id);

      data.count = data.documents.length;
      data.lastModified = new Date().toISOString();
      await this._saveDocuments(data);

      this.documentCache.delete(id);

      this.emit('document-deleted', { id });
      return true;
    } catch (error) {
      this._emitError(`Failed to delete document: ${error.message}`);
      return false;
    }
  }

  /**
   * Report basic store statistics.
   * @returns {Promise<object>} `{ totalPoints, dimensions, storageType,
   *   lastModified }`, or the same shape with `totalPoints: 0` and an `error`
   *   message when the documents file cannot be read.
   */
  async getStats() {
    this._ensureInitialized();

    try {
      const data = await this._loadDocuments();
      return {
        totalPoints: data.count,
        dimensions: this.dimensions,
        storageType: 'local-files',
        lastModified: data.lastModified
      };
    } catch (error) {
      return {
        totalPoints: 0,
        dimensions: this.dimensions,
        storageType: 'local-files',
        error: error.message
      };
    }
  }

  // Private methods

  /**
   * Emit an 'error' event only when somebody listens for it. EventEmitter
   * throws when 'error' is emitted without a listener, which would replace
   * the original error or break the "return null/false" contract of callers.
   * @param {string} message
   */
  _emitError(message) {
    if (this.listenerCount('error') > 0) {
      this.emit('error', message);
    }
  }

  /**
   * @returns {object} An empty documents-file payload.
   */
  _emptyDocumentStore() {
    const now = new Date().toISOString();
    return {
      documents: [],
      count: 0,
      dimensions: this.dimensions,
      created: now,
      lastModified: now
    };
  }

  /**
   * Read the documents file.
   * @returns {Promise<object>} Parsed file content, or an empty store when
   *   the file does not exist.
   * @throws On any other read or parse failure. Returning an empty store in
   *   that case would let the next save overwrite data that is merely
   *   unreadable.
   */
  async _loadDocuments() {
    try {
      return await fs.readJson(this.documentsFile);
    } catch (error) {
      if (error && error.code === 'ENOENT') {
        return this._emptyDocumentStore();
      }
      throw error;
    }
  }

  /**
   * Write the documents file.
   * @param {object} data
   */
  async _saveDocuments(data) {
    await fs.writeJson(this.documentsFile, data, { spaces: 2 });
  }

  /**
   * Read the search index file. The index is derived data, so an unreadable
   * file falls back to an empty index instead of failing.
   * @returns {Promise<object>} Index with all bucket maps present.
   */
  async _loadSearchIndex() {
    let raw;
    try {
      raw = await fs.readJson(this.searchIndexFile);
    } catch (error) {
      return createEmptySearchIndex();
    }
    return normalizeSearchIndex(raw);
  }

  /**
   * Write the search index file.
   * @param {object} index
   */
  async _saveSearchIndex(index) {
    await fs.writeJson(this.searchIndexFile, index, { spaces: 2 });
  }

  /**
   * Bring the index in line with a document's current metadata: the id is
   * removed from buckets that no longer apply and added to the ones that do,
   * then the index is persisted.
   * @param {object} document - Document with `id` and `metadata`.
   */
  async _updateSearchIndex(document) {
    const metadata = document.metadata || {};

    for (const { bucket, field } of INDEXED_FIELDS) {
      const buckets = this.searchIndex[bucket];
      const value = metadata[field];
      // Object keys are strings, so compare against the key the value maps to.
      const currentKey = value ? String(value) : null;

      // Without this, changing e.g. `service` leaves the id in its old bucket.
      dropFromBuckets(buckets, document.id, currentKey);

      if (currentKey === null) continue;

      if (!Array.isArray(buckets[currentKey])) {
        buckets[currentKey] = [];
      }
      if (!buckets[currentKey].includes(document.id)) {
        buckets[currentKey].push(document.id);
      }
    }

    this.searchIndex.lastUpdated = new Date().toISOString();
    await this._saveSearchIndex(this.searchIndex);
  }

  /**
   * Remove a document id from every index bucket and persist the index.
   * @param {*} documentId
   */
  async _removeFromSearchIndex(documentId) {
    for (const { bucket } of INDEXED_FIELDS) {
      dropFromBuckets(this.searchIndex[bucket], documentId);
    }

    this.searchIndex.lastUpdated = new Date().toISOString();
    await this._saveSearchIndex(this.searchIndex);
  }

  /**
   * Load documents matching an optional `service`/`region`/`cloud` filter.
   * The search index narrows the candidates, and every candidate is then
   * checked against its own metadata so that filter keys without an index
   * bucket, or stale index entries, cannot produce false matches.
   * @param {object} [filter]
   * @returns {Promise<object[]>} Matching documents (all when no filter).
   */
  async _getFilteredDocuments(filter) {
    const data = await this._loadDocuments();

    if (!filter || Object.keys(filter).length === 0) {
      return data.documents;
    }

    // Intersect the id lists of every filter key that has an index bucket.
    let candidateIds = null;
    for (const { bucket, field } of INDEXED_FIELDS) {
      const wanted = filter[field];
      if (!wanted) continue;

      const ids = this.searchIndex[bucket][wanted];
      if (!Array.isArray(ids)) continue; // no bucket: rely on the metadata check

      const idSet = new Set(ids);
      candidateIds = candidateIds === null
        ? idSet
        : new Set([...candidateIds].filter(id => idSet.has(id)));
    }

    return data.documents.filter(doc => {
      if (candidateIds !== null && !candidateIds.has(doc.id)) {
        return false;
      }
      const metadata = doc.metadata || {};
      return INDEXED_FIELDS.every(
        ({ field }) => !filter[field] || metadata[field] === filter[field]
      );
    });
  }

  /**
   * Cosine similarity of two equal-length numeric vectors.
   * @param {number[]} vecA
   * @param {number[]} vecB
   * @returns {number} Similarity, or 0 when either vector has zero magnitude.
   * @throws {Error} When the vectors differ in length.
   */
  _cosineSimilarity(vecA, vecB) {
    if (vecA.length !== vecB.length) {
      throw new Error('Vectors must have the same dimensions');
    }

    let dotProduct = 0;
    let normA = 0;
    let normB = 0;

    for (let i = 0; i < vecA.length; i++) {
      dotProduct += vecA[i] * vecB[i];
      normA += vecA[i] * vecA[i];
      normB += vecB[i] * vecB[i];
    }

    const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
    return magnitude === 0 ? 0 : dotProduct / magnitude;
  }

  /**
   * Put a document into the cache as the most recently used entry. A Map
   * iterates in insertion order, so its first key is the least recently used
   * one; it is evicted only when a new key would exceed `maxCacheSize`.
   * @param {object} document - Document with an `id`.
   */
  _updateCache(document) {
    if (this.documentCache.has(document.id)) {
      // Delete first so the re-insert moves the entry to the "newest" end.
      this.documentCache.delete(document.id);
    } else if (this.documentCache.size >= this.maxCacheSize) {
      const oldestKey = this.documentCache.keys().next().value;
      this.documentCache.delete(oldestKey);
    }

    this.documentCache.set(document.id, document);
  }

  /**
   * @throws {Error} When `initialize()` has not completed successfully.
   */
  _ensureInitialized() {
    if (!this.initialized) {
      throw new Error('Local file vector store must be initialized before use');
    }
  }
}

module.exports = LocalFileVectorStore;