UNPKG

ruvector-extensions

Version:

Advanced features for ruvector: embeddings, UI, exports, temporal tracking, and persistence

1,041 lines (909 loc) 27.6 kB
/**
 * Database Persistence Module for ruvector-extensions
 *
 * Provides database persistence capabilities including:
 * - Multiple save formats (JSON, Binary, SQLite)
 * - Incremental saves (only changed data)
 * - Snapshot management (create, list, restore, delete)
 * - Export/import functionality
 * - Compression support
 * - Progress callbacks for large operations
 *
 * NOTE: in this module the `binary` format is currently written as plain
 * JSON (MessagePack is only planned) and the `sqlite` format throws
 * "not yet implemented" on both read and write.
 *
 * @module persistence
 */
import { promises as fs } from 'fs';
import { createReadStream, createWriteStream } from 'fs';
import * as path from 'path';
import * as crypto from 'crypto';
import { pipeline } from 'stream/promises';
import type { VectorEntry, DbOptions, DbStats } from 'ruvector';

// VectorDB is a class, not a type - we need to use InstanceType
type VectorDBInstance = any; // Will be replaced with actual instance type

// ============================================================================
// Types and Interfaces
// ============================================================================

/**
 * Supported persistence formats
 */
export type PersistenceFormat = 'json' | 'binary' | 'sqlite';

/**
 * Compression algorithms
 */
export type CompressionType = 'none' | 'gzip' | 'brotli';

/**
 * Progress callback for long-running operations
 */
export type ProgressCallback = (progress: {
  /** Operation being performed */
  operation: string;
  /** Current progress (0-100) */
  percentage: number;
  /** Number of items processed */
  current: number;
  /** Total items to process */
  total: number;
  /** Human-readable message */
  message: string;
}) => void;

/**
 * Persistence configuration options
 */
export interface PersistenceOptions {
  /** Base directory for persistence files */
  baseDir: string;
  /** Default format for saves */
  format?: PersistenceFormat;
  /** Enable compression */
  compression?: CompressionType;
  /** Enable incremental saves */
  incremental?: boolean;
  /** Auto-save interval in milliseconds (0 = disabled) */
  autoSaveInterval?: number;
  /** Maximum number of snapshots to keep */
  maxSnapshots?: number;
  /** Batch size for large operations */
  batchSize?: number;
}

/**
 * Database snapshot metadata
 */
export interface SnapshotMetadata {
  /** Snapshot identifier */
  id: string;
  /** Human-readable name */
  name: string;
  /** Creation timestamp */
  timestamp: number;
  /** Vector count at snapshot time */
  vectorCount: number;
  /** Database dimension */
  dimension: number;
  /** Format used */
  format: PersistenceFormat;
  /** Whether compressed */
  compressed: boolean;
  /** File size in bytes */
  fileSize: number;
  /** Checksum for integrity */
  checksum: string;
  /** Additional metadata */
  metadata?: Record<string, any>;
}

/**
 * Serialized database state
 */
export interface DatabaseState {
  /** Format version for compatibility */
  version: string;
  /** Database configuration */
  options: DbOptions;
  /** Database statistics */
  stats: DbStats;
  /** Vector entries */
  vectors: VectorEntry[];
  /** Index state (opaque) */
  indexState?: any;
  /** Additional metadata */
  metadata?: Record<string, any>;
  /** Timestamp of save */
  timestamp: number;
  /** Checksum for integrity */
  checksum?: string;
}

/**
 * Incremental save state
 */
interface IncrementalState {
  /** Last save timestamp */
  lastSave: number;
  /** Vector IDs at last save */
  vectorIds: Set<string>;
  /** Checksum of last save */
  checksum: string;
}

/**
 * Export options
 */
export interface ExportOptions {
  /** Output file path */
  path: string;
  /** Export format */
  format?: PersistenceFormat;
  /** Enable compression */
  compress?: boolean;
  /** Include index state */
  includeIndex?: boolean;
  /** Progress callback */
  onProgress?: ProgressCallback;
}

/**
 * Import options
 */
export interface ImportOptions {
  /** Input file path */
  path: string;
  /** Expected format (auto-detect if not specified) */
  format?: PersistenceFormat;
  /** Whether to clear database before import */
  clear?: boolean;
  /** Verify checksum */
  verifyChecksum?: boolean;
  /** Progress callback */
  onProgress?: ProgressCallback;
}

//
// ============================================================================
// Database Persistence Manager
// ============================================================================

/**
 * Main persistence manager for VectorDB instances
 *
 * Directory preparation and loading of the incremental state start in the
 * constructor; every public operation that touches disk waits for that
 * initialization, so a failure there is reported by the first operation
 * instead of surfacing as an unhandled promise rejection.
 *
 * Calls into the wrapped database are awaited, so the manager works whether
 * the database methods return values directly or return promises.
 *
 * @example
 * ```typescript
 * const db = new VectorDB({ dimension: 384 });
 * const persistence = new DatabasePersistence(db, {
 *   baseDir: './data',
 *   format: 'binary',
 *   compression: 'gzip',
 *   incremental: true
 * });
 *
 * // Save database
 * await persistence.save({ onProgress: (p) => console.log(p.message) });
 *
 * // Create snapshot
 * const snapshot = await persistence.createSnapshot('before-update');
 *
 * // Restore from snapshot
 * await persistence.restoreSnapshot(snapshot.id);
 * ```
 */
export class DatabasePersistence {
  private db: VectorDBInstance;
  private options: Required<PersistenceOptions>;
  private incrementalState: IncrementalState | null = null;
  private autoSaveTimer: NodeJS.Timeout | null = null;
  /** True while an auto-save is running, so timer ticks never overlap. */
  private autoSaveInFlight = false;
  /** Settles once the directories exist and the incremental state is loaded. */
  private readonly ready: Promise<void>;

  /**
   * Create a new database persistence manager
   *
   * @param db - VectorDB instance to manage
   * @param options - Persistence configuration. A `batchSize` that is not a
   *   finite number >= 1 falls back to the default of 1000.
   */
  constructor(db: VectorDBInstance, options: PersistenceOptions) {
    this.db = db;

    const requestedBatchSize = options.batchSize ?? 1000;
    this.options = {
      baseDir: options.baseDir,
      format: options.format || 'json',
      compression: options.compression || 'none',
      incremental: options.incremental ?? false,
      autoSaveInterval: options.autoSaveInterval ?? 0,
      maxSnapshots: options.maxSnapshots ?? 10,
      // A batch size of 0 (or less) would make the batched insert loop spin forever.
      batchSize:
        Number.isFinite(requestedBatchSize) && requestedBatchSize >= 1
          ? Math.floor(requestedBatchSize)
          : 1000,
    };

    this.ready = this.initialize();
    // Mark the promise as handled; operations that await `ready` still see the rejection.
    this.ready.catch(() => {});
  }

  /**
   * Initialize persistence system: create the base and snapshot directories,
   * start auto-save when an interval is configured, and load the incremental
   * state when incremental saves are enabled.
   */
  private async initialize(): Promise<void> {
    // Create base directory if it doesn't exist
    await fs.mkdir(this.options.baseDir, { recursive: true });
    await fs.mkdir(path.join(this.options.baseDir, 'snapshots'), { recursive: true });

    // Start auto-save if configured
    if (this.options.autoSaveInterval > 0) {
      this.startAutoSave();
    }

    // Load incremental state if exists
    if (this.options.incremental) {
      await this.loadIncrementalState();
    }
  }

  // ==========================================================================
  // Save Operations
  // ==========================================================================

  /**
   * Save database to disk
   *
   * @param options - Save options. `compress` defaults to whether a
   *   compression algorithm is configured; `path` defaults to
   *   `database.<ext>[.<algorithm>]` inside the base directory.
   * @returns Path to saved file
   * @throws If initialization failed, the format is unsupported, or writing fails
   */
  async save(options: {
    path?: string;
    format?: PersistenceFormat;
    compress?: boolean;
    onProgress?: ProgressCallback;
  } = {}): Promise<string> {
    await this.ready;

    const format = options.format || this.options.format;
    const compress = options.compress ?? (this.options.compression !== 'none');
    const savePath = options.path || this.getDefaultSavePath(format, compress);

    const state = await this.serializeDatabase(options.onProgress);

    if (options.onProgress) {
      options.onProgress({
        operation: 'save',
        percentage: 80,
        current: 4,
        total: 5,
        message: 'Writing to disk...',
      });
    }

    await this.writeStateToFile(state, savePath, format, compress);

    if (this.options.incremental) {
      await this.updateIncrementalState(state);
    }

    if (options.onProgress) {
      options.onProgress({
        operation: 'save',
        percentage: 100,
        current: 5,
        total: 5,
        message: 'Save completed',
      });
    }

    return savePath;
  }

  /**
   * Save only when the set of vector IDs changed since the last save
   *
   * Change detection compares vector IDs only. When a change is found a full
   * save is performed (no delta encoding is implemented).
   *
   * @param options - Save options
   * @returns Path to saved file or null if no changes
   */
  async saveIncremental(options: {
    path?: string;
    format?: PersistenceFormat;
    onProgress?: ProgressCallback;
  } = {}): Promise<string | null> {
    // Wait for the persisted incremental state to be loaded before deciding.
    await this.ready;

    const previous = this.incrementalState;
    if (!previous) {
      // First save, do full save
      return this.save(options);
    }

    const currentIds = await this.getAllVectorIds();
    const currentIdSet = new Set(currentIds);

    // Detect changes
    const added = currentIds.filter(id => !previous.vectorIds.has(id));
    const removed = Array.from(previous.vectorIds).filter(id => !currentIdSet.has(id));

    if (added.length === 0 && removed.length === 0) {
      // No changes
      return null;
    }

    if (options.onProgress) {
      options.onProgress({
        operation: 'incremental-save',
        percentage: 20,
        current: 1,
        total: 5,
        message: `Found ${added.length} new and ${removed.length} removed vectors`,
      });
    }

    // For now, do a full save with changes
    // In a production system, you'd implement delta encoding
    return this.save(options);
  }

  /**
   * Load database from disk
   *
   * @param options - Load options. `format` is detected from the file
   *   extension when omitted. `clear` (default `true`) controls whether the
   *   database is emptied before the loaded vectors are inserted.
   * @throws If the file cannot be read or parsed, or if `verifyChecksum` is
   *   set and the stored checksum does not match the recomputed one
   */
  async load(options: {
    path: string;
    format?: PersistenceFormat;
    verifyChecksum?: boolean;
    clear?: boolean;
    onProgress?: ProgressCallback;
  }): Promise<void> {
    await this.ready;

    const format = options.format || this.detectFormat(options.path);

    if (options.onProgress) {
      options.onProgress({
        operation: 'load',
        percentage: 10,
        current: 1,
        total: 5,
        message: 'Reading from disk...',
      });
    }

    const state = await this.readStateFromFile(options.path, format);

    if (options.verifyChecksum && state.checksum) {
      if (options.onProgress) {
        options.onProgress({
          operation: 'load',
          percentage: 30,
          current: 2,
          total: 5,
          message: 'Verifying checksum...',
        });
      }

      const computed = this.computeChecksum(state);
      if (computed !== state.checksum) {
        throw new Error('Checksum verification failed - file may be corrupted');
      }
    }

    await this.deserializeDatabase(state, options.onProgress, options.clear ?? true);

    if (options.onProgress) {
      options.onProgress({
        operation: 'load',
        percentage: 100,
        current: 5,
        total: 5,
        message: 'Load completed',
      });
    }
  }

  // ==========================================================================
  // Snapshot Management
  // ==========================================================================

  /**
   * Create a snapshot of the current database state
   *
   * Writes `<id>.<format>` plus `<id>.meta.json` into the `snapshots`
   * directory and then removes snapshots beyond `maxSnapshots`.
   *
   * @param name - Human-readable snapshot name
   * @param metadata - Additional metadata to store
   * @returns Snapshot metadata
   */
  async createSnapshot(
    name: string,
    metadata?: Record<string, any>
  ): Promise<SnapshotMetadata> {
    await this.ready;

    const id = crypto.randomUUID();
    const timestamp = Date.now();
    const stats = await this.db.stats();
    const compressed = this.options.compression !== 'none';
    const snapshotsDir = path.join(this.options.baseDir, 'snapshots');
    const snapshotPath = path.join(snapshotsDir, `${id}.${this.options.format}`);

    await this.save({
      path: snapshotPath,
      format: this.options.format,
      compress: compressed,
    });

    const fileStats = await fs.stat(snapshotPath);
    const checksum = await this.computeFileChecksum(snapshotPath);

    const snapshotMetadata: SnapshotMetadata = {
      id,
      name,
      timestamp,
      vectorCount: stats.count,
      dimension: stats.dimension,
      format: this.options.format,
      compressed,
      fileSize: fileStats.size,
      checksum,
      metadata,
    };

    // Save metadata
    const metadataPath = path.join(snapshotsDir, `${id}.meta.json`);
    await fs.writeFile(metadataPath, JSON.stringify(snapshotMetadata, null, 2));

    // Clean up old snapshots
    await this.cleanupOldSnapshots();

    return snapshotMetadata;
  }

  /**
   * List all available snapshots
   *
   * @returns Array of snapshot metadata, sorted by timestamp (newest first)
   */
  async listSnapshots(): Promise<SnapshotMetadata[]> {
    await this.ready;

    const snapshotsDir = path.join(this.options.baseDir, 'snapshots');
    const files = await fs.readdir(snapshotsDir);
    const metadataFiles = files.filter(f => f.endsWith('.meta.json'));

    const snapshots: SnapshotMetadata[] = [];
    for (const file of metadataFiles) {
      const content = await fs.readFile(path.join(snapshotsDir, file), 'utf-8');
      snapshots.push(JSON.parse(content));
    }

    return snapshots.sort((a, b) => b.timestamp - a.timestamp);
  }

  /**
   * Restore database from a snapshot
   *
   * The existing database content is replaced by the snapshot content.
   *
   * @param snapshotId - Snapshot ID to restore
   * @param options - Restore options. With `verifyChecksum` the snapshot
   *   file's SHA-256 is compared with the value stored in its metadata.
   * @throws If the snapshot metadata cannot be read or the checksum differs
   */
  async restoreSnapshot(
    snapshotId: string,
    options: {
      verifyChecksum?: boolean;
      onProgress?: ProgressCallback;
    } = {}
  ): Promise<void> {
    const snapshotsDir = path.join(this.options.baseDir, 'snapshots');
    const metadataPath = path.join(snapshotsDir, `${snapshotId}.meta.json`);

    let metadata: SnapshotMetadata;
    try {
      const content = await fs.readFile(metadataPath, 'utf-8');
      metadata = JSON.parse(content);
    } catch (error) {
      throw new Error(`Snapshot ${snapshotId} not found`);
    }

    const snapshotPath = path.join(snapshotsDir, `${snapshotId}.${metadata.format}`);

    if (options.verifyChecksum) {
      if (options.onProgress) {
        options.onProgress({
          operation: 'restore',
          percentage: 10,
          current: 1,
          total: 5,
          message: 'Verifying snapshot integrity...',
        });
      }

      const checksum = await this.computeFileChecksum(snapshotPath);
      if (checksum !== metadata.checksum) {
        throw new Error('Snapshot checksum verification failed - file may be corrupted');
      }
    }

    await this.load({
      path: snapshotPath,
      format: metadata.format,
      verifyChecksum: false, // Already verified above if needed
      onProgress: options.onProgress,
    });
  }

  /**
   * Delete a snapshot
   *
   * Removal of the data and metadata files is best-effort: unlink failures
   * are ignored.
   *
   * @param snapshotId - Snapshot ID to delete
   * @throws If the snapshot metadata cannot be read
   */
  async deleteSnapshot(snapshotId: string): Promise<void> {
    const snapshotsDir = path.join(this.options.baseDir, 'snapshots');
    const metadataPath = path.join(snapshotsDir, `${snapshotId}.meta.json`);

    let metadata: SnapshotMetadata;
    try {
      const content = await fs.readFile(metadataPath, 'utf-8');
      metadata = JSON.parse(content);
    } catch (error) {
      throw new Error(`Snapshot ${snapshotId} not found`);
    }

    const snapshotPath = path.join(snapshotsDir, `${snapshotId}.${metadata.format}`);

    await Promise.all([
      fs.unlink(snapshotPath).catch(() => {}),
      fs.unlink(metadataPath).catch(() => {}),
    ]);
  }

  // ==========================================================================
  // Export/Import
  // ==========================================================================

  /**
   * Export database to a file
   *
   * @param options - Export options. Format defaults to `json`, compression
   *   to off. When `compress` is set and no algorithm is configured on the
   *   manager, gzip is used.
   */
  async export(options: ExportOptions): Promise<void> {
    await this.ready;

    const format = options.format || 'json';
    const compress = options.compress ?? false;

    const state = await this.serializeDatabase(options.onProgress);

    if (!options.includeIndex) {
      delete state.indexState;
    }

    await this.writeStateToFile(state, options.path, format, compress);
  }

  /**
   * Import database from a file
   *
   * @param options - Import options. When `clear` is explicitly `false` the
   *   imported vectors are inserted alongside the existing content; when it
   *   is `true` or omitted the database is emptied first. Clearing happens
   *   only after the file was read (and verified, if requested), so a failed
   *   read does not wipe the database.
   */
  async import(options: ImportOptions): Promise<void> {
    await this.load({
      path: options.path,
      format: options.format,
      verifyChecksum: options.verifyChecksum,
      clear: options.clear ?? true,
      onProgress: options.onProgress,
    });
  }

  // ==========================================================================
  // Auto-Save
  // ==========================================================================

  /**
   * Start automatic saves at configured interval
   *
   * Does nothing when a timer is already running or when the configured
   * interval is not positive (0 = disabled).
   */
  startAutoSave(): void {
    if (this.autoSaveTimer) {
      return; // Already running
    }
    if (!(this.options.autoSaveInterval > 0)) {
      return; // Disabled
    }

    this.autoSaveTimer = setInterval(() => {
      void this.runAutoSave();
    }, this.options.autoSaveInterval);
  }

  /**
   * Perform one auto-save tick. Skips the tick when the previous auto-save is
   * still running; failures are logged and never thrown.
   */
  private async runAutoSave(): Promise<void> {
    if (this.autoSaveInFlight) {
      return;
    }

    this.autoSaveInFlight = true;
    try {
      if (this.options.incremental) {
        await this.saveIncremental();
      } else {
        await this.save();
      }
    } catch (error) {
      console.error('Auto-save failed:', error);
    } finally {
      this.autoSaveInFlight = false;
    }
  }

  /**
   * Stop automatic saves
   */
  stopAutoSave(): void {
    if (this.autoSaveTimer) {
      clearInterval(this.autoSaveTimer);
      this.autoSaveTimer = null;
    }
  }

  /**
   * Cleanup and shutdown: stops the auto-save timer and, when auto-save was
   * configured, performs one final full save.
   */
  async shutdown(): Promise<void> {
    this.stopAutoSave();

    // Do final save if auto-save was enabled
    if (this.options.autoSaveInterval > 0) {
      await this.save();
    }
  }

  // ==========================================================================
  // Private Helper Methods
  // ==========================================================================

  /**
   * Serialize database to state object
   *
   * @param onProgress - Optional progress callback
   * @returns State containing the vectors found by {@link getAllVectorIds},
   *   the database stats, a timestamp and a SHA-256 checksum
   */
  private async serializeDatabase(
    onProgress?: ProgressCallback
  ): Promise<DatabaseState> {
    if (onProgress) {
      onProgress({
        operation: 'serialize',
        percentage: 10,
        current: 1,
        total: 5,
        message: 'Collecting database statistics...',
      });
    }

    const stats = await this.db.stats();
    const vectors: VectorEntry[] = [];

    if (onProgress) {
      onProgress({
        operation: 'serialize',
        percentage: 30,
        current: 2,
        total: 5,
        message: 'Extracting vectors...',
      });
    }

    // Extract all vectors
    const vectorIds = await this.getAllVectorIds();
    for (let i = 0; i < vectorIds.length; i++) {
      const vector = await this.db.get(vectorIds[i]);
      if (vector) {
        vectors.push(vector);
      }

      if (onProgress && i % this.options.batchSize === 0) {
        const percentage = 30 + Math.floor((i / vectorIds.length) * 40);
        onProgress({
          operation: 'serialize',
          percentage,
          current: i,
          total: vectorIds.length,
          message: `Extracted ${i}/${vectorIds.length} vectors...`,
        });
      }
    }

    const state: DatabaseState = {
      version: '1.0.0',
      options: {
        dimension: stats.dimension,
        metric: stats.metric as any,
      },
      stats,
      vectors,
      timestamp: Date.now(),
    };

    if (onProgress) {
      onProgress({
        operation: 'serialize',
        percentage: 90,
        current: 4,
        total: 5,
        message: 'Computing checksum...',
      });
    }

    state.checksum = this.computeChecksum(state);

    return state;
  }

  /**
   * Deserialize state object into database
   *
   * @param state - State to insert
   * @param onProgress - Optional progress callback
   * @param clear - Whether to empty the database before inserting (default `true`)
   */
  private async deserializeDatabase(
    state: DatabaseState,
    onProgress?: ProgressCallback,
    clear: boolean = true
  ): Promise<void> {
    if (clear) {
      if (onProgress) {
        onProgress({
          operation: 'deserialize',
          percentage: 40,
          current: 2,
          total: 5,
          message: 'Clearing existing data...',
        });
      }

      await this.db.clear();
    }

    if (onProgress) {
      onProgress({
        operation: 'deserialize',
        percentage: 50,
        current: 3,
        total: 5,
        message: 'Inserting vectors...',
      });
    }

    // Insert vectors in batches (batchSize is guaranteed >= 1 by the constructor)
    const step = this.options.batchSize;
    for (let i = 0; i < state.vectors.length; i += step) {
      const batch = state.vectors.slice(i, i + step);
      await this.db.insertBatch(batch);

      if (onProgress) {
        const percentage = 50 + Math.floor((i / state.vectors.length) * 40);
        onProgress({
          operation: 'deserialize',
          percentage,
          current: i,
          total: state.vectors.length,
          message: `Inserted ${i}/${state.vectors.length} vectors...`,
        });
      }
    }

    if (onProgress) {
      onProgress({
        operation: 'deserialize',
        percentage: 95,
        current: 4,
        total: 5,
        message: 'Rebuilding index...',
      });
    }

    // Rebuild index
    await this.db.buildIndex();
  }

  /**
   * Compression algorithm actually applied when compression is requested:
   * the configured one, or gzip when the manager is configured with 'none'.
   */
  private resolveCompression(): 'gzip' | 'brotli' {
    return this.options.compression === 'brotli' ? 'brotli' : 'gzip';
  }

  /**
   * Write state to file in specified format
   *
   * `json` is pretty-printed unless compressed; `binary` is currently written
   * as compact JSON; `sqlite` is not implemented.
   *
   * @throws If the format is `sqlite` or unknown
   */
  private async writeStateToFile(
    state: DatabaseState,
    filePath: string,
    format: PersistenceFormat,
    compress: boolean
  ): Promise<void> {
    await fs.mkdir(path.dirname(filePath), { recursive: true });

    let data: Buffer;

    switch (format) {
      case 'json':
        data = Buffer.from(JSON.stringify(state, null, compress ? 0 : 2));
        break;
      case 'binary':
        // Use simple JSON for now - in production, use MessagePack
        data = Buffer.from(JSON.stringify(state));
        break;
      case 'sqlite':
        // SQLite implementation would go here
        throw new Error('SQLite format not yet implemented');
      default:
        throw new Error(`Unsupported format: ${format}`);
    }

    if (compress) {
      const { gzip, brotliCompress } = await import('zlib');
      const { promisify } = await import('util');

      // Never silently skip compression when the caller asked for it.
      if (this.resolveCompression() === 'brotli') {
        data = await promisify(brotliCompress)(data);
      } else {
        data = await promisify(gzip)(data);
      }
    }

    await fs.writeFile(filePath, data);
  }

  /**
   * Read state from file in specified format
   *
   * Gzip data is recognised by its magic number. Brotli has no magic number,
   * so it is attempted only after the content failed to parse as plain JSON.
   *
   * @throws If the format is `sqlite` or unknown, or the content cannot be
   *   parsed (the original JSON parse error is rethrown when the brotli
   *   attempt fails as well)
   */
  private async readStateFromFile(
    filePath: string,
    format: PersistenceFormat
  ): Promise<DatabaseState> {
    const raw: Buffer = await fs.readFile(filePath);

    if (format === 'sqlite') {
      throw new Error('SQLite format not yet implemented');
    }
    if (format !== 'json' && format !== 'binary') {
      throw new Error(`Unsupported format: ${format}`);
    }

    const { gunzip, brotliDecompress } = await import('zlib');
    const { promisify } = await import('util');
    const gunzipAsync = promisify(gunzip);
    const brotliAsync = promisify(brotliDecompress);

    if (this.isCompressed(raw)) {
      let inflated: Buffer;
      try {
        // Try gzip first
        inflated = await gunzipAsync(raw);
      } catch {
        // Try brotli
        inflated = await brotliAsync(raw);
      }
      return JSON.parse(inflated.toString());
    }

    try {
      return JSON.parse(raw.toString());
    } catch (parseError) {
      let inflated: Buffer;
      try {
        inflated = await brotliAsync(raw);
      } catch {
        // Not brotli either: report the original parse failure.
        throw parseError;
      }
      return JSON.parse(inflated.toString());
    }
  }

  /**
   * Get all vector IDs from database
   *
   * Workaround: probes the string IDs "0", "1", ... up to twice the reported
   * vector count and stops once `count` entries were found. Vectors stored
   * under other IDs are not discovered.
   */
  private async getAllVectorIds(): Promise<string[]> {
    // This is a workaround - in production, VectorDB should provide an iterator
    const stats = await this.db.stats();
    const ids: string[] = [];

    // Try to get vectors by attempting sequential IDs
    // This is inefficient and should be replaced with a proper API
    for (let i = 0; i < stats.count * 2; i++) {
      const vector = await this.db.get(String(i));
      if (vector) {
        ids.push(vector.id);
      }
      if (ids.length >= stats.count) {
        break;
      }
    }

    return ids;
  }

  /**
   * Compute checksum of state object: SHA-256 (hex) over the JSON form of the
   * state with its `checksum` field removed.
   */
  private computeChecksum(state: DatabaseState): string {
    const { checksum: _ignored, ...stateWithoutChecksum } = state;
    const data = JSON.stringify(stateWithoutChecksum);
    return crypto.createHash('sha256').update(data).digest('hex');
  }

  /**
   * Compute checksum of file: SHA-256 (hex) of the file content, streamed.
   */
  private async computeFileChecksum(filePath: string): Promise<string> {
    return new Promise((resolve, reject) => {
      const hash = crypto.createHash('sha256');
      const stream = createReadStream(filePath);

      stream.on('data', data => hash.update(data));
      stream.on('end', () => resolve(hash.digest('hex')));
      stream.on('error', reject);
    });
  }

  /**
   * Detect file format from extension
   *
   * A trailing compression suffix (`.gzip`, `.gz`, `.brotli`, `.br`) is
   * skipped so that `database.bin.gzip` is detected as `binary`. Unknown
   * extensions fall back to the configured default format.
   */
  private detectFormat(filePath: string): PersistenceFormat {
    let ext = path.extname(filePath).toLowerCase();
    if (ext === '.gzip' || ext === '.gz' || ext === '.brotli' || ext === '.br') {
      ext = path.extname(filePath.slice(0, -ext.length)).toLowerCase();
    }

    if (ext === '.json') return 'json';
    if (ext === '.bin' || ext === '.msgpack') return 'binary';
    if (ext === '.db' || ext === '.sqlite') return 'sqlite';
    return this.options.format;
  }

  /**
   * Check if data starts with the gzip magic number (1f 8b)
   */
  private isCompressed(data: Buffer): boolean {
    // Gzip magic number: 1f 8b
    if (data[0] === 0x1f && data[1] === 0x8b) return true;
    // Brotli doesn't have a magic number; callers fall back to trying it
    return false;
  }

  /**
   * Get default save path: `database.<json|bin|db>` in the base directory,
   * followed by `.<algorithm>` when compression is requested.
   */
  private getDefaultSavePath(format: PersistenceFormat, compress: boolean): string {
    const ext = format === 'json' ? 'json' : format === 'binary' ? 'bin' : 'db';
    const compressExt = compress ? `.${this.resolveCompression()}` : '';
    return path.join(this.options.baseDir, `database.${ext}${compressExt}`);
  }

  /**
   * Load incremental state from `.incremental.json` in the base directory.
   * A missing or unreadable file leaves the state unset.
   */
  private async loadIncrementalState(): Promise<void> {
    const statePath = path.join(this.options.baseDir, '.incremental.json');

    try {
      const content = await fs.readFile(statePath, 'utf-8');
      const data = JSON.parse(content);

      this.incrementalState = {
        lastSave: data.lastSave,
        vectorIds: new Set(data.vectorIds),
        checksum: data.checksum,
      };
    } catch {
      // No incremental state yet
    }
  }

  /**
   * Update incremental state after save and persist it to `.incremental.json`
   */
  private async updateIncrementalState(state: DatabaseState): Promise<void> {
    const nextState: IncrementalState = {
      lastSave: Date.now(),
      vectorIds: new Set(state.vectors.map(v => v.id)),
      checksum: state.checksum || '',
    };
    this.incrementalState = nextState;

    const statePath = path.join(this.options.baseDir, '.incremental.json');
    await fs.writeFile(
      statePath,
      JSON.stringify({
        lastSave: nextState.lastSave,
        vectorIds: Array.from(nextState.vectorIds),
        checksum: nextState.checksum,
      })
    );
  }

  /**
   * Clean up old snapshots beyond max limit (oldest are deleted first)
   */
  private async cleanupOldSnapshots(): Promise<void> {
    const snapshots = await this.listSnapshots();

    if (snapshots.length <= this.options.maxSnapshots) {
      return;
    }

    // listSnapshots() is newest-first, so everything past the limit is the oldest.
    const toDelete = snapshots.slice(this.options.maxSnapshots);
    for (const snapshot of toDelete) {
      await this.deleteSnapshot(snapshot.id);
    }
  }
}

// ============================================================================
// Utility Functions
// ============================================================================

/**
 * Format file size in human-readable format
 *
 * Uses 1024-based units up to TB and always prints two decimals.
 *
 * @param bytes - File size in bytes
 * @returns Formatted string (e.g., "1.50 MB")
 */
export function formatFileSize(bytes: number): string {
  const units = ['B', 'KB', 'MB', 'GB', 'TB'];
  let size = bytes;
  let unitIndex = 0;

  while (size >= 1024 && unitIndex < units.length - 1) {
    size /= 1024;
    unitIndex++;
  }

  return `${size.toFixed(2)} ${units[unitIndex]}`;
}

/**
 * Format timestamp as ISO string
 *
 * @param timestamp - Unix timestamp in milliseconds
 * @returns ISO formatted date string
 */
export function formatTimestamp(timestamp: number): string {
  return new Date(timestamp).toISOString();
}

/**
 * Estimate memory usage of database state
 *
 * Rough estimate: per vector `dimension * 4` bytes plus a flat 100 bytes for
 * metadata, plus the length of the state's JSON form as overhead.
 *
 * @param state - Database state
 * @returns Estimated memory usage in bytes
 */
export function estimateMemoryUsage(state: DatabaseState): number {
  // Rough estimation
  const vectorSize = state.stats.dimension * 4; // 4 bytes per float
  const metadataSize = 100; // Average metadata size
  const totalVectorSize = state.vectors.length * (vectorSize + metadataSize);
  const overheadSize = JSON.stringify(state).length;

  return totalVectorSize + overheadSize;
}