@n2flowjs/nbase
Version:
Neural Vector Database for efficient similarity search
1,127 lines (1,000 loc) • 86.7 kB
text/typescript
// --- START OF FILE partitioned_vector_db.ts ---
// partitioned_db_optimized.ts
import { EventEmitter } from 'events';
import { existsSync, promises as fs, mkdirSync } from 'fs';
import { LRUCache } from 'lru-cache'; // Using a robust LRU cache library
import path from 'path';
import HNSW from '../ann/hnsw'; // Assuming HNSW is a class for the clustering algorithm
import { log } from '../utils/log';
import defaultSystemConfiguration from '../config';
import { BuildIndexHNSWOptions, ClusteredVectorDBOptions, DBStats, DistanceMetric, HNSWStats, PartitionConfig, PartitionedDBEventData, PartitionedDBStats, PartitionedVectorDBInterface, PartitionedVectorDBOptions, SearchOptions, SearchResult, TypedEventEmitter, Vector, VectorData } from '../types'; // Adjust path as needed
import { ClusteredVectorDB } from './clustered_vector_db';
// --- Types ---
/** Fallback for the `partitionCapacity` option when the caller leaves it unset (or passes a falsy value). */
const DEFAULT_PARTITION_CAPACITY = 100000;
/** Fallback for the `maxActivePartitions` option; this value becomes the `max` size of the loaded-partition LRU cache. */
const DEFAULT_MAX_ACTIVE_PARTITIONS = 3; // Keep a few partitions warm
/** Name of the sub-directory, inside each partition directory, that holds the HNSW index file. */
const HNSW_INDEX_DIR_NAME = 'hnsw';
/** File name of the persisted HNSW index inside the HNSW sub-directory. */
const HNSW_INDEX_FILE_NAME = 'hnsw_index.json'; // NOTE(review): the name says JSON although an earlier note here mentioned a binary format; the actual on-disk format is decided by the HNSW class — confirm
/**
* PartitionedVectorDB distributes vectors across multiple ClusteredVectorDB partitions
* for improved scalability and performance with very large datasets.
* It uses an LRU cache to manage loaded partitions in memory and integrates HNSW index persistence.
*
* Storage strategy:
* - Uses ClusteredVectorDB as the partition implementation
* - Each partition uses optimized binary storage for vectors and JSON for metadata (handled by ClusteredVectorDB)
* - HNSW indices are stored separately per partition.
* - Partitions are stored in separate directories with their own config files
*/
/**
* The `PartitionedVectorDB` class provides a partitioned, in-memory vector database
* with support for clustering, HNSW indexing, and LRU-based partition management.
* It is designed to handle large-scale vector data by dividing it into manageable
* partitions, each with its own configuration and storage.
* ### Features:
* - **Partition Management**: Automatically manages partitions with configurable capacity.
* - **LRU Cache**: Keeps a limited number of partitions in memory for efficient access.
* - **HNSW Indexing**: Supports approximate nearest neighbor search using HNSW indices.
* - **Auto-Partitioning**: Automatically creates and activates new partitions when needed.
* - **Persistence**: Saves and loads partition configurations and data to/from disk.
* - **Event-Driven**: Emits events for lifecycle operations like initialization, partition loading, and errors.
* ### Usage:
* 1. Create an instance of `PartitionedVectorDB` with desired options.
* 2. Use methods like `addVector`, `bulkAdd`, `findNearest`, and `findNearestHNSW` to interact with the database.
* 3. Manage partitions using methods like `createPartition`, `setActivePartition`, and `getPartition`.
* 4. Save and load the database state using `save` and `load`.
* ### Events:
* - `db:initialized`: Emitted when the database is fully initialized.
* - `partition:loaded`: Emitted when a partition is loaded into memory.
* - `partition:unloaded`: Emitted when a partition is evicted from memory.
* - `partition:error`: Emitted when an error occurs during partition operations.
* - `vector:add`: Emitted when a vector is added to a partition.
* - `vector:delete`: Emitted when a vector is deleted from a partition.
* - `db:close`: Emitted when the database is closed.
* ### Example:
* ```typescript
* const db = new PartitionedVectorDB({
* partitionsDir: './data/partitions',
* partitionCapacity: 1000,
* maxActivePartitions: 5,
* autoCreatePartitions: true,
* vectorSize: 128,
* });
*
* await db.initializationPromise; // Wait for initialization
*
* // Add a vector
* const { partitionId, vectorId } = await db.addVector(undefined, [0.1, 0.2, 0.3], { label: 'example' });
*
* // Search for nearest neighbors
* const results = await db.findNearest([0.1, 0.2, 0.3], 5);
*
* // Save the database state
* await db.save();
*
* // Close the database
* await db.close();
* ```
* ### Constructor Options:
* - `partitionsDir`: Directory where partition data is stored.
* - `partitionCapacity`: Maximum number of vectors per partition.
* - `maxActivePartitions`: Maximum number of partitions to keep in memory.
* - `autoCreatePartitions`: Whether to automatically create new partitions when needed.
* - `vectorSize`: Suggested size of vectors (optional).
* - `useCompression`: Whether to enable compression for partition data.
* - `clusterOptions`: Default options for clustered vector databases.
* - `autoLoadHNSW`: Whether to automatically load HNSW indices.
* ### Methods:
* - `addVector`: Adds a single vector to the active partition.
* - `bulkAdd`: Adds multiple vectors across partitions.
* - `findNearest`: Finds nearest neighbors using standard search.
* - `findNearestHNSW`: Finds nearest neighbors using HNSW indices.
* - `createPartition`: Creates a new partition.
* - `setActivePartition`: Sets the active partition.
* - `getPartition`: Loads and retrieves a specific partition.
* - `getActivePartition`: Retrieves the currently active partition.
* - `save`: Saves the database state, including partitions and indices.
* - `load`: Loads the database state from disk.
* - `close`: Closes the database, saving state and releasing resources.
* - `buildIndexHNSW`: Builds HNSW indices for specified or all loaded partitions.
* - `saveHNSWIndices`: Saves HNSW indices for specified or all loaded partitions.
* - `loadHNSWIndices`: Loads HNSW indices for specified or all loaded partitions.
* - `getStats`: Retrieves database statistics.
* - `getVector`: Retrieves a vector by ID.
* - `getMetadata`: Retrieves metadata for a vector by ID.
* - `deleteVector`: Deletes a vector by ID.
* - `updateMetadata`: Updates metadata for a vector by ID.
* ### Internal Methods:
* - `_initialize`: Handles asynchronous initialization of the database.
* - `_loadPartition`: Loads a specific partition into memory.
* - `_saveHNSWIndex`: Saves the HNSW index for a partition.
* - `_loadHNSWIndex`: Loads the HNSW index for a partition.
* - `_ensureActivePartitionHasCapacity`: Ensures the active partition has enough capacity.
* - `_saveSinglePartitionConfig`: Saves a single partition configuration to disk.
* - `_loadPartitionConfigs`: Loads all partition configurations from disk.
* ### Notes:
* - This class is designed for scenarios where vector data is too large to fit into memory at once.
* - It relies on partitioning and LRU caching to manage memory usage efficiently.
* - HNSW indexing provides fast approximate nearest neighbor search but requires additional memory.
*/
export class PartitionedVectorDB extends (EventEmitter as new () => TypedEventEmitter<PartitionedDBEventData>) implements PartitionedVectorDBInterface {
// --- Configuration (resolved once in the constructor) ---
// Base directory; each partition lives in its own sub-directory beneath it.
private readonly partitionsDir: string;
// Configured maximum number of vectors per partition.
private readonly partitionCapacity: number;
// Maximum number of partition DB instances kept in the LRU cache.
private readonly maxActivePartitions: number;
// When true, an initial partition is created if no partition configs are found on disk.
private readonly autoCreatePartitions: boolean;
// Suggested vector size handed to every ClusteredVectorDB that is constructed.
private readonly vectorSize: number;
private readonly useCompression: boolean; // Passed down to partitions
// Options spread into each partition's ClusteredVectorDB options; `clusterSize` comes from the partition config instead.
private readonly defaultClusterOptions: Omit<ClusteredVectorDBOptions, 'clusterSize'>;
private readonly autoLoadHNSW: boolean; // Option to auto-load HNSW indices
private readonly runKMeansOnLoad: boolean; // Option for K-Means on load
// In-memory state
private partitionConfigs: Map<string, PartitionConfig>; // All known configs
private loadedPartitions: LRUCache<string, ClusteredVectorDB>; // LRU Cache for loaded DBs
private hnswIndices: Map<string, HNSW>; // Manage HNSW indices per partition ID
// ID of the partition currently flagged active, or null when none is set.
private activePartitionId: string | null;
// Set to true at the end of a successful `_initialize` run.
private isInitialized: boolean = false;
// Promise returned by `_initialize`; awaited by `_ensureInitialized` before operations proceed.
public initializationPromise: Promise<void>;
// Awaited by `save()` when non-null; presumably set by the config-save scheduling code (not visible here) — confirm.
private saveConfigPromise: Promise<void> | null = null;
private isClosing: boolean = false; // Flag to prevent operations during close
/**
 * Resolves the options against their defaults, sets up the in-memory state
 * (config map, HNSW map, LRU cache of loaded partitions), makes sure the
 * partitions directory exists and kicks off asynchronous initialization.
 *
 * The constructor does not wait for initialization; callers can await
 * `initializationPromise`.
 *
 * @param options - Database options; every field is optional.
 * @throws Error when the partitions directory cannot be created or accessed.
 */
constructor(options: PartitionedVectorDBOptions = {}) {
  super();
  log('info', '[PartitionedVectorDB] Initializing with options:', JSON.stringify(options, null, 2));

  const {
    partitionsDir,
    partitionCapacity,
    maxActivePartitions,
    autoCreatePartitions,
    vectorSize,
    useCompression,
    clusterOptions,
    autoLoadHNSW,
    runKMeansOnLoad,
    autoLoadPartitions,
  } = options;

  // Falsy values fall back to the defaults for these three options.
  this.partitionsDir = partitionsDir || path.join(process.cwd(), 'database', 'partitions');
  this.partitionCapacity = partitionCapacity || DEFAULT_PARTITION_CAPACITY;
  this.maxActivePartitions = maxActivePartitions || DEFAULT_MAX_ACTIVE_PARTITIONS;
  // Only an explicit `false` disables auto-creation.
  this.autoCreatePartitions = autoCreatePartitions !== false;
  this.vectorSize = vectorSize ?? defaultSystemConfiguration.defaults.vectorSize;
  this.useCompression = useCompression ?? false;
  this.defaultClusterOptions = clusterOptions ?? {};
  this.autoLoadHNSW = autoLoadHNSW ?? true;
  this.runKMeansOnLoad = runKMeansOnLoad ?? defaultSystemConfiguration.indexing.runKMeansOnLoad;

  const configurationSummary = [
    '[PartitionedVectorDB] Configuration:',
    `- partitionsDir: ${this.partitionsDir}`,
    `- partitionCapacity: ${this.partitionCapacity}`,
    `- maxActivePartitions: ${this.maxActivePartitions}`,
    `- autoCreatePartitions: ${this.autoCreatePartitions}`,
    `- vectorSize: ${this.vectorSize ?? 'not specified'}`,
    `- useCompression: ${this.useCompression}`,
    `- autoLoadHNSW: ${this.autoLoadHNSW}`,
    `- runKMeansOnLoad: ${this.runKMeansOnLoad}`,
  ].join('\n');
  log('info', configurationSummary);

  this.partitionConfigs = new Map();
  this.hnswIndices = new Map();
  this.activePartitionId = null;

  // LRU cache of loaded partition DBs. `dispose` runs whenever an entry leaves the cache.
  this.loadedPartitions = new LRUCache<string, ClusteredVectorDB>({
    max: this.maxActivePartitions,
    dispose: async (dbInstance, partitionId, reason) => {
      log('info', `[PartitionedVectorDB] Disposing partition ${partitionId} from memory (Reason: ${reason}).`);

      // Drop the in-memory HNSW index together with its partition; it is not saved here.
      if (this.hnswIndices.get(partitionId)) {
        this.hnswIndices.delete(partitionId);
        log('info', `[PartitionedVectorDB] Unloaded HNSW index for evicted partition ${partitionId}`);
      }

      // Close the partition DB; failures are reported through the event instead of thrown.
      try {
        await dbInstance.close();
        this.emit('partition:unloaded', { id: partitionId });
      } catch (error: any) {
        log('error', `[PartitionedVectorDB] Error closing partition ${partitionId} during dispose:`, error);
        this.emit('partition:error', {
          id: partitionId,
          error,
          operation: 'dispose',
        });
      }
    },
  });

  // The base directory must exist before anything else can work.
  try {
    const directoryMissing = !existsSync(this.partitionsDir);
    if (directoryMissing) {
      mkdirSync(this.partitionsDir, { recursive: true });
    }
  } catch (err: any) {
    throw new Error(`FATAL: Could not create or access partitions directory: ${this.partitionsDir} - ${err.message}`);
  }

  // Loading happens asynchronously; only an explicit `false` disables auto-loading.
  const autoLoad = autoLoadPartitions !== false;
  this.initializationPromise = this._initialize(autoLoad);
}
/** Reports whether initialization has completed and the database has not started closing. */
IsReady(): boolean {
  if (this.isClosing) {
    return false;
  }
  return this.isInitialized;
}
/**
 * Gate used by operations that need an initialized database.
 *
 * @param force - When true, do not wait for a pending initialization.
 * @throws Error when the database is closing or closed.
 */
private async _ensureInitialized(force: boolean = false): Promise<void> {
  if (this.isClosing) {
    throw new Error('Database is closing or closed.');
  }
  if (this.isInitialized || force) {
    return;
  }
  await this.initializationPromise;
}
/**
 * Asynchronous initialization: reads all partition configs and, when `autoLoad`
 * is set and an active partition exists, loads that partition (plus its HNSW
 * index if `autoLoadHNSW` is enabled).
 *
 * Emits `db:initialized` on success. On failure it emits `partition:error`
 * with operation `initialize` and re-throws.
 *
 * @param autoLoad - Whether to load the active partition right away.
 */
private async _initialize(autoLoad: boolean): Promise<void> {
  if (this.isInitialized) {
    return;
  }
  log('info', `[PartitionedVectorDB] Starting initialization (autoLoad: ${autoLoad})`);

  try {
    // Step 1: every partition config on disk.
    await this._loadPartitionConfigs();
    log('info', `[PartitionedVectorDB] Loaded ${this.partitionConfigs.size} partition configurations.`);

    // Step 2: only the active partition is loaded eagerly.
    const initialIds: string[] = autoLoad && this.activePartitionId ? [this.activePartitionId] : [];

    // Step 3: load them (and possibly their HNSW indices) in parallel.
    if (initialIds.length === 0) {
      log('info', '[PartitionedVectorDB] No initial partitions specified for auto-loading.');
    } else {
      log('info', `[PartitionedVectorDB] Auto-loading initial partitions: [${initialIds.join(', ')}]`);
      const pendingLoads = initialIds.map((id) => this._loadPartition(id, this.autoLoadHNSW));
      await Promise.all(pendingLoads);
      log('info', `[PartitionedVectorDB] Initial partitions loaded (${this.loadedPartitions.size} in memory, ${this.hnswIndices.size} HNSW indices loaded).`);
    }

    this.isInitialized = true;
    log('info', `[PartitionedVectorDB] Initialization complete. Active: ${this.activePartitionId ?? 'None'}`);
    this.emit('db:initialized', {
      partitionCount: this.partitionConfigs.size,
      loadedCount: this.loadedPartitions.size,
      activeId: this.activePartitionId,
    });
  } catch (err: any) {
    log('error', `[PartitionedVectorDB] FATAL: Error during initialization:`, err);
    this.emit('partition:error', { error: err, operation: 'initialize' });
    // Re-throw so that `initializationPromise` rejects.
    throw err;
  }
}
/**
 * Reads every `<dir>/<dir>.config.json` below the partitions directory into
 * `partitionConfigs` and determines the active partition.
 *
 * - A config is accepted only when it has an `id` and its `dbDirName` matches
 *   the directory it was found in; unreadable or invalid configs are logged
 *   and skipped.
 * - If several configs are flagged active, the first one found wins and the
 *   others are deactivated in memory (a config save is scheduled).
 * - If none is active, the first known config is activated; if there are no
 *   configs at all and `autoCreatePartitions` is on, an initial partition is
 *   created.
 *
 * Emits `partitions:loaded` when done.
 *
 * @throws Any error from listing the directory, except ENOENT for a partitions
 *   directory that does not exist (logged as a warning).
 */
private async _loadPartitionConfigs(): Promise<void> {
  log('info', `[PartitionedVectorDB] Loading partition configurations from ${this.partitionsDir}`);
  this.partitionConfigs.clear();
  let foundActiveId: string | null = null;

  try {
    const entries = await fs.readdir(this.partitionsDir, {
      withFileTypes: true,
    });
    const partitionDirs = entries.filter((e) => e.isDirectory());
    log('info', `[PartitionedVectorDB] Found ${partitionDirs.length} potential partition directories.`);

    for (const dir of partitionDirs) {
      const configPath = path.join(this.partitionsDir, dir.name, `${dir.name}.config.json`);
      if (!existsSync(configPath)) {
        log('info', `[PartitionedVectorDB] No config file found in directory: ${dir.name}`);
        continue;
      }

      log('info', `[PartitionedVectorDB] Attempting to load config: ${configPath}`);
      try {
        const content = await fs.readFile(configPath, 'utf8');
        const config = JSON.parse(content) as PartitionConfig;

        // Basic validation: must carry an id and live in the directory it names.
        if (!config.id || config.dbDirName !== dir.name) {
          log('warn', `[PartitionedVectorDB] Invalid partition config format or mismatched ID/DirName: ${configPath}`);
          continue;
        }

        this.partitionConfigs.set(config.id, config);
        log('info', `[PartitionedVectorDB] Loaded config for partition: ${config.id} (Dir: ${dir.name}, Active: ${config.active}, Vectors: ${config.vectorCount})`);

        if (!config.active) {
          continue;
        }
        if (foundActiveId && foundActiveId !== config.id) {
          // Only one partition may be active; keep the first one found.
          log('warn', `[PartitionedVectorDB] Multiple active partitions defined! Found ${config.id} after ${foundActiveId}. Deactivating ${config.id}.`);
          config.active = false;
          this.scheduleSaveConfigs(); // Persist the corrected flag later
        } else {
          foundActiveId = config.id;
        }
      } catch (e: any) {
        // A broken config must not prevent the remaining partitions from loading.
        log('warn', `[PartitionedVectorDB] Error reading/parsing partition config ${configPath}:`, e);
      }
    }

    this.activePartitionId = foundActiveId;
    log('info', `[PartitionedVectorDB] Active partition ID after scan: ${this.activePartitionId ?? 'None'}`);

    if (!this.activePartitionId && this.partitionConfigs.size > 0) {
      // No active partition: promote the first known config (Map insertion order, i.e. scan order).
      const firstConfig = this.partitionConfigs.values().next().value as PartitionConfig | undefined;
      if (firstConfig) {
        log('info', `[PartitionedVectorDB] No active partition found, activating first available: ${firstConfig.id}`);
        firstConfig.active = true;
        this.activePartitionId = firstConfig.id;
        this.scheduleSaveConfigs(); // Save the change
      }
    } else if (!this.activePartitionId && this.autoCreatePartitions) {
      log('info', '[PartitionedVectorDB] No partitions found, creating initial partition.');
      // We are still inside initialization, so createPartition must not wait for it.
      await this.createPartition(`p-${Date.now()}`, 'Initial Partition', {
        setActive: true,
        skipInitializationCheck: true,
      });
      // Re-read the active ID from the configs after creation.
      this.activePartitionId = Array.from(this.partitionConfigs.values()).find((c) => c.active)?.id ?? null;
    }

    this.emit('partitions:loaded', {
      count: this.partitionConfigs.size,
      active: this.activePartitionId,
    });
  } catch (error: any) {
    if (error.code === 'ENOENT' && !existsSync(this.partitionsDir)) {
      log('warn', `[PartitionedVectorDB] Partitions directory ${this.partitionsDir} not found. It will be created when needed.`);
    } else {
      log('error', '[PartitionedVectorDB] Error listing or reading partition configs:', error);
      throw error; // Propagate other errors
    }
  }
}
/**
 * Loads a partition's DB instance into the LRU cache if it is not already
 * there, and optionally loads its HNSW index.
 *
 * Before the DB is opened, the partition directory, its `data` directory and
 * placeholder files (`meta.json`, `vec.bin`, `cluster.json`, the HNSW index
 * file) are created when missing. After loading, the config's `vectorCount`
 * is synced with the count reported by the DB.
 *
 * @param partitionId - ID of the partition to load; its config must be known.
 * @param loadHNSW - Whether to also load the HNSW index (defaults to `autoLoadHNSW`).
 * @returns The cached or freshly loaded DB instance, or `null` when the
 *   database is closing, no config exists for the ID, or loading fails
 *   (a `partition:error` event is emitted on failure).
 */
private async _loadPartition(
  partitionId: string,
  loadHNSW: boolean = this.autoLoadHNSW // Use instance default
): Promise<ClusteredVectorDB | null> {
  if (this.isClosing) return null; // Prevent loading during close

  const cachedDb = this.loadedPartitions.get(partitionId);
  if (cachedDb) {
    // DB already in memory: only top up the HNSW index when requested and missing.
    if (loadHNSW && !this.hnswIndices.has(partitionId)) {
      await this._loadHNSWIndex(partitionId, cachedDb);
    }
    return cachedDb;
  }

  const config = this.partitionConfigs.get(partitionId);
  if (!config) {
    log('warn', `[PartitionedVectorDB] Partition config not found for ID: ${partitionId}. Cannot load.`);
    return null;
  }

  // Paths are relative to the main partitions directory.
  const partitionDirPath = path.join(this.partitionsDir, config.dbDirName);
  const dbBasePath = path.join(partitionDirPath, 'data');
  log('info', `[PartitionedVectorDB] Loading partition ${partitionId} DB from base path: ${dbBasePath}`);

  try {
    if (!existsSync(partitionDirPath)) {
      await fs.mkdir(partitionDirPath, { recursive: true });
      log('info', `[PartitionedVectorDB] Created directory for partition ${partitionId}: ${partitionDirPath}`);
    }
    // The data files live *inside* dbBasePath, so that directory itself must exist
    // (checking its parent would only re-check the partition directory).
    if (!existsSync(dbBasePath)) {
      await fs.mkdir(dbBasePath, { recursive: true });
      log('info', `[PartitionedVectorDB] Created data directory for partition ${partitionId}: ${dbBasePath}`);
    }

    // Create empty placeholder data files for a brand-new partition.
    const metaFilePath = path.join(dbBasePath, 'meta.json');
    const vectorFilePath = path.join(dbBasePath, 'vec.bin');
    const clusterFilePath = path.join(dbBasePath, 'cluster.json');
    if (!existsSync(metaFilePath)) {
      log('info', '[PartitionedVectorDB] Meta file not found, creating new one.');
      await fs.writeFile(metaFilePath, JSON.stringify({}), 'utf8');
    }
    if (!existsSync(vectorFilePath)) {
      log('info', '[PartitionedVectorDB] Vector file not found, creating new one.');
      await fs.writeFile(vectorFilePath, Buffer.alloc(0));
    }
    if (!existsSync(clusterFilePath)) {
      log('info', '[PartitionedVectorDB] Cluster file not found, creating new one.');
      await fs.writeFile(clusterFilePath, JSON.stringify({}), 'utf8');
    }

    const hnswIndexDir = path.join(partitionDirPath, HNSW_INDEX_DIR_NAME);
    const hnswIndexPath = path.join(hnswIndexDir, HNSW_INDEX_FILE_NAME);
    if (!existsSync(hnswIndexDir)) {
      log('info', `[PartitionedVectorDB] HNSW index directory not found, creating new one.`);
      await fs.mkdir(hnswIndexDir, { recursive: true });
    }
    if (!existsSync(hnswIndexPath)) {
      // NOTE(review): the placeholder content is the default HNSW *configuration*;
      // whether HNSW.loadIndex accepts it as an index is not visible here — confirm.
      log('info', `[PartitionedVectorDB] HNSW index file not found, creating new one.`);
      await fs.writeFile(hnswIndexPath, JSON.stringify(defaultSystemConfiguration.indexing.hnsw), 'utf8');
    }

    // --- Load the ClusteredVectorDB ---
    const clusterDbOptions: ClusteredVectorDBOptions = {
      ...this.defaultClusterOptions,
      clusterSize: config.clusterSize, // Use specific or default
      useCompression: this.useCompression, // Pass down compression setting
      runKMeansOnLoad: this.runKMeansOnLoad, // Pass down K-Means option
    };
    const vectorDB = new ClusteredVectorDB(
      this.vectorSize, // Pass the suggested vector size
      dbBasePath, // Pass the base path for data files
      clusterDbOptions
    );
    await vectorDB.load();

    // Register in the LRU cache (this may evict another partition).
    this.loadedPartitions.set(partitionId, vectorDB);
    log('info', `[PartitionedVectorDB] Partition DB ${partitionId} loaded. Vector count: ${vectorDB.getVectorCount()}`);

    // --- Optionally load the HNSW index; report what actually happened ---
    if (loadHNSW) {
      const hnswLoaded = await this._loadHNSWIndex(partitionId, vectorDB);
      log(
        'info',
        hnswLoaded
          ? `[PartitionedVectorDB] HNSW index loaded for partition ${partitionId}`
          : `[PartitionedVectorDB] HNSW index not loaded for partition ${partitionId}`
      );
    }

    this.emit('partition:loaded', {
      id: partitionId,
      name: config.name,
      vectorCount: vectorDB.getVectorCount(),
      hnswLoaded: this.hnswIndices.has(partitionId),
    });

    // --- Sync vector count: the DB is the source of truth ---
    log('info', `[PartitionedVectorDB] Syncing vector count for partition ${partitionId}`);
    const loadedCount = vectorDB.getVectorCount();
    log('info', `[PartitionedVectorDB] Loaded vector count: ${loadedCount}`);
    if (config.vectorCount !== loadedCount) {
      log('warn', `[PartitionedVectorDB] Partition ${partitionId}: Config count (${config.vectorCount}) differs from loaded DB count (${loadedCount}). Updating config.`);
      config.vectorCount = loadedCount;
      this.scheduleSaveConfigs(); // Save updated count later
    }
    return vectorDB;
  } catch (error: any) {
    log('error', `[PartitionedVectorDB] Error loading partition DB ${partitionId} from ${dbBasePath}:`, error);
    // Remove any partially registered state for this partition.
    this.loadedPartitions.delete(partitionId);
    this.hnswIndices.delete(partitionId);
    this.emit('partition:error', {
      id: partitionId,
      error,
      operation: 'loadPartitionDB',
    });
    return null;
  }
}
/**
 * Loads the persisted HNSW index of a partition into `hnswIndices`.
 *
 * @param partitionId - Partition whose index should be loaded.
 * @param dbInstance - The partition's loaded DB, handed to `HNSW.loadIndex`.
 * @returns `true` when the index is in memory afterwards (already loaded or
 *   freshly loaded); `false` when the database is closing, the config is
 *   unknown, the index file is missing, or loading failed.
 */
private async _loadHNSWIndex(partitionId: string, dbInstance: ClusteredVectorDB): Promise<boolean> {
  log('info', `[PartitionedVectorDB] Loading HNSW index for partition ${partitionId}`);

  if (this.hnswIndices.has(partitionId)) {
    log('info', `[PartitionedVectorDB] HNSW index for ${partitionId} already loaded.`);
    return true;
  }
  if (this.isClosing) {
    return false;
  }

  const partitionConfig = this.partitionConfigs.get(partitionId);
  if (!partitionConfig) {
    log('warn', `[PartitionedVectorDB] Cannot load HNSW index: Config not found for ${partitionId}`);
    return false;
  }

  const indexPath = path.join(this.partitionsDir, partitionConfig.dbDirName, HNSW_INDEX_DIR_NAME, HNSW_INDEX_FILE_NAME);
  if (!existsSync(indexPath)) {
    log('info', `[PartitionedVectorDB] HNSW index file not found for partition ${partitionId} at ${indexPath}. Index not loaded.`);
    return false;
  }

  log('info', `[PartitionedVectorDB] Loading HNSW index for partition ${partitionId} from ${indexPath}`);
  try {
    const loadedIndex = await HNSW.loadIndex(indexPath, dbInstance);
    this.hnswIndices.set(partitionId, loadedIndex);
    log('info', `[PartitionedVectorDB] Successfully loaded HNSW index for ${partitionId}. Nodes: ${loadedIndex.getNodeCount()}`);
    this.emit('partition:indexLoaded', {
      id: partitionId,
      indexType: 'hnsw',
      path: indexPath,
    });
    return true;
  } catch (error: any) {
    // A failed index load is reported but does not throw.
    log('error', `[PartitionedVectorDB] Error loading HNSW index for partition ${partitionId} from ${indexPath}:`, error.message || error);
    this.emit('partition:error', {
      id: partitionId,
      error,
      operation: 'loadHNSWIndex',
    });
    return false;
  }
}
/**
 * Persists the in-memory HNSW index of a partition to its index file.
 *
 * @param partitionId - Partition whose index should be saved.
 * @returns `true` when the index was written; `false` when there is no index
 *   in memory, the config is unknown, the database is closing, or the write
 *   failed (a `partition:error` event is emitted on failure).
 */
private async _saveHNSWIndex(partitionId: string): Promise<boolean> {
  log('info', `[PartitionedVectorDB] Saving HNSW index for partition ${partitionId}`);

  const indexInstance = this.hnswIndices.get(partitionId);
  const partitionConfig = this.partitionConfigs.get(partitionId);

  if (!indexInstance) {
    log('info', `[PartitionedVectorDB] No HNSW index instance found in memory for partition ${partitionId}. Skipping save.`);
    return false;
  }
  if (!partitionConfig) {
    log('warn', `[PartitionedVectorDB] Cannot save HNSW index: Config not found for ${partitionId}`);
    return false;
  }
  if (this.isClosing) {
    log('warn', `[PartitionedVectorDB] Skipping HNSW index save for ${partitionId} during close operation (already handled or closing).`);
    return false;
  }

  const indexDir = path.join(this.partitionsDir, partitionConfig.dbDirName, HNSW_INDEX_DIR_NAME);
  const indexPath = path.join(indexDir, HNSW_INDEX_FILE_NAME);
  log('info', `[PartitionedVectorDB] Saving HNSW index for partition ${partitionId} to ${indexPath}`);

  try {
    // The target directory may not exist yet.
    const dirMissing = !existsSync(indexDir);
    if (dirMissing) {
      await fs.mkdir(indexDir, { recursive: true });
    }
    // The HNSW instance performs the actual serialization.
    await indexInstance.saveIndex(indexPath);
    log('info', `[PartitionedVectorDB] Successfully saved HNSW index for ${partitionId}.`);
    this.emit('partition:indexSaved', {
      id: partitionId,
      indexType: 'hnsw',
      path: indexPath,
    });
    return true;
  } catch (error: any) {
    log('error', `[PartitionedVectorDB] Error saving HNSW index for partition ${partitionId} to ${indexPath}:`, error);
    this.emit('partition:error', {
      id: partitionId,
      error,
      operation: 'saveHNSWIndex',
      path: indexPath,
    });
    return false;
  }
}
/**
 * Returns the DB instance of the given partition, loading it (and, depending
 * on `autoLoadHNSW`, its index) when it is not in memory yet.
 *
 * @param id - Partition ID.
 * @returns The partition DB, or `null` when it is unknown or cannot be loaded.
 */
async getPartition(id: string): Promise<ClusteredVectorDB | null> {
  log('info', `[PartitionedVectorDB] Getting partition ${id}...`);
  await this._ensureInitialized();
  // Cache lookup, DB loading and optional HNSW loading all happen in _loadPartition.
  const pendingPartition = this._loadPartition(id);
  return pendingPartition;
}
/**
 * Returns the DB instance of the active partition, loading it when necessary.
 *
 * @returns The active partition DB, or `null` when no partition is active or
 *   it cannot be loaded.
 */
async getActivePartition(): Promise<ClusteredVectorDB | null> {
  log('info', `[PartitionedVectorDB] Getting active partition...`);
  await this._ensureInitialized();

  const activeId = this.activePartitionId;
  if (activeId) {
    return this._loadPartition(activeId);
  }
  log('warn', '[PartitionedVectorDB] No active partition is set.');
  return null;
}
// =====================================================================
// Public API Methods (Add, Search, Delete, Stats, etc.)
// =====================================================================
/**
 * Saves the whole state: partition configs, the data of every partition that
 * is currently loaded, and every HNSW index that is currently in memory.
 *
 * Individual save failures are logged and reported through `partition:error`;
 * they do not reject. Emits `db:saved` with the success counts.
 */
async save(): Promise<void> {
  await this._ensureInitialized();
  // Closing may have started while we were waiting for initialization.
  if (this.isClosing) {
    log('warn', '[PartitionedVectorDB] Attempted to save while closing.');
    return;
  }
  log('info', '[PartitionedVectorDB] Starting comprehensive save...');

  // 1. Configs first, so counts and the active flag are current on disk.
  await this.savePartitionConfigs();
  log('info', `[PartitionedVectorDB] Partition configurations saved. Active partition: ${this.activePartitionId}`);
  if (this.saveConfigPromise) {
    await this.saveConfigPromise; // Let a pending scheduled config save finish as well
  }

  // 2. Data of all loaded partitions, in parallel.
  const persistPartition = async (id: string): Promise<boolean> => {
    // peek() leaves the LRU order untouched.
    const partition = this.loadedPartitions.peek(id);
    if (!partition) {
      // No longer in the cache; counted as success.
      return true;
    }
    try {
      if (typeof partition.save !== 'function') {
        log('warn', `[PartitionedVectorDB] Partition ${id} instance cannot be saved (missing save method or wrong type).`);
        return false;
      }
      await partition.save();
      log('info', `[PartitionedVectorDB] Saved data for partition ${id}`);
      return true;
    } catch (error) {
      log('error', `[PartitionedVectorDB] Error saving data for partition ${id}:`, error);
      this.emit('partition:error', {
        id,
        error,
        operation: 'savePartitionData',
      });
      return false;
    }
  };
  const loadedPartitionIds = Array.from(this.loadedPartitions.keys());
  log('info', `[PartitionedVectorDB] Saving data for ${loadedPartitionIds.length} loaded partitions...`);
  const partitionSavePromises = loadedPartitionIds.map((id) => persistPartition(id));

  // 3. All loaded HNSW indices, in parallel.
  const loadedHnswIds = Array.from(this.hnswIndices.keys());
  log('info', `[PartitionedVectorDB] Saving ${loadedHnswIds.length} loaded HNSW indices...`);
  const hnswSavePromises = loadedHnswIds.map((id) => this._saveHNSWIndex(id));

  const [partitionResults, hnswResults] = await Promise.all([Promise.all(partitionSavePromises), Promise.all(hnswSavePromises)]);
  const successfulPartitions = partitionResults.filter(Boolean).length;
  const successfulHnsw = hnswResults.filter(Boolean).length;

  log('info', `[PartitionedVectorDB] Comprehensive save complete. Partitions saved: ${successfulPartitions}/${loadedPartitionIds.length}. HNSW indices saved: ${successfulHnsw}/${loadedHnswIds.length}.`);
  this.emit('db:saved', {
    partitionsSaved: successfulPartitions,
    indicesSaved: successfulHnsw,
  });
}
/**
 * Manually (re)loads the database: resets the in-memory state and runs the
 * initialization logic again. Does nothing but warn when the database is
 * already initialized and not closing.
 *
 * Emits `db:loaded` once initialization has finished.
 */
async load(): Promise<void> {
  const alreadyRunning = this.isInitialized && !this.isClosing;
  if (alreadyRunning) {
    log('warn', '[PartitionedVectorDB] Database already initialized. Call close() before loading again.');
    return;
  }

  // Reset lifecycle flags so the instance can be used again after a close.
  this.isClosing = false;
  this.isInitialized = false;

  // Drop all in-memory state; clearing the LRU cache disposes the loaded partitions.
  this.loadedPartitions.clear();
  this.hnswIndices.clear();
  this.partitionConfigs.clear();
  this.activePartitionId = null;

  log('info', '[PartitionedVectorDB] Starting manual load process...');

  // NOTE(review): `_initialize` takes the "auto-load partitions" flag, and the
  // constructor passes `options.autoLoadPartitions !== false`; here the HNSW
  // flag is passed instead. Looks unintended — confirm.
  const initialization = this._initialize(this.autoLoadHNSW);
  this.initializationPromise = initialization;
  await initialization;

  log('info', '[PartitionedVectorDB] Manual load process finished.');
  this.emit('db:loaded', {
    partitionCount: this.partitionConfigs.size,
    loadedCount: this.loadedPartitions.size,
    activeId: this.activePartitionId,
  });
}
/**
 * Builds the HNSW index for one partition, or for every currently loaded
 * partition when no ID is given. The partition is loaded first if necessary,
 * and an index instance is created when none is in memory.
 *
 * Build failures are logged and emitted as `partition:error`; they do not
 * reject. Progress is forwarded to `options.progressCallback` and emitted as
 * `partition:indexProgress`.
 *
 * @param partitionId - Partition to index; omit to index all loaded partitions in parallel.
 * @param options - Build options passed through to the index's `buildIndex`.
 */
async buildIndexHNSW(partitionId?: string, options?: BuildIndexHNSWOptions): Promise<void> {
  // NOTE(review): `force` skips the initialization wait here, but getPartition()
  // below waits for initialization again — confirm the intended meaning of `force`.
  await this._ensureInitialized(options?.force);

  const buildForPartition = async (id: string): Promise<void> => {
    log('info', `[PartitionedVectorDB] Building HNSW index for partition ${id}...`);

    // getPartition makes sure the partition DB is in memory.
    const partition = await this.getPartition(id);
    if (!partition) {
      log('error', `[PartitionedVectorDB] Cannot build HNSW index: Partition ${id} not found or could not be loaded.`);
      return;
    }

    let hnswIndex = this.hnswIndices.get(id);
    if (!hnswIndex) {
      log('info', `[PartitionedVectorDB] Creating new HNSW index instance for partition ${id} before building.`);
      hnswIndex = new HNSW(partition);
      this.hnswIndices.set(id, hnswIndex);
    }

    log('info', `[PartitionedVectorDB] Building HNSW index for partition ${id}...`);
    try {
      await hnswIndex.buildIndex({
        ...options,
        // Forward progress to the caller's callback first, then publish it as an event.
        progressCallback: (progress) => {
          options?.progressCallback?.(progress);
          this.emit('partition:indexProgress', {
            id,
            progress,
            operation: 'buildHNSW',
          });
        },
      });
      log('info', `[PartitionedVectorDB] HNSW index built successfully for partition ${id}.`);
      this.emit('partition:indexed', { id, indexType: 'hnsw' });
    } catch (error: any) {
      log('error', `[PartitionedVectorDB] Error building HNSW index for partition ${id}:`, error);
      this.emit('partition:error', {
        id,
        error,
        operation: 'buildHNSWIndex',
      });
    }
  };

  if (partitionId) {
    await buildForPartition(partitionId);
    return;
  }

  // No ID given: index everything that is loaded right now, in parallel.
  const partitionIds = Array.from(this.loadedPartitions.keys());
  log('info', `[PartitionedVectorDB] Building HNSW indices for ${partitionIds.length} loaded partitions in parallel...`);
  const pendingBuilds = partitionIds.map((id) => buildForPartition(id));
  await Promise.all(pendingBuilds);
  log('info', `[PartitionedVectorDB] Finished building HNSW indices for loaded partitions.`);
}
/**
 * Nearest-neighbour search over per-partition HNSW indices, run in parallel and merged.
 *
 * Target partitions are `options.partitionIds` restricted to IDs that have a partition config,
 * or, when no list is given, the partitions currently in the loaded-partition cache. For each
 * target the partition DB is loaded if necessary and its HNSW index is loaded if it is not in
 * memory yet; a partition whose DB or index cannot be obtained is skipped with a warning.
 *
 * @param query - Query vector; converted to a `Float32Array` when it is not one already.
 * @param k - Number of results to return overall (also requested from each partition).
 * @param options - Search options forwarded to each index's `findNearest`, plus the optional
 *   `partitionIds` restriction.
 * @returns Up to `k` results across all searched partitions, ordered by ascending `dist`.
 *   A partition whose search throws contributes no results and triggers `partition:error`.
 */
async findNearestHNSW(
  query: Vector,
  k: number = 10,
  options: SearchOptions & {
    partitionIds?: string[];
    exactDimensions?: boolean;
  } = {}
): Promise<SearchResult[]> {
  await this._ensureInitialized();

  const queryVector = query instanceof Float32Array ? query : new Float32Array(query);

  // Explicit list (unknown IDs dropped) or, by default, whatever is loaded right now.
  const requestedIds = options.partitionIds;
  const targetPartitionIds = requestedIds
    ? requestedIds.filter((id) => this.partitionConfigs.has(id))
    : Array.from(this.loadedPartitions.keys());

  if (targetPartitionIds.length === 0) {
    log('warn', '[PartitionedVectorDB] No valid partitions specified or loaded to search with HNSW.');
    return [];
  }
  log('info', `[PartitionedVectorDB] Performing HNSW search on partitions: [${targetPartitionIds.join(', ')}]`);

  // Searches a single partition; never rejects, so one bad partition cannot fail the whole query.
  const searchOnePartition = async (partitionId: string) => {
    try {
      // Step 1: the partition DB itself (second argument false, as in "load DB only first").
      const partitionDb = await this._loadPartition(partitionId, false);
      if (!partitionDb) {
        log('warn', `[PartitionedVectorDB] Skipping HNSW search on partition ${partitionId}: Could not load DB.`);
        return [];
      }

      // Step 2: the HNSW index, loading it on demand when it is not in memory.
      let index = this.hnswIndices.get(partitionId);
      if (!index) {
        const indexLoaded = await this._loadHNSWIndex(partitionId, partitionDb);
        if (!indexLoaded) {
          // Indices are deliberately not built on the fly here.
          log('warn', `[PartitionedVectorDB] Skipping HNSW search on partition ${partitionId}: Index not loaded and not found.`);
          return [];
        }
        index = this.hnswIndices.get(partitionId);
      }

      // Defensive: a successful load is expected to have registered the index.
      if (!index) {
        return [];
      }

      // Step 3: the actual search. Awaited inside the try so failures are caught below.
      return await index.findNearest(queryVector, k, {
        ...options,
        filter: options.filter,
      });
    } catch (error) {
      log('error', `[PartitionedVectorDB] Error during HNSW search for partition ${partitionId}:`, error);
      this.emit('partition:error', {
        id: partitionId,
        error,
        operation: 'searchHNSW',
      });
      return [];
    }
  };

  const perPartitionResults = await Promise.all(targetPartitionIds.map(searchOnePartition));

  // Merge all candidates, order by distance, keep the global top k.
  return perPartitionResults
    .flat()
    .sort((left, right) => left.dist - right.dist)
    .slice(0, k);
}
/**
 * Explicitly persist HNSW indices.
 *
 * @param partitionId - When given, only that partition's index is saved. Otherwise every index
 *   currently present in the in-memory index map is saved.
 */
async saveHNSWIndices(partitionId?: string): Promise<void> {
  await this._ensureInitialized();

  let targetIds;
  if (partitionId) {
    targetIds = [partitionId];
  } else {
    // Only indices that are actually in memory can be written out.
    targetIds = Array.from(this.hnswIndices.keys());
  }

  if (targetIds.length === 0) {
    log('info', '[PartitionedVectorDB] No HNSW indices loaded or specified to save.');
    return;
  }

  log('info', `[PartitionedVectorDB] Saving HNSW indices for partitions: [${targetIds.join(', ')}]`);
  const pendingSaves = targetIds.map((id) => this._saveHNSWIndex(id));
  await Promise.all(pendingSaves);
  log('info', '[PartitionedVectorDB] Finished saving HNSW indices.');
}
/**
 * Explicitly load HNSW indices into memory.
 *
 * Only partitions whose DB is already in the loaded-partition cache are handled; others are
 * skipped with a warning (the DB is not loaded on their behalf). Partitions whose index is
 * already in memory are left untouched.
 *
 * @param partitionId - When given, only that partition is considered. Otherwise every partition
 *   currently in the loaded-partition cache is.
 */
async loadHNSWIndices(partitionId?: string): Promise<void> {
  await this._ensureInitialized();

  const candidateIds = partitionId ? [partitionId] : Array.from(this.loadedPartitions.keys());
  if (candidateIds.length === 0) {
    log('info', '[PartitionedVectorDB] No partitions loaded or specified to load HNSW indices for.');
    return;
  }

  log('info', `[PartitionedVectorDB] Loading HNSW indices for partitions: [${candidateIds.join(', ')}]`);

  await Promise.all(
    candidateIds.map(async (id): Promise<void> => {
      // peek() is used so that this check does not reorder the LRU cache.
      const partitionDb = this.loadedPartitions.peek(id);
      if (!partitionDb) {
        log('warn', `[PartitionedVectorDB] Cannot load HNSW index for ${id}: Partition DB not loaded.`);
        return;
      }

      const alreadyInMemory = this.hnswIndices.has(id);
      if (alreadyInMemory) {
        log('info', `[PartitionedVectorDB] HNSW index for ${id} is already loaded.`);
        return;
      }

      // Attempt the load; the outcome is not inspected here.
      await this._loadHNSWIndex(id, partitionDb);
    })
  );

  log('info', `[PartitionedVectorDB] Finished loading HNSW indices. Indices in memory: ${this.hnswIndices.size}`);
}
/**
 * Return the statistics of a partition's in-memory HNSW index.
 *
 * @param partitionId - Partition whose index statistics are requested.
 * @returns The index's `getStats()` result, or `null` when the database is not initialized or
 *   no index for that partition is currently in memory.
 */
getHNSWStats(partitionId: string): HNSWStats | null {
  if (!this.isInitialized) {
    return null;
  }

  const index = this.hnswIndices.get(partitionId);
  if (!index) {
    return null;
  }

  return index.getStats();
}
/**
 * Close the partitioned database, saving state and releasing resources.
 *
 * Sequence: wait for any pending initialization, run a final `save()`, clear the
 * loaded-partition cache, the HNSW index map and the partition configs, reset the active
 * partition and the initialized flag, then emit `db:close`.
 *
 * The call is a no-op (with a warning) when the database is not initialized or when another
 * close is already in progress. `isClosing` stays `true` after closing; `load()` resets it.
 *
 * @returns Resolves when the database has been closed. Errors from the final save are logged
 *   and do not prevent the close from completing.
 */
async close(): Promise<void> {
  // Fixed guard: the original tested `this.isInitialized`, which made close() bail out on every
  // initialized database and run the teardown only on uninitialized ones.
  if (!this.isInitialized) {
    log('warn', '[PartitionedVectorDB] Close operation called before initialization.');
    return;
  }
  if (this.isClosing) {
    log('warn', '[PartitionedVectorDB] Close operation already in progress.');
    return;
  }

  log('info', '[PartitionedVectorDB] Closing database...');
  this.isClosing = true;

  // 1. Let any in-flight initialization settle first to avoid racing with it.
  try {
    await this.initializationPromise;
  } catch (initError) {
    log('warn', '[PartitionedVectorDB] Initialization failed, proceeding with close anyway:', initError);
  }

  // 2. Final save of everything that is loaded; a failure here must not block the close.
  try {
    await this.save();
  } catch (saveError) {
    log('error', '[PartitionedVectorDB] Error during final save operation:', saveError);
  }

  // 3. Empty the LRU cache. NOTE(review): per the original comment this triggers the cache's
  //    dispose handler, which closes the individual partition DBs — the handler is configured
  //    outside this method, confirm there.
  this.loadedPartitions.clear();

  // 4. Drop any HNSW indices still registered.
  this.hnswIndices.clear();

  // 5. Drop the partition configurations.
  this.partitionConfigs.clear();

  // 6. Reset state. isClosing intentionally remains true until load() is called again.
  this.activePartitionId = null;
  this.isInitialized = false;

  this.emit('db:close', undefined);
  log('info', '[PartitionedVectorDB] Database closed.');
}
// --- Configuration Saving ---
/**
 * Saves all partition configurations, coalescing concurrent requests.
 *
 * While a save is in flight, further calls receive the same promise instead of starting a new
 * save. No delay is applied before saving. Nothing is saved while the database is closing.
 * Failures of individual config writes are logged here and do not reject the returned promise.
 */
async savePartitionConfigs(): Promise<void> {
  if (this.isClosing) return; // Skip saves requested while closing
  if (this.saveConfigPromise) return this.saveConfigPromise; // Join the in-flight save

  const writeAllConfigs = async (): Promise<void> => {
    log('info', '[PartitionedVectorDB] Debounced saving of partition configurations...');
    // Snapshot the configs so the set being written is fixed for this run.
    const snapshot = Array.from(this.partitionConfigs.values());
    try {
      await Promise.all(snapshot.map((cfg) => this._saveSinglePartitionConfig(cfg)));
      log('info', `[PartitionedVectorDB] Saved ${snapshot.length} partition configurations.`);
      this.emit('config:saved', undefined);
    } catch (error: any) {
      log('error', '[PartitionedVectorDB] Error saving one or more partition configs:', error);
    } finally {
      // Release the "lock" so the next call starts a fresh save.
      this.saveConfigPromise = null;
    }
  };

  this.saveConfigPromise = writeAllConfigs();
  return this.saveConfigPromise;
}
/**
 * Starts a config save unless one is already in flight or the database is closing.
 * The save is not awaited.
 */
private scheduleSaveConfigs(): void {
  const saveInFlight = Boolean(this.saveConfigPromise);
  if (saveInFlight || this.isClosing) {
    return;
  }
  // Fire and forget: the resulting promise is intentionally not awaited.
  this.savePartitionConfigs();
}
/**
 * Write one partition's configuration to `<partitionsDir>/<dbDirName>/<id>.config.json`.
 *
 * Does nothing while the database is closing. The partition directory is created when it does
 * not exist yet.
 *
 * @param config - Partition configuration to serialize as pretty-printed JSON.
 * @throws Re-throws any error from creating the directory or writing the file, after logging
 *   it and emitting `partition:error`.
 */
private async _saveSinglePartitionConfig(config: PartitionConfig): Promise<void> {
  if (this.isClosing) {
    return; // No writes while closing
  }

  // The config file lives inside the partition's own directory.
  const targetDir = path.join(this.partitionsDir, config.dbDirName);
  const targetFile = path.join(targetDir, `${config.id}.config.json`);

  try {
    const dirMissing = !existsSync(targetDir);
    if (dirMissing) {
      await fs.mkdir(targetDir, { recursive: true });
    }
    const serialized = JSON.stringify(config, null, 2);
    await fs.writeFile(targetFile, serialized, 'utf8');
  } catch (error: any) {
    log('error', `[PartitionedVectorDB] Error saving config ${config.id} to ${targetFile}:`, error);
    this.emit('partition:error', {
      id: config.id,
      error,
      operation: 'saveConfig',
      path: targetFile,
    });
    throw error; // Let the caller decide how to react
  }
}
/** Create a new partition. */
async createPartition(
id: string,
name: string,
options: {
description?: string;
properties?: Record<string, any>;
setActive?: boolean;
clusterSize?: number;
skipInitializationCheck?: boolean; // Internal flag
} = {}
): Promise<string> {
// Allow skipping check only for internal calls during initial setup
if (!options.skipInitializationCheck) {
await this._ensureInitialized();
}
if (!/^[a-zA-Z0-9._-]+$/.test(id)) {
throw new Error('Partition ID must contain only alphanumeric, underscore, hyphen, or dot characters');
}
if (this.partitionConfigs.has(id)) {
throw new Error(`Partition with ID ${id} already exists`);
}
const dirName = id; // Use the ID as the directory name for simplicity and uniqueness
const partitionDataDir = path.join(this.partitionsDir, dirName);
const partitionDataDirData = path.join(partitionDataDir, 'data');
log('info', `[PartitionedVectorDB] Creating new partition '${name}' (ID: ${id}) in directory: ${partitionDataDir}`);
try {
if (!existsSync(partitionDataDir)) {
await fs.mkdir(partitionDataDir, { recursive: true });
}
if (!existsSync(partitionDataDirData)) {
await fs.mkdir(partitionDataDirData, { recursive: true });
}
} catch (error: any) {
log('error', `[PartitionedVectorDB] Failed to create directory for new partition ${id}: ${partitionDataDir}`, error);
this.emit('partition:error', {
id,
error,
operation: 'createDir',
path: partitionDataDir,
});
throw new Error(`Failed to create directory for partition ${id}: ${error.message}`);
}
const newConfig: PartitionConfig = {
id,
name,
dbDirName: dirName,
active: false, // Activation handled later
vectorCount: 0,
description: options.description,
properties: options.properties,
clusterSize: options.clusterSize, // Use specific or let underlying DB use default
};
// Add to in-memory map *before* saving and loading
this.partitionConfigs.set(id, newConfig);
// Save the new config immediately (important!)
try {
await this._saveSinglePartitionConfig(newConfig);
} catch (saveError) {
// If saving config fails, rollback the creation?
this.partitionConfigs.delete(id); // Remove from memory
log('error', `[PartitionedVectorDB] Failed to save config for new partition ${id}. Rolling back creation.`);
// Optionally try to delete the created directory?
throw saveError;
}
// Ensure all required files are created
try {
const clusterDbOptions: ClusteredVectorDBOptions = {
clusterSize: options.clusterSize,
useCompression: this.useCompression,
runKMeansOnLoad: this.runKMeansOnLoad, // Pass down K-Means option
};
const vectorDB = new ClusteredVectorDB(
this.vectorSize,
path.join(partitionDataDir, 'data'), // Base path for data files
clusterDbOptions
);
// Save the initial state of the database
await vectorDB.save();
// Create an empty HNSW index file
const hnswIndexDir = path.join(partitionDataDir, HNSW_INDEX_DIR_NAM