@n2flowjs/nbase
Version:
Neural Vector Database for efficient similarity search
1,001 lines (855 loc) • 39.2 kB
text/typescript
// --- START OF FILE clustered_vector_db.ts ---
import { VectorDB } from './vector_db';
import config from '../config'; // Assuming config exists and has defaults
import { ClusteredVectorDBOptions, DBStats, DistanceMetric, IDVector, SearchResult, Vector } from '../types';
import { existsSync, promises as fsPromises } from 'fs';
import path from 'path';
import zlib from 'zlib'; // Import zlib for potential compression
import { promisify } from 'util'; // Import promisify
import {KMeans} from '../compression/kmeans'; // Import KMeans
import { log } from '../utils/log'; // Import log helper
const gzip = promisify(zlib.gzip);
const gunzip = promisify(zlib.gunzip);
/**
 * Picks `k` random, distinct elements from `arr` without mutating it.
 *
 * Uses a partial Fisher-Yates shuffle, which yields an unbiased sample in
 * O(k) swaps after the O(n) copy. The previous implementation shuffled via
 * `sort(() => 0.5 - Math.random())`, which is both biased (comparator is not
 * consistent) and O(n log n).
 *
 * @param arr - Source array; left unmodified.
 * @param k - Number of elements to pick.
 * @returns A new array of `k` distinct elements, or a copy of the whole
 *          array when `k >= arr.length`.
 */
function getRandomElements<T>(arr: T[], k: number): T[] {
  if (k >= arr.length) {
    return [...arr]; // Return a copy of the whole array if k is too large
  }
  const pool = [...arr];
  // Partial Fisher-Yates: only the first k slots need to be randomized.
  for (let i = 0; i < k; i++) {
    const j = i + Math.floor(Math.random() * (pool.length - i));
    [pool[i], pool[j]] = [pool[j], pool[i]];
  }
  return pool.slice(0, k);
}
/**
* A clustered vector database that extends the base VectorDB with efficient approximate nearest neighbor search.
*
* ClusteredVectorDB organizes vectors into clusters to improve search performance on large datasets.
* Instead of performing an exhaustive linear search across all vectors, it first identifies the most
* promising clusters and then only searches vectors within those clusters.
*
* Features:
* - Dynamic cluster management with automatic creation of new clusters when needed
* - Configurable clustering parameters to tune performance vs. accuracy tradeoffs
* - Persistence of cluster state alongside vector data
* - Support for different distance metrics
* - Optional K-Means clustering for potentially better cluster quality
*
* @example
* ```ts
* const db = new ClusteredVectorDB(128, './vector-db', {
* clusterSize: 100,
* distanceMetric: 'cosine'
* });
*
* // Add vectors with automatic cluster assignment
* db.addVector('doc1', [0.1, 0.2, ...], { title: 'Document 1' });
*
* // Search efficiently using cluster-based approximation
* const results = db.findNearest([0.3, 0.4, ...], 5);
* ```
*
* @extends VectorDB
*/
export class ClusteredVectorDB extends VectorDB {
// Configuration
public readonly targetClusterSize: number;
protected readonly newClusterThresholdFactor: number;
protected readonly newClusterDistanceThreshold: number;
protected readonly maxClusters: number;
protected readonly distanceMetric: DistanceMetric;
protected readonly kmeansMaxIterations: number; // New config option
protected readonly runKMeansOnLoad: boolean; // Option to run K-Means automatically on load
private kmeans: KMeans;
// Clustering structures
private clusters: Map<
number,
Array<{
id: IDVector /* vector not needed here if memoryStorage is source */;
}>
>; // Store only IDs in cluster list
private clusterCentroids: Map<number, Float32Array>;
private clusterDimensions: Map<number, number>; // Track dimensions per cluster centroid
private clusterIdCounter: number;
/**
 * Creates a clustered vector database.
 *
 * @param suggestedVectorSize - Expected vector dimensionality hint, forwarded to the base VectorDB.
 * @param dbPath - Directory used for persistence; null disables save/load.
 * @param options - Clustering and compression options (ClusteredVectorDBOptions).
 */
constructor(suggestedVectorSize: number | null = null, dbPath: string | null = null, options: ClusteredVectorDBOptions = {}) {
  super(suggestedVectorSize, dbPath, {
    useCompression: options.useCompression,
  });
  // Set configuration with defaults from config or reasonable values
  this.targetClusterSize = options.clusterSize ?? config.clustering.clusterSize ?? 100;
  // Factor applied to targetClusterSize before a cluster counts as "full".
  this.newClusterThresholdFactor = options.newClusterThresholdFactor ?? 1.5;
  // Distance beyond which a vector prefers starting its own cluster.
  this.newClusterDistanceThreshold = options.newClusterDistanceThreshold ?? 0.5;
  this.maxClusters = options.maxClusters ?? config.clustering.maxClusters ?? 1000; // Set a reasonable max
  this.distanceMetric = options.distanceMetric ?? 'euclidean'; // Default metric
  this.kmeansMaxIterations = options.kmeansMaxIterations ?? config.clustering.kmeansMaxIterations ?? 100; // K-Means iterations
  this.runKMeansOnLoad = options.runKMeansOnLoad ?? false; // Default to false
  this.clusterCentroids = new Map(); // Initialize clusterCentroids before use
  // NOTE(review): clusterCentroids is empty at this point, so this KMeans
  // instance is constructed with k=0; runKMeans() replaces it with a properly
  // sized instance before use — confirm KMeans tolerates k=0 at construction.
  this.kmeans = new KMeans(this.clusterCentroids.size, this.kmeansMaxIterations);
  // Initialize clustering structures
  this.clusters = new Map();
  this.clusterDimensions = new Map();
  this.clusterIdCounter = 0; // Separate counter for cluster keys
  // No automatic rebuild after loading - we'll handle this in the load method
}
// --- File Path for Cluster State ---
/**
 * Resolves the on-disk location of the cluster state file, which lives next
 * to (but separate from) the base vector/metadata files. A '.gz' suffix is
 * appended when compression is enabled.
 *
 * @returns Absolute path of the cluster state file.
 * @throws Error when no dbPath has been configured.
 */
protected _getClusterStateFilePath(): string {
  if (!this.dbPath) {
    throw new Error('DB path not set for cluster state');
  }
  log('debug', `[ClusteredVectorDB] Cluster state file path: ${this.dbPath}`);
  const suffix = this.useCompression ? '.gz' : '';
  return path.join(this.dbPath, 'cluster.json') + suffix;
}
// --- Overridden Save Method ---
/**
 * Persists the full database state: base vectors/metadata via the parent's
 * save(), then this class's cluster layout as JSON (optionally gzipped) in
 * its own file.
 *
 * Concurrent calls are coalesced: while a save is in flight, later callers
 * receive the same in-progress promise instead of triggering a second write.
 * No-ops (with a warning) when no dbPath is configured or the DB is closed.
 *
 * @throws Re-throws any error raised while writing, after logging it.
 */
override async save(): Promise<void> {
  if (!this.dbPath) {
    log('warn', '[ClusteredVectorDB] No dbPath specified, skipping save.');
    return;
  }
  if (this.isClosed) {
    log('warn', '[ClusteredVectorDB] Attempted to save a closed database.');
    return;
  }
  log('info', `[ClusteredVectorDB] Saving state to ${this.dbPath}`);
  // Use a single save promise to prevent race conditions if called multiple times
  if (this.savePromise) {
    log('info', `[ClusteredVectorDB] Save already in progress, waiting...`);
    return this.savePromise;
  }
  this.savePromise = (async () => {
    try {
      // 1. Save base data (vectors, metadata) using parent method
      await super.save(); // This handles its own file paths and logic
      log('info', '[ClusteredVectorDB] Base VectorDB data saved.');
      // 2. Prepare cluster state for serialization
      const clusterState = {
        version: 1, // Versioning for cluster state format
        clusterIdCounter: this.clusterIdCounter,
        // Convert Maps to structures suitable for JSON
        clusters: Array.from(this.clusters.entries()), // [[key1, members1], [key2, members2]]
        // Convert Float32Arrays in centroids to regular arrays for JSON
        clusterCentroids: Array.from(this.clusterCentroids.entries()).map(([key, centroid]) => [key, Array.from(centroid)]),
        clusterDimensions: Array.from(this.clusterDimensions.entries()), // [[key1, dim1], ...]
      };
      // 3. Save cluster state to its own file
      const clusterFilePath = this._getClusterStateFilePath();
      log('info', `[ClusteredVectorDB] Saving cluster state to: ${clusterFilePath}`);
      let clusterContent: string | Buffer = JSON.stringify(clusterState);
      if (this.useCompression) {
        clusterContent = await gzip(clusterContent);
      }
      await fsPromises.writeFile(clusterFilePath, clusterContent);
      log('info', '[ClusteredVectorDB] Cluster state saved successfully.');
      // Emit the save event (perhaps redundant if parent emits, decide based on needs)
      // this.emit('db:save', { path: this.dbPath, count: this.memoryStorage.size });
    } catch (error) {
      log('error', `[ClusteredVectorDB] Error saving database state to ${this.dbPath}:`, error);
      throw error; // Re-throw to indicate failure
    } finally {
      this.savePromise = null; // Release lock so future save() calls can run
    }
  })();
  return this.savePromise;
}
// --- Overridden Load Method ---
/**
 * Loads base data via the parent's load(), then restores the cluster layout
 * from the cluster state file. When that file is missing or unreadable, the
 * cluster structures are reset and rebuilt from the loaded vectors — either
 * via K-Means (when runKMeansOnLoad is set) or incrementally.
 *
 * @throws Error when no dbPath is configured or the DB is closed.
 */
override async load(): Promise<void> {
  if (!this.dbPath) {
    throw new Error('[ClusteredVectorDB] Database path not specified for loading.');
  }
  if (this.isClosed) {
    throw new Error('[ClusteredVectorDB] Cannot load into a closed database.');
  }
  log('info', `[ClusteredVectorDB] Loading state from ${this.dbPath}`);
  // 1. Load base data (vectors, metadata) using parent method
  await super.load(); // This handles its own file paths and logic
  log('info', '[ClusteredVectorDB] Base VectorDB data loaded.');
  // 2. Load cluster state if the file exists
  const clusterFilePath = this._getClusterStateFilePath();
  let clusterStateLoaded = false;
  if (existsSync(clusterFilePath)) {
    log('info', `[ClusteredVectorDB] Loading cluster state from: ${clusterFilePath}`);
    try {
      let clusterContentBuffer = await fsPromises.readFile(clusterFilePath);
      if (this.useCompression) {
        clusterContentBuffer = await gunzip(clusterContentBuffer);
      }
      const clusterState = JSON.parse(clusterContentBuffer.toString('utf8'));
      if (clusterState.version !== 1) {
        throw new Error(`Unsupported cluster state format version: ${clusterState.version}`);
      }
      // 3. Restore cluster state from loaded data
      this.clusterIdCounter = clusterState.clusterIdCounter ?? 0;
      this.clusters = new Map(clusterState.clusters);
      // Convert centroid arrays back to Float32Arrays
      this.clusterCentroids = new Map(clusterState.clusterCentroids.map(([key, centroidArray]: [number, number[]]) => [key, new Float32Array(centroidArray)]));
      this.clusterDimensions = new Map(clusterState.clusterDimensions);
      log('info', `[ClusteredVectorDB] Cluster state loaded successfully (${this.clusterCentroids.size} clusters).`);
      clusterStateLoaded = true;
    } catch (error) {
      // A broken cluster file is recoverable: clusters can be re-derived from vectors.
      log('error', `[ClusteredVectorDB] Error loading cluster state from ${clusterFilePath}, will rebuild clusters:`, error);
      // Reset cluster structures before rebuilding
      this.clusters.clear();
      this.clusterCentroids.clear();
      this.clusterDimensions.clear();
      this.clusterIdCounter = 0;
    }
  } else {
    log('info', '[ClusteredVectorDB] Cluster state file not found. Will rebuild clusters if vectors were loaded.');
  }
  // 4. Rebuild clusters or run K-Means if needed
  if (!clusterStateLoaded && this.memoryStorage.size > 0) {
    if (this.runKMeansOnLoad) {
      log('info', '[ClusteredVectorDB] Running K-Means after load (cluster state missing/invalid)...');
      await this.runKMeans(); // Run K-Means with default settings
    } else {
      log('info', '[ClusteredVectorDB] Rebuilding clusters incrementally after load (cluster state missing/invalid)...');
      this._rebuildAllClusters(); // Fallback to incremental rebuild
      log('info', `[ClusteredVectorDB] Rebuilt ${this.clusterCentroids.size} clusters incrementally.`);
    }
  }
}
/** Returns the distance metric this instance uses by default for searches. */
getDistanceMetric(): DistanceMetric {
  const metric = this.distanceMetric;
  return metric;
}
// --- Overridden Methods ---
/**
 * Stores a vector via the parent class, then assigns the stored copy to a
 * cluster (creating one when necessary).
 *
 * @param id - Optional id; the parent generates one when undefined.
 * @param vector - Vector data to store.
 * @param metadata - Optional metadata stored alongside the vector.
 * @returns The id under which the vector was stored.
 */
override addVector(id: number | string | undefined, vector: Vector, metadata?: Record<string, any>): number | string {
  const storedId = super.addVector(id, vector, metadata); // Parent owns storage
  const stored = this.memoryStorage.get(storedId);
  if (stored) {
    // Cluster bookkeeping uses the typed copy the parent actually stored.
    this._assignVectorToCluster(storedId, stored);
  }
  return storedId;
}
/**
 * Deletes a vector via the parent class and, on success, removes it from
 * its cluster as well.
 *
 * @param id - Id of the vector to delete.
 * @returns true when the parent actually deleted the vector.
 */
override deleteVector(id: number | string): boolean {
  // Capture the vector before the parent removes it from storage.
  const existing = this.memoryStorage.get(id);
  const removed = super.deleteVector(id);
  if (removed && existing) {
    this._removeVectorFromCluster(id);
  }
  return removed;
}
/**
 * Replaces the data of an existing vector and re-assigns it to a cluster.
 *
 * Fixes two defects in the previous implementation:
 * - it called `super.deleteVector` directly, bypassing this class's cluster
 *   bookkeeping, so the id stayed in its old cluster's member list and the
 *   old centroid was never corrected;
 * - the vector's metadata was silently dropped across the update.
 *
 * @param id - Id of the vector to update; must already exist.
 * @param vector - New vector data.
 * @returns true when the update succeeded, false when the id was unknown.
 */
override updateVector(id: number | string, vector: Vector): boolean {
  const oldVector = this.memoryStorage.get(id);
  if (!oldVector) {
    log('warn', `Attempted to update non-existent vector ID: ${id}`);
    return false;
  }
  // Preserve metadata across the delete/re-add cycle.
  const oldMetadata = this.metadata.get(id);
  // Remove stale cluster membership while the vector data needed for the
  // incremental centroid update is still present in memoryStorage.
  this._removeVectorFromCluster(id);
  const deleted = super.deleteVector(id);
  if (!deleted) {
    log('warn', `Attempted to update non-existent vector ID: ${id}`);
    return false;
  }
  const vectorId = super.addVector(id, vector, oldMetadata); // Let parent handle storage
  const typedVector = this.memoryStorage.get(vectorId);
  if (!typedVector) return false; // Should not happen
  this._assignVectorToCluster(vectorId, typedVector);
  return true;
}
/**
 * Approximate nearest-neighbour search: ranks cluster centroids by distance
 * to the query, collects the members of those clusters as candidates, then
 * scores the candidates exactly and returns the k closest.
 *
 * Falls back to a linear scan when no clusters exist. For 'cosine', clusters
 * and vectors whose dimensionality differs from the query are skipped.
 *
 * Fix: the entry log formatted k with a stray '}' ("k=10}").
 *
 * NOTE(review): the sorted cluster list is not truncated, so every cluster's
 * members are scored — effectively an exhaustive search. Confirm whether a
 * top-N cluster cut-off was intended before changing this.
 *
 * @param query - Query vector.
 * @param k - Maximum number of results to return (default 10).
 * @param options - Optional metadata filter and per-call metric override.
 * @returns Up to k results sorted by ascending distance.
 */
override findNearest(
  query: Vector,
  k: number = 10,
  options: {
    filter?: (id: number | string, metadata?: Record<string, any>) => boolean;
    metric?: DistanceMetric;
  } = {}
): SearchResult[] {
  log('info', `[ClusteredVectorDB] [findNearest] Searching for nearest vectors... with k=${k}`);
  const typedQuery = query instanceof Float32Array ? query : new Float32Array(query);
  const metric = options.metric ?? this.distanceMetric; // Use instance default or override
  const filter = options.filter;
  // Fallback to linear search if no clusters exist
  if (this.clusterCentroids.size === 0) {
    log('warn', 'No clusters found, falling back to linear search.');
    return this._linearSearch(typedQuery, k, metric, filter);
  }
  const queryDim = typedQuery.length;
  // 1. Find candidate clusters
  const clusterDistances: Array<{ key: number; dist: number }> = [];
  for (const [key, centroid] of this.clusterCentroids.entries()) {
    // Check dimension compatibility *before* calculating distance if metric requires it
    const centroidDim = this.clusterDimensions.get(key);
    if (metric === 'cosine' && centroidDim !== queryDim) {
      continue; // Skip incompatible dimensions for cosine
    }
    // Euclidean can handle mismatch (though results might be less meaningful)
    const dist = this._calculateDistance(typedQuery, centroid, metric);
    clusterDistances.push({ key, dist });
  }
  if (clusterDistances.length === 0) {
    // No compatible clusters found (e.g., cosine search with wrong dimension)
    return [];
  }
  const clustersToSearch = clusterDistances.sort((a, b) => a.dist - b.dist);
  log('info', `[ClusteredVectorDB] [findNearest] Found ${clustersToSearch.length} candidate clusters.`);
  // 2. Collect candidate vectors from selected clusters
  const candidateIds = new Set<number | string>();
  for (const { key } of clustersToSearch) {
    const clusterMembers = this.clusters.get(key) || [];
    for (const member of clusterMembers) {
      candidateIds.add(member.id);
    }
  }
  // 3. Perform exact search on candidates
  const results: SearchResult[] = [];
  for (const id of candidateIds) {
    const vector = this.memoryStorage.get(id);
    if (!vector) continue; // Should not happen if cluster list is sync'd
    // Apply filter if provided
    if (filter) {
      const meta = this.metadata.get(id);
      if (!filter(id, meta)) {
        continue;
      }
    }
    // Double-check dimension compatibility for the specific metric
    if (metric === 'cosine' && vector.length !== queryDim) {
      continue;
    }
    const dist = this._calculateDistance(typedQuery, vector, metric);
    results.push({ id, dist });
  }
  log('info', `[ClusteredVectorDB] [findNearest] Found ${results.length} candidates.`);
  // 4. Sort final results and return top k
  return results.sort((a, b) => a.dist - b.dist).slice(0, k);
}
// --- Clustering Logic ---
/**
 * Assigns a vector to the closest compatible cluster, creating a new cluster
 * when the vector is too far from every centroid or the best cluster is
 * over capacity.
 *
 * Fix: previously, when `maxClusters` had been reached and a best cluster
 * existed, the `size < maxClusters` term pushed control into the
 * create-new-cluster branch — so the cap was never enforced and was in fact
 * exceeded. The cap now suppresses new-cluster creation whenever an existing
 * cluster can take the vector.
 *
 * @param vectorId - Id of the vector (already stored in memoryStorage).
 * @param vector - The stored vector data.
 */
private _assignVectorToCluster(vectorId: number | string, vector: Float32Array): void {
  const vectorDim = vector.length;
  // Handle the very first vector
  if (this.clusterCentroids.size === 0) {
    this._createNewCluster(vectorId, vector);
    return;
  }
  // Find the best cluster (considering dimensions and distance)
  let bestClusterKey: number | null = null;
  let minDist = Infinity;
  for (const [key, centroid] of this.clusterCentroids.entries()) {
    const clusterDim = this.clusterDimensions.get(key);
    // Strict dimension check for cosine, optional for Euclidean (centroids should ideally match vector dims)
    if (this.distanceMetric === 'cosine' && clusterDim !== vectorDim) {
      continue;
    }
    const dist = this._calculateDistance(vector, centroid, this.distanceMetric);
    if (dist < minDist) {
      minDist = dist;
      bestClusterKey = key;
    }
  }
  if (bestClusterKey === null) {
    // No compatible cluster (e.g., first vector of this dimension for cosine).
    this._createNewCluster(vectorId, vector);
    return;
  }
  const clusterMembers = this.clusters.get(bestClusterKey);
  if (!clusterMembers) {
    // Should not happen if clusters and clusterCentroids are kept in sync.
    log('error', `Cluster ${bestClusterKey} not found when trying to add vector ${vectorId}`);
    this._createNewCluster(vectorId, vector);
    return;
  }
  const wantsNewCluster =
    // Reason 1: Cluster is getting too large
    clusterMembers.length >= this.targetClusterSize * this.newClusterThresholdFactor ||
    // Reason 2: Vector is too far from the closest centroid
    minDist > this.newClusterDistanceThreshold;
  // Honor the new-cluster request only while under the cluster cap.
  if (wantsNewCluster && this.clusterCentroids.size < this.maxClusters) {
    this._createNewCluster(vectorId, vector);
    return;
  }
  // Add to the best existing cluster and update its centroid incrementally
  // (cheaper than a full recalculation).
  clusterMembers.push({ id: vectorId });
  this._updateCentroidIncrementally(bestClusterKey, vector, 'add');
}
/**
 * Registers a brand-new cluster seeded by a single vector. A copy of the
 * seed vector becomes the initial centroid. Emits 'cluster:create'.
 *
 * @param initialVectorId - Id of the seed vector.
 * @param initialVector - Seed vector data.
 * @returns The key assigned to the new cluster.
 */
private _createNewCluster(initialVectorId: number | string, initialVector: Float32Array): number {
  const clusterKey = this.clusterIdCounter++;
  // Member lists hold ids only; vector data stays in memoryStorage.
  this.clusters.set(clusterKey, [{ id: initialVectorId }]);
  // Copy the seed so later centroid arithmetic never mutates stored data.
  this.clusterCentroids.set(clusterKey, initialVector.slice());
  this.clusterDimensions.set(clusterKey, initialVector.length);
  this.emit('cluster:create', {
    clusterId: clusterKey,
    vectorId: initialVectorId,
  });
  return clusterKey;
}
/**
 * Removes a vector id from whichever cluster currently holds it, then either
 * updates the centroid or deletes the now-empty cluster.
 *
 * Fix: previously, when the vector's data was no longer in memoryStorage
 * (the deleteVector path removes storage *before* cluster cleanup), the
 * whole cluster was deleted even though it still had members, orphaning
 * them. Now the centroid is recomputed from the surviving members instead.
 *
 * @param vectorId - Id of the vector being removed.
 */
private _removeVectorFromCluster(vectorId: number | string): void {
  // Locate the (single) cluster whose member list contains this id.
  let foundClusterKey: number | null = null;
  let indexToRemove: number | null = null;
  for (const [key, members] of this.clusters.entries()) {
    const index = members.findIndex((m) => m.id === vectorId);
    if (index !== -1) {
      foundClusterKey = key;
      indexToRemove = index;
      break;
    }
  }
  if (foundClusterKey === null || indexToRemove === null) {
    log('warn', `Vector ${vectorId} not found in any cluster during deletion.`);
    return;
  }
  const members = this.clusters.get(foundClusterKey)!;
  // Vector data may already be gone from storage; needed for the cheap
  // incremental centroid update.
  const vectorToRemove = this.memoryStorage.get(vectorId) ?? null;
  members.splice(indexToRemove, 1);
  if (members.length === 0) {
    // Cluster is now empty, remove it entirely.
    this.clusters.delete(foundClusterKey);
    this.clusterCentroids.delete(foundClusterKey);
    this.clusterDimensions.delete(foundClusterKey);
    this.emit('cluster:delete', { clusterId: foundClusterKey });
    return;
  }
  if (vectorToRemove) {
    // Fast path: adjust the centroid using the removed vector's data.
    this._updateCentroidIncrementally(foundClusterKey, vectorToRemove, 'remove');
  } else {
    // Vector data already purged — recompute the centroid from survivors
    // instead of dropping a still-populated cluster.
    this._recalculateCentroid(foundClusterKey);
  }
}
// More efficient centroid update without iterating all members
/**
 * Adjusts a cluster's centroid in O(d) after a single member was added or
 * removed, instead of averaging all members again.
 *
 * IMPORTANT: this must be called *after* the member list has been mutated —
 * `members.length` is the post-operation size, and the pre-operation size is
 * derived from it (length-1 for 'add', length+1 for 'remove'). The centroid
 * Float32Array is modified in place.
 *
 * Falls back to a full recalculation on dimension mismatch, and deletes the
 * cluster as a safeguard if it turns out to be empty.
 *
 * @param clusterKey - Key of the cluster to update.
 * @param vector - The vector that was added to / removed from the cluster.
 * @param operation - Whether the vector was just added or removed.
 */
private _updateCentroidIncrementally(clusterKey: number, vector: Float32Array, operation: 'add' | 'remove'): void {
  const centroid = this.clusterCentroids.get(clusterKey);
  const members = this.clusters.get(clusterKey);
  if (!centroid || !members) {
    log('error', `Cannot update centroid for non-existent cluster ${clusterKey}`);
    return;
  }
  if (centroid.length !== vector.length) {
    log('error', `Dimension mismatch during incremental centroid update for cluster ${clusterKey}`);
    // Incremental math is meaningless across dimensions; recompute instead.
    this._recalculateCentroid(clusterKey); // Fallback to full recalc
    return;
  }
  // members was already mutated by the caller: derive the pre-op size.
  const currentSize = operation === 'add' ? members.length - 1 : members.length + 1;
  const newSize = members.length;
  if (newSize === 0 || currentSize < 0) {
    // This should be handled by cluster deletion logic, but as a safeguard:
    if (newSize === 0) {
      this.clusters.delete(clusterKey);
      this.clusterCentroids.delete(clusterKey);
      this.clusterDimensions.delete(clusterKey);
    }
    return;
  }
  if (operation === 'add') {
    // new_centroid = (old_centroid * old_size + new_vector) / new_size
    for (let i = 0; i < centroid.length; i++) {
      centroid[i] = (centroid[i] * currentSize + vector[i]) / newSize;
    }
  } else {
    // operation === 'remove'
    // new_centroid = (old_centroid * old_size - removed_vector) / new_size
    for (let i = 0; i < centroid.length; i++) {
      centroid[i] = (centroid[i] * currentSize - vector[i]) / newSize;
    }
  }
  // No need to set back into map as we modified the array in place
  // this.clusterCentroids.set(clusterKey, centroid);
}
// Fallback centroid calculation
/**
 * Recomputes a cluster's centroid as the arithmetic mean of its members'
 * vectors, resolving ids against memoryStorage. Deletes the cluster when no
 * member vector can be resolved. Intended as a fallback when the cheap
 * incremental update is not applicable.
 *
 * @param clusterKey - Key of the cluster whose centroid is recomputed.
 */
private _recalculateCentroid(clusterKey: number): void {
  const dropCluster = () => {
    this.clusters.delete(clusterKey);
    this.clusterCentroids.delete(clusterKey);
    this.clusterDimensions.delete(clusterKey);
  };
  const members = this.clusters.get(clusterKey);
  if (!members || members.length === 0) {
    // Nothing to average — remove the cluster.
    dropCluster();
    return;
  }
  // Resolve member ids to vector data (inefficient, use only as fallback).
  const memberVectors: Float32Array[] = [];
  for (const member of members) {
    const vec = this.memoryStorage.get(member.id);
    if (vec) {
      memberVectors.push(vec);
    } else {
      log('warn', `Vector ${member.id} not found in memoryStorage during centroid recalc for cluster ${clusterKey}.`);
    }
  }
  if (memberVectors.length === 0) {
    // Cluster effectively empty — no resolvable vectors.
    dropCluster();
    return;
  }
  const dimensions = memberVectors[0].length;
  const centroid = new Float32Array(dimensions);
  // Accumulate the component-wise sum; vectors with a mismatched dimension
  // are logged and skipped.
  for (const vector of memberVectors) {
    if (vector.length !== dimensions) {
      log('error', `Inconsistent dimensions within cluster ${clusterKey} during recalc. Expected ${dimensions}, got ${vector.length}`);
      continue;
    }
    for (let i = 0; i < dimensions; i++) {
      centroid[i] += vector[i];
    }
  }
  // Average over all resolved vectors (matches prior behavior: the divisor
  // includes any skipped mismatched vectors).
  const count = memberVectors.length;
  for (let i = 0; i < dimensions; i++) {
    centroid[i] /= count;
  }
  this.clusterCentroids.set(clusterKey, centroid);
  this.clusterDimensions.set(clusterKey, dimensions); // Keep dimension in sync
}
// Method to rebuild all clusters from scratch (e.g., after loading)
/**
 * Discards all cluster state and re-derives it by streaming every stored
 * vector through the incremental assignment logic. Less effective than a
 * bulk K-Means pass, but reuses the existing code path.
 */
private _rebuildAllClusters(): void {
  this.clusters.clear();
  this.clusterCentroids.clear();
  this.clusterDimensions.clear();
  this.clusterIdCounter = 0;
  // Re-assign every stored vector; new clusters are created as needed.
  this.memoryStorage.forEach((vector, id) => {
    this._assignVectorToCluster(id, vector);
  });
}
// --- K-Means Implementation ---
/**
 * Runs K-Means over all stored vectors and replaces the current cluster
 * layout with the result. Considerably more expensive than the incremental
 * updates, but can produce better-quality clusters.
 *
 * Emits 'kmeans:start' before running and 'kmeans:complete' or
 * 'kmeans:error' afterwards; errors are logged and swallowed.
 *
 * @param k - Target cluster count; defaults to the current count (min 1).
 * @param maxIterations - Iteration cap; defaults to the configured value.
 * @returns A promise that resolves when K-Means completes.
 */
async runKMeans(k?: number, maxIterations?: number): Promise<void> {
  if (this.memoryStorage.size === 0) {
    log('info', '[ClusteredVectorDB] Skipping K-Means: No vectors in the database.');
    return;
  }
  const targetK = k ?? Math.max(1, this.clusterCentroids.size);
  const iterations = maxIterations ?? this.kmeansMaxIterations;
  log('info', `[ClusteredVectorDB] Starting K-Means with k=${targetK}, maxIterations=${iterations}...`);
  this.emit('kmeans:start', { k: targetK, iterations });
  const startedAt = Date.now();
  try {
    const allVectors = Array.from(this.memoryStorage.values());
    // Replace the instance-level KMeans so it reflects the requested k.
    this.kmeans = new KMeans(targetK, iterations);
    const centroids = await this.kmeans.cluster(allVectors);
    this._updateClustersFromKMeans(Array.from(this.memoryStorage.entries()), new Map<number | string, number>(), centroids);
    const elapsed = Date.now() - startedAt;
    log('info', `[ClusteredVectorDB] K-Means finished in ${elapsed}ms. New cluster count: ${this.clusterCentroids.size}`);
    this.emit('kmeans:complete', { k: this.clusterCentroids.size, iterations });
  } catch (error) {
    log('error', '[ClusteredVectorDB] Error during K-Means execution:', error);
    this.emit('kmeans:error', { error });
  }
}
/**
 * Replaces the entire cluster layout with the centroids produced by a
 * K-Means run: fresh cluster keys are minted for each centroid, then every
 * stored vector is assigned to its nearest centroid, and clusters that end
 * up empty are pruned.
 *
 * NOTE(review): the `assignments` parameter is never read — assignments are
 * recomputed here by nearest-centroid search. The parameter is kept for the
 * caller's signature; confirm whether KMeans was meant to supply it.
 *
 * @param allVectors - [id, vector] pairs for every stored vector.
 * @param assignments - Intended vectorId -> centroidIndex map (currently unused).
 * @param finalCentroids - Centroids produced by K-Means.
 */
private _updateClustersFromKMeans(
  allVectors: [number | string, Float32Array][],
  assignments: Map<number | string, number>, // vectorId -> centroidIndex
  finalCentroids: Float32Array[]
): void {
  // Clear existing cluster structures
  this.clusters.clear();
  this.clusterCentroids.clear();
  this.clusterDimensions.clear();
  this.clusterIdCounter = 0; // Reset counter, new keys will be assigned
  const centroidIndexToClusterKey: Map<number, number> = new Map();
  // Create new cluster structures based on final centroids
  for (let i = 0; i < finalCentroids.length; i++) {
    const centroid = finalCentroids[i];
    const newKey = this.clusterIdCounter++;
    centroidIndexToClusterKey.set(i, newKey); // Map K-Means index to new DB cluster key
    this.clusters.set(newKey, []); // Initialize empty member list { id: vectorId }[]
    this.clusterCentroids.set(newKey, centroid); // Already a copy
    this.clusterDimensions.set(newKey, centroid.length);
  }
  // Populate the member lists based on assignments
  for (const [vectorId, vector] of allVectors) {
    // Nearest-centroid assignment; for cosine, centroids of a different
    // dimension are not considered.
    let bestCentroidIndex = -1;
    let minDist = Infinity;
    for (let i = 0; i < finalCentroids.length; i++) {
      const centroid = finalCentroids[i];
      // Ensure dimension compatibility if needed by metric
      if (this.distanceMetric === 'cosine' && vector.length !== centroid.length) {
        continue;
      }
      const dist = this._calculateDistance(vector, centroid, this.distanceMetric);
      if (dist < minDist) {
        minDist = dist;
        bestCentroidIndex = i;
      }
    }
    const centroidIndex = bestCentroidIndex;
    // NOTE: the !== undefined check is redundant (bestCentroidIndex is always
    // a number); -1 signals "no compatible centroid".
    if (centroidIndex !== undefined && centroidIndex !== -1) {
      const clusterKey = centroidIndexToClusterKey.get(centroidIndex);
      if (clusterKey !== undefined) {
        const members = this.clusters.get(clusterKey);
        members?.push({ id: vectorId }); // Add vector ID object
      } else {
        // Should not happen if mapping is correct
        log('warn', `[ClusteredVectorDB] K-Means Update: Could not find cluster key for centroid index ${centroidIndex}`);
      }
    } else {
      // Vector wasn't assigned (e.g., dimension mismatch)
      log('warn', `[ClusteredVectorDB] K-Means Update: Vector ${vectorId} has no assignment.`);
      // Decide how to handle unassigned vectors: create separate cluster? Ignore?
    }
  }
  // Optional: Clean up any clusters that ended up empty despite having a centroid
  const keysToDelete: number[] = [];
  for (const [key, members] of this.clusters.entries()) {
    if (members.length === 0) {
      keysToDelete.push(key);
    }
  }
  for (const key of keysToDelete) {
    this.clusters.delete(key);
    this.clusterCentroids.delete(key);
    this.clusterDimensions.delete(key);
    log('info', `[ClusteredVectorDB] K-Means Update: Removed empty cluster ${key}.`);
  }
}
// --- Stats (Override) ---
/**
 * Extends the base statistics with per-cluster information (sizes,
 * dimensions, centroid norms, member lists) and a rough estimate of the
 * extra memory consumed by the clustering structures.
 */
override getStats(): DBStats {
  const stats = super.getStats(); // Base VectorDB stats
  // Tally member counts per cluster.
  const clusterSizes: Record<number, number> = {};
  let clusteredVectorCount = 0;
  for (const [key, members] of this.clusters.entries()) {
    clusterSizes[key] = members.length;
    clusteredVectorCount += members.length;
  }
  const clusterDims: Record<number, number> = {};
  for (const [key, dim] of this.clusterDimensions.entries()) {
    clusterDims[key] = dim;
  }
  const clusterCount = this.clusterCentroids.size;
  stats.clusters = {
    count: clusterCount,
    avgSize: clusterCount > 0 ? clusteredVectorCount / clusterCount : 0,
    dimensions: clusterDims, // Dimension per cluster key
    distribution: Object.entries(clusterSizes).map(([keyStr, size]) => {
      const key = parseInt(keyStr, 10); // Object keys are strings; restore number
      const centroid = this.clusterCentroids.get(key);
      return {
        id: key, // Cluster ID
        size,
        dimension: this.clusterDimensions.get(key) || 0,
        // Norm only when a centroid exists for this key
        centroidNorm: centroid ? this._calculateNorm(centroid) : 0,
        members: this.clusters.get(key) || [], // Member (vector ID) list
      };
    }),
  };
  // Rough memory overhead of the clustering structures.
  let clusterOverhead = 0;
  for (const centroid of this.clusterCentroids.values()) {
    clusterOverhead += centroid.byteLength; // Centroid memory
  }
  clusterOverhead += this.clusters.size * 16; // Map overhead
  clusterOverhead += this.clusterDimensions.size * 8; // Map overhead
  for (const members of this.clusters.values()) {
    clusterOverhead += members.length * 8; // ~8 bytes per ID reference (crude)
  }
  stats.memoryUsage = (stats.memoryUsage ?? 0) + clusterOverhead;
  return stats;
}
override async close(): Promise<void> {
await super.close(); // Call parent close (saves data, clears base maps)
// Parent clear methods already handle memoryStorage, metadata, vectorDimensions
// Clear clustering structures
this.clusters.clear();
this.clusterCentroids.clear();
this.clusterDimensions.clear();
// No need to emit 'db:close' again, parent does it
}
// --- Public Cluster Info Method ---
/**
 * Returns a snapshot describing every cluster: key, centroid, member count
 * and dimensionality.
 *
 * Note: centroids are returned by reference (not copied); callers must not
 * mutate them.
 */
getClusterInfo(): Array<{
  id: number;
  centroid: Float32Array;
  size: number;
  dimension: number;
}> {
  return Array.from(this.clusterCentroids.entries()).map(([id, centroid]) => ({
    id,
    centroid,
    size: this.clusters.get(id)?.length ?? 0,
    // Prefer the tracked dimension; fall back to the centroid's own length.
    dimension: this.clusterDimensions.get(id) ?? centroid.length,
  }));
}
/**
 * Finds every pair of stored vectors whose distance is at or below the
 * given threshold, via an exhaustive O(n^2) pairwise scan. Pairs with
 * mismatched dimensions are logged and skipped.
 *
 * @param threshold - Maximum distance for two vectors to count as related.
 * @param metric - Distance metric to use; defaults to the instance metric.
 * @returns One entry per related pair: both ids, their distance, and
 *          shallow copies of each vector's metadata when present.
 */
public extractRelationships(
  threshold: number,
  metric: DistanceMetric = this.distanceMetric
): Array<{
  vector1: number | string;
  vector2: number | string;
  distance: number;
  metadata1?: Record<string, any>;
  metadata2?: Record<string, any>;
}> {
  const pairs: Array<{
    vector1: number | string;
    vector2: number | string;
    distance: number;
    metadata1?: Record<string, any>;
    metadata2?: Record<string, any>;
  }> = [];
  const entries = Array.from(this.memoryStorage.entries());
  // Each unordered pair is visited exactly once (j starts at i + 1).
  for (let i = 0; i < entries.length; i++) {
    const [idA, vectorA] = entries[i];
    for (let j = i + 1; j < entries.length; j++) {
      const [idB, vectorB] = entries[j];
      if (vectorA.length !== vectorB.length) {
        log('warn', `Dimension mismatch between vector ${idA} and ${idB}, skipping.`);
        continue;
      }
      const distance = this._calculateDistance(vectorA, vectorB, metric);
      if (distance > threshold) {
        continue;
      }
      const metadataA = this.metadata.get(idA);
      const metadataB = this.metadata.get(idB);
      pairs.push({
        vector1: idA,
        vector2: idB,
        distance,
        // Shallow-copy metadata so callers cannot mutate stored records.
        metadata1: metadataA ? { ...metadataA } : undefined,
        metadata2: metadataB ? { ...metadataB } : undefined
      });
    }
  }
  log('info', `[ClusteredVectorDB] Extracted ${pairs.length} relationships.`);
  return pairs;
}
/**
 * Extract communities of related vectors based on distance threshold.
 * Uses cluster information to optimize the community detection process:
 * vectors are only compared against vectors in their own cluster or in
 * clusters whose centroid lies within 2*threshold, then connected
 * components of the resulting graph are returned.
 *
 * NOTE(review): the 2*threshold pruning relies on the triangle inequality,
 * which holds for euclidean but not for cosine "distance" — confirm the
 * bound is acceptable for cosine before relying on exact recall.
 * NOTE(review): DFS is recursive; a very large connected community could
 * exhaust the call stack — consider an explicit stack if that matters.
 *
 * @param threshold - The maximum distance between vectors to consider them related
 * @param metric - Distance metric to use (e.g., 'cosine', 'euclidean')
 * @returns Array of communities (size >= 2), each an array of {id, metadata}
 */
override extractCommunities(
  threshold: number,
  metric: DistanceMetric = this.distanceMetric
): Array<Array<{
  id: number | string;
  metadata?: Record<string, any>;
}>> {
  log('info', `[ClusteredVectorDB] Extracting vector communities with threshold ${threshold}...`);
  // We can optimize by first checking distances between cluster centroids
  // Only compare vectors in clusters whose centroids are within (2 * threshold) distance
  // This is an approximation that works because of the triangle inequality property
  const clusterAdjacency = new Map<number, Set<number>>();
  // Build cluster adjacency graph (pairwise over centroids)
  for (const [keyA, centroidA] of this.clusterCentroids.entries()) {
    clusterAdjacency.set(keyA, new Set());
    for (const [keyB, centroidB] of this.clusterCentroids.entries()) {
      if (keyA === keyB) continue; // Skip self
      // Skip if dimension mismatch for cosine
      if (metric === 'cosine' && centroidA.length !== centroidB.length) {
        continue;
      }
      // Calculate inter-cluster distance
      const distance = this._calculateDistance(centroidA, centroidB, metric);
      // Use 2*threshold as a conservative bound due to triangle inequality
      if (distance <= 2 * threshold) {
        clusterAdjacency.get(keyA)?.add(keyB);
      }
    }
  }
  // Build the vector graph, but only consider vectors in nearby clusters
  const graph = new Map<number | string, Set<number | string>>();
  // Initialize graph with empty adjacency lists (every stored vector is a node)
  for (const [id] of this.memoryStorage.entries()) {
    graph.set(id, new Set());
  }
  // For each cluster
  for (const [clusterKey, members] of this.clusters.entries()) {
    // A cluster is always "related" to itself plus its adjacency set.
    const relatedClusters = new Set([clusterKey, ...(clusterAdjacency.get(clusterKey) || [])]);
    // Get all vectors in this cluster
    const clusterVectors = members.map(m => m.id);
    // For each vector in this cluster
    for (const vectorId of clusterVectors) {
      const vector = this.memoryStorage.get(vectorId);
      if (!vector) continue;
      // Compare with vectors in related clusters
      for (const relatedClusterKey of relatedClusters) {
        const relatedMembers = this.clusters.get(relatedClusterKey) || [];
        for (const relatedMember of relatedMembers) {
          const relatedId = relatedMember.id;
          // Skip self comparison
          if (vectorId === relatedId) continue;
          // Skip if already checked (undirected graph)
          if (graph.get(vectorId)?.has(relatedId)) continue;
          const relatedVector = this.memoryStorage.get(relatedId);
          if (!relatedVector) continue;
          // Ensure dimension compatibility
          if (vector.length !== relatedVector.length) {
            continue;
          }
          // Calculate distance
          const distance = this._calculateDistance(vector, relatedVector, metric);
          // Add edge (both directions) if distance is within threshold
          if (distance <= threshold) {
            graph.get(vectorId)?.add(relatedId);
            graph.get(relatedId)?.add(vectorId);
          }
        }
      }
    }
  }
  // Use depth-first search to find connected components (communities)
  const visited = new Set<number | string>();
  const communities: Array<Array<{
    id: number | string;
    metadata?: Record<string, any>;
  }>> = [];
  for (const [id] of graph.entries()) {
    if (!visited.has(id)) {
      const community: Array<{
        id: number | string;
        metadata?: Record<string, any>;
      }> = [];
      // DFS to find all connected vectors
      const dfs = (nodeId: number | string) => {
        visited.add(nodeId);
        const metadata = this.metadata.get(nodeId);
        community.push({
          id: nodeId,
          // Shallow copy so callers cannot mutate stored metadata records
          metadata: metadata ? { ...metadata } : undefined
        });
        // Visit all neighbors
        const neighbors = graph.get(nodeId) || new Set();
        for (const neighbor of neighbors) {
          if (!visited.has(neighbor)) {
            dfs(neighbor);
          }
        }
      };
      dfs(id);
      // Only include communities with at least 2 vectors
      if (community.length > 1) {
        communities.push(community);
      }
    }
  }
  log('info', `[ClusteredVectorDB] Found ${communities.length} communities`);
  return communities;
}
}
// --- END OF FILE clustered_vector_db.ts ---