// @n2flowjs/nbase
// Version:
// Neural Vector Database for efficient similarity search
// 942 lines (815 loc) • 34 kB
// text/typescript
// --- START OF FILE vector_db.ts ---
import { EventEmitter } from 'events';
import path from 'path';
import { promises as fsPromises, existsSync } from 'fs';
import zlib from 'zlib';
import { promisify } from 'util';
import { log } from '../utils/log';
const gzip = promisify(zlib.gzip);
const gunzip = promisify(zlib.gunzip);
import config from '../config'; // Assuming config exists and has defaults
import { Vector, DBStats, SearchResult, VectorData, DistanceMetric, TypedEventEmitter, VectorDBEventData } from '../types';
/**
 * Exposes the bytes of a Float32Array as a Node.js Buffer.
 *
 * The returned Buffer is a view over the same memory as `vector` (no copy is
 * made), restricted to the exact byte window the typed array covers.
 *
 * @param vector - The vector whose raw bytes should be exposed.
 * @returns A Buffer of `vector.byteLength` bytes sharing memory with `vector`.
 */
function serializeVector(vector: Float32Array): Buffer {
  const { buffer: backingStore, byteOffset: start, byteLength: size } = vector;
  // Passing offset and length keeps sub-array views from leaking neighbouring bytes.
  return Buffer.from(backingStore, start, size);
}
/**
 * Reinterprets the bytes of a Buffer as a Float32Array of `dimension` elements.
 *
 * When the Buffer starts on a 4-byte boundary inside its backing ArrayBuffer the
 * result is a zero-copy view over the same memory. A typed-array view cannot
 * start at an unaligned offset (the constructor throws a RangeError), so in
 * that case the bytes are copied into a freshly allocated Float32Array.
 *
 * @param buffer - Raw bytes holding `dimension` float32 values in platform byte order.
 * @param dimension - Number of float32 elements expected in `buffer`.
 * @returns The decoded vector (a view when aligned, a copy otherwise).
 * @throws Error if `buffer.length` is not exactly `dimension * 4` bytes.
 */
function deserializeVector(buffer: Buffer, dimension: number): Float32Array {
  const bytesPerElement = Float32Array.BYTES_PER_ELEMENT;
  if (buffer.length !== dimension * bytesPerElement) {
    // Basic sanity check (4 bytes per float32); also rejects NaN / negative dimensions.
    throw new Error(`Buffer length ${buffer.length} does not match expected size for dimension ${dimension}`);
  }
  if (buffer.byteOffset % bytesPerElement === 0) {
    // Aligned: view the same memory as the Buffer without copying.
    return new Float32Array(buffer.buffer, buffer.byteOffset, dimension);
  }
  // Unaligned (e.g. a slice of a larger allocation): copy byte-for-byte instead.
  const copy = new Float32Array(dimension);
  new Uint8Array(copy.buffer).set(buffer);
  return copy;
}
// --- VectorDB Base Class (Optimized Persistence) ---
/**
 * Vector database for storing and retrieving high-dimensional vectors with associated metadata.
 * Vectors live in memory; when a database path is set they can be persisted to
 * (and reloaded from) a pair of files: `meta.json` and `vec.bin`, each optionally gzipped.
 *
 * Key features:
 * - Store vectors with numeric or string IDs
 * - Associate metadata with vectors
 * - Find nearest neighbors with a linear scan using euclidean or cosine distance
 * - Persist database to disk with optional compression
 * - Event-based architecture for operation monitoring
 *
 * @example
 * ```typescript
 * // Create an in-memory database with default vector size of 3
 * const db = new VectorDB(3);
 *
 * // Add vectors with metadata
 * const id = db.addVector(undefined, new Float32Array([0.1, 0.2, 0.3]), { source: 'example' });
 *
 * // Find similar vectors
 * const results = db.findNearest([0.1, 0.15, 0.25], 5, { metric: 'cosine' });
 *
 * // Save to disk
 * db.setPath('./vector_data');
 * await db.save();
 * ```
 *
 * @fires vector:add - When a vector is added
 * @fires vectors:bulkAdd - When multiple vectors are added
 * @fires vector:update - When a vector is replaced
 * @fires vector:delete - When a vector is deleted
 * @fires metadata:add - When metadata is added to a vector
 * @fires metadata:update - When vector metadata is updated
 * @fires db:save - When the database is saved to disk
 * @fires db:load - When the database is loaded from disk
 * @fires db:close - When the database is closed
 *
 * @extends EventEmitter
 */
export class VectorDB extends (EventEmitter as new () => TypedEventEmitter<VectorDBEventData>) {
  /** Preferred vector dimension; adopted from the first vector added when not given up front. */
  public defaultVectorSize: number | null = null;
  /** Vector payloads keyed by ID. */
  public memoryStorage: Map<number | string, Float32Array>;
  /** Optional metadata keyed by vector ID. */
  protected metadata: Map<number | string, Record<string, any>>;
  /** Dimension of each stored vector, keyed by ID. */
  protected vectorDimensions: Map<number | string, number>;
  /** Next auto-assigned numeric ID. */
  protected idCounter: number;
  /** Directory that holds the persisted files, or null for a purely in-memory DB. */
  protected dbPath: string | null;
  /** In-flight save, used so concurrent save() calls share one write. */
  protected savePromise: Promise<void> | null = null;
  /** Set as soon as close() starts; blocks further save()/load() calls from outside. */
  protected isClosed: boolean = false;
  /** True only while close() is running, so its final save() is allowed through. */
  protected isClosing: boolean = false;
  /** Whether persisted files are gzipped (and carry a `.gz` suffix). */
  protected useCompression: boolean;
  /** True once the database can be used: immediately for in-memory DBs, after the initial load attempt otherwise. */
  public isReady: boolean = false;

  /**
   * @param suggestedVectorSize - Preferred vector dimension, or null to adopt the first vector's.
   * @param dbPath - Directory for persistence. When given, an asynchronous load is started immediately.
   * @param options - `useCompression` gzips the persisted files (default false).
   */
  constructor(suggestedVectorSize: number | null = null, dbPath: string | null = null, options: { useCompression?: boolean } = {}) {
    super();
    this.defaultVectorSize = suggestedVectorSize;
    this.memoryStorage = new Map();
    this.metadata = new Map();
    this.vectorDimensions = new Map();
    this.idCounter = 1;
    this.dbPath = dbPath;
    this.useCompression = options.useCompression ?? false; // Default to no compression

    if (dbPath) {
      log('info', `[VectorDB] Constructor Loading database from ${dbPath}...`);
      this.load()
        .catch((err) => {
          // Only log error if file likely existed but failed to load
          if (err?.code !== 'ENOENT') {
            log('error', `Error loading database from ${dbPath}:`, err);
          } else {
            log('info', `Database files not found at ${dbPath}, starting fresh.`);
          }
        })
        .finally(() => {
          this.isReady = true; // Set ready flag after load attempt
        });
    } else {
      // Nothing to load: an in-memory database is usable right away.
      this.isReady = true;
    }
  }

  /** Returns the next numeric ID that would be auto-assigned. */
  getIdCounter(): number {
    return this.idCounter;
  }

  // --- Core Methods ---

  /** Sets the directory used by save() and load(). Does not trigger a load. */
  setPath(dbPath: string): void {
    this.dbPath = dbPath;
  }

  /**
   * Returns the database's representative vector dimension: the explicit default
   * if set, otherwise the most common stored dimension, otherwise the configured
   * default (or 0).
   */
  vectorSize(): number {
    if (this.defaultVectorSize !== null) {
      return this.defaultVectorSize;
    }
    if (this.memoryStorage.size === 0) {
      return config.defaults.vectorSize || 0;
    }
    const dimensionCounts = new Map<number, number>();
    let maxCount = 0;
    let mostCommonSize = config.defaults.vectorSize || 0;
    for (const dim of this.vectorDimensions.values()) {
      const count = (dimensionCounts.get(dim) || 0) + 1;
      dimensionCounts.set(dim, count);
      // Strict '>' keeps the first dimension to reach the top count on ties.
      if (count > maxCount) {
        maxCount = count;
        mostCommonSize = dim;
      }
    }
    return mostCommonSize;
  }

  /**
   * Stores a vector, overwriting any existing vector with the same ID (a warning is logged).
   *
   * @param id - Explicit ID, or undefined to auto-assign the next numeric ID.
   * @param vector - Vector data; a Float32Array is stored as-is (not copied), anything else is converted.
   * @param metadata - Optional metadata to associate with the vector.
   * @returns The ID under which the vector was stored.
   */
  addVector(
    id: number | string | undefined,
    vector: Vector,
    metadata?: Record<string, any> // Allow adding metadata directly
  ): number | string {
    const vectorId = id !== undefined ? id : this.idCounter++;
    if (this.memoryStorage.has(vectorId)) {
      log('warn', `Vector with ID ${vectorId} already exists. Overwriting.`);
    }
    const typedVector = vector instanceof Float32Array ? vector : new Float32Array(vector);
    const dimension = typedVector.length;
    // The very first vector defines the default dimension when none was suggested.
    if (this.memoryStorage.size === 0 && this.defaultVectorSize === null) {
      this.defaultVectorSize = dimension;
    }
    this.memoryStorage.set(vectorId, typedVector);
    this.vectorDimensions.set(vectorId, dimension);
    if (metadata) {
      this.metadata.set(vectorId, metadata);
    }
    // Keep auto-assigned IDs ahead of any explicitly provided numeric ID.
    if (typeof vectorId === 'number' && vectorId >= this.idCounter) {
      this.idCounter = vectorId + 1;
    }
    this.emit('vector:add', { id: vectorId, dimensions: dimension });
    return vectorId;
  }

  /**
   * Adds several vectors via addVector(). Failures of individual items are
   * logged and skipped rather than aborting the batch.
   *
   * @returns The number of vectors successfully added.
   */
  bulkAdd(vectors: VectorData[]): number {
    log('info', `[VectorDB] Bulk adding ${vectors.length} vectors...`);
    let addedCount = 0;
    const addedIds: (number | string)[] = [];
    for (const item of vectors) {
      try {
        const id = this.addVector(item.id, item.vector, item.metadata);
        addedCount++;
        addedIds.push(id);
      } catch (error) {
        log('error', `Error adding vector ${item.id}:`, error);
      }
    }
    // Verify vectors were actually added to memory storage
    if (addedCount > 0 && this.memoryStorage.size === 0) {
      log('warn', '[VectorDB] Warning: bulkAdd reported success but memoryStorage is empty');
    } else {
      log('info', `[VectorDB] Successfully added ${addedCount} vectors to memory storage. Storage size: ${this.memoryStorage.size}`);
    }
    this.emit('vectors:bulkAdd', { count: addedCount, ids: addedIds });
    return addedCount;
  }

  /** Returns the stored vector for `id` (exact key match; 1 and '1' are different IDs), or null. */
  getVector(id: number | string): Float32Array | null {
    return this.memoryStorage.get(id) ?? null;
  }

  /** Returns true if a vector is stored under exactly this ID. */
  hasVector(id: number | string): boolean {
    return this.memoryStorage.has(id);
  }

  /**
   * Removes a vector together with its metadata and dimension record.
   * @returns True if a vector was removed, false if the ID was unknown.
   */
  deleteVector(id: number | string): boolean {
    const deleted = this.memoryStorage.delete(id);
    if (deleted) {
      this.metadata.delete(id);
      this.vectorDimensions.delete(id);
      this.emit('vector:delete', { id });
    }
    return deleted;
  }

  /**
   * Replaces the data of an existing vector; metadata is left untouched.
   * @returns False (with a logged warning) if the ID does not exist.
   */
  updateVector(id: number | string, vector: Vector): boolean {
    if (!this.memoryStorage.has(id)) {
      log('warn', `Attempted to update non-existent vector ID: ${id}`);
      return false;
    }
    const typedVector = vector instanceof Float32Array ? vector : new Float32Array(vector);
    const dimension = typedVector.length;
    this.memoryStorage.set(id, typedVector);
    this.vectorDimensions.set(id, dimension);
    this.emit('vector:update', { id, dimensions: dimension });
    return true;
  }

  /**
   * Sets (replaces) the metadata of an existing vector.
   * @throws Error if no vector is stored under `id`.
   */
  addMetadata(id: number | string, data: Record<string, any>): void {
    if (!this.memoryStorage.has(id)) {
      throw new Error(`Vector with ID ${id} not found`);
    }
    this.metadata.set(id, data);
    this.emit('metadata:add', { id, metadata: data });
  }

  /** Returns the metadata stored for `id` (the stored object itself, not a copy), or null. */
  getMetadata(id: number | string): Record<string, any> | null {
    return this.metadata.get(id) ?? null;
  }

  /**
   * Updates the metadata of an existing vector.
   *
   * @param data - Either an object that is shallow-merged over the current
   *   metadata, or a function that receives the current metadata (or null) and
   *   returns the full replacement.
   * @returns False (with a logged warning) if the ID does not exist.
   */
  updateMetadata(id: number | string, data: Record<string, any> | ((current: Record<string, any> | null) => Record<string, any>)): boolean {
    if (!this.memoryStorage.has(id)) {
      log('warn', `Attempted to update metadata for non-existent vector ID: ${id}`);
      return false;
    }
    const current = this.metadata.get(id) || null;
    const updated: Record<string, any> = typeof data === 'function' ? data(current) : { ...(current || {}), ...data };
    this.metadata.set(id, updated);
    this.emit('metadata:update', { id, metadata: updated });
    return true;
  }

  /** Returns the recorded dimension of the vector stored under `id`, or null. */
  getVectorDimension(id: number | string): number | null {
    return this.vectorDimensions.get(id) ?? null;
  }

  // --- Distance Calculations ---

  /** Euclidean (L2) norm of a vector. */
  protected _calculateNorm(vector: Float32Array): number {
    let sum = 0;
    for (let i = 0; i < vector.length; i++) {
      sum += vector[i] * vector[i];
    }
    return Math.sqrt(sum);
  }

  /** Dot product over the shared prefix of both vectors (extra components are ignored). */
  protected _dotProduct(a: Float32Array, b: Float32Array): number {
    const len = Math.min(a.length, b.length); // Handle dimension mismatch
    let dot = 0;
    for (let i = 0; i < len; i++) {
      dot += a[i] * b[i];
    }
    return dot;
  }

  /**
   * Euclidean distance over the shared prefix of both vectors. When dimensions
   * differ, a penalty of `dimensionMismatchPenalty` (config, default 0.01) per
   * missing component is added to the squared sum before the square root.
   */
  protected _euclideanDistance(a: Float32Array, b: Float32Array): number {
    const len = Math.min(a.length, b.length);
    let sum = 0;
    for (let i = 0; i < len; i++) {
      const d = a[i] - b[i];
      sum += d * d;
    }
    const dimDiff = Math.abs(a.length - b.length);
    if (dimDiff > 0) {
      sum += dimDiff * (config.defaults.dimensionMismatchPenalty ?? 0.01);
    }
    return Math.sqrt(sum);
  }

  /**
   * Cosine distance, i.e. `1 - cosine similarity` (lower is closer, range [0, 2]).
   * Returns 1.0 when either vector has zero norm or when dimensions differ
   * (the latter also logs a warning).
   */
  protected _cosineDistance(a: Float32Array, b: Float32Array): number {
    const normA = this._calculateNorm(a);
    const normB = this._calculateNorm(b);
    if (normA === 0 || normB === 0) {
      return 1.0; // Direction is undefined for zero vectors.
    }
    if (a.length !== b.length) {
      log('warn', `Cosine distance called on vectors with different dimensions (${a.length} vs ${b.length}). Returning max distance.`);
      return 1.0;
    }
    const dot = this._dotProduct(a, b);
    // Clamp the result to [-1, 1] due to potential floating point inaccuracies
    const similarity = Math.max(-1.0, Math.min(1.0, dot / (normA * normB)));
    return 1.0 - similarity;
  }

  /** Dispatches to the distance function for `metric`; anything but 'cosine' uses euclidean. */
  protected _calculateDistance(a: Float32Array, b: Float32Array, metric: DistanceMetric): number {
    switch (metric) {
      case 'cosine':
        return this._cosineDistance(a, b);
      case 'euclidean':
      default: // Default to Euclidean
        return this._euclideanDistance(a, b);
    }
  }

  // --- Search (Linear Scan - Base Implementation) ---

  /**
   * Returns up to `k` stored vectors closest to `query`, nearest first.
   *
   * @param query - Query vector (converted to Float32Array if needed).
   * @param k - Maximum number of results (default 10); values <= 0 yield no results.
   * @param options - `filter` is called with (id, metadata) and must return true
   *   to keep a candidate; `metric` selects the distance (default 'euclidean').
   */
  findNearest(
    query: Vector,
    k: number = 10,
    options: {
      filter?: (id: number | string, metadata?: Record<string, any>) => boolean;
      metric?: DistanceMetric; // Allow specifying metric
    } = {}
  ): SearchResult[] {
    const typedQuery = query instanceof Float32Array ? query : new Float32Array(query);
    const metric = options.metric ?? 'euclidean';
    return this._linearSearch(typedQuery, k, metric, options.filter);
  }

  /**
   * Exhaustive scan: computes the distance from `query` to every stored vector
   * that passes `filter`, then returns the `k` smallest distances in ascending order.
   * For the cosine metric, vectors whose dimension differs from the query are skipped.
   */
  protected _linearSearch(query: Float32Array, k: number, metric: DistanceMetric, filter?: (id: number | string, metadata?: Record<string, any>) => boolean): SearchResult[] {
    // A non-positive (or NaN) k can never yield results; a negative k would
    // otherwise make slice() drop items from the end instead of limiting.
    if (!(k > 0)) {
      return [];
    }
    const results: SearchResult[] = [];
    const queryDim = query.length;
    for (const [id, vector] of this.memoryStorage.entries()) {
      // Filter first to potentially skip the distance calculation.
      if (filter && !filter(id, this.metadata.get(id))) {
        continue;
      }
      // Cosine requires same dimensions; euclidean tolerates a mismatch (with penalty).
      if (metric === 'cosine' && vector.length !== queryDim) {
        continue;
      }
      const dist = this._calculateDistance(query, vector, metric);
      results.push({ id, dist });
    }
    // Both metrics are distances (lower is better), so ascending order is correct.
    return results.sort((a, b) => a.dist - b.dist).slice(0, k);
  }

  // --- Persistence ---

  /**
   * Path of the metadata file inside dbPath (`meta.json`, plus `.gz` when compressed).
   * @throws Error if no database path is set.
   */
  protected _getMetaFilePath(): string {
    if (!this.dbPath) throw new Error('DB path not set');
    return path.join(this.dbPath, 'meta.json' + (this.useCompression ? '.gz' : ''));
  }

  /**
   * Path of the vector data file inside dbPath (`vec.bin`, plus `.gz` when compressed).
   * @throws Error if no database path is set.
   */
  protected _getVectorFilePath(): string {
    if (!this.dbPath) throw new Error('DB path not set');
    return path.join(this.dbPath, 'vec.bin' + (this.useCompression ? '.gz' : ''));
  }

  /**
   * Persists the database to dbPath.
   *
   * Does nothing (with a logged warning) when no path is set or when the
   * database has been closed; the one exception is the final save issued by
   * close() itself. Concurrent calls share the in-flight save.
   *
   * @throws Re-throws any error raised while writing the files.
   */
  async save(): Promise<void> {
    log('info', '[VectorDB] Saving database...');
    if (!this.dbPath) {
      log('warn', '[VectorDB] No dbPath specified, skipping save.');
      return;
    }
    // close() marks the DB closed before its final save, so that save must be let through.
    if (this.isClosed && !this.isClosing) {
      log('warn', '[VectorDB] Attempted to save a closed database.');
      return;
    }
    log('info', `[VectorDB] Saving to ${this.dbPath}`);
    if (this.savePromise) {
      log('info', `[VectorDB] Save already in progress, waiting...`);
      return this.savePromise;
    }
    // Release the lock in a .finally attached here (it always runs asynchronously,
    // i.e. after the assignment below), so the lock can never be left pointing at
    // an already-settled promise.
    const pending: Promise<void> = this._writeSnapshot().finally(() => {
      if (this.savePromise === pending) {
        this.savePromise = null;
      }
    });
    this.savePromise = pending;
    log('info', '[VectorDB] Save promise created.');
    return pending;
  }

  /**
   * Writes the current in-memory state to the metadata and vector files and
   * emits 'db:save'. The metadata file is JSON holding the format version, the
   * ID counter, all metadata, and for each vector its byte offset/length inside
   * the vector file; the vector file is the concatenation of all raw vectors.
   */
  protected async _writeSnapshot(): Promise<void> {
    const metaFilePath = this._getMetaFilePath();
    const vectorFilePath = this._getVectorFilePath();
    log('info', '[VectorDB] Meta file path:', metaFilePath);
    log('info', '[VectorDB] Vector file path:', vectorFilePath);
    try {
      // Ensure directory exists
      await fsPromises.mkdir(path.dirname(metaFilePath), { recursive: true });

      const metaData: Record<string, any> = {};
      this.metadata.forEach((value, key) => {
        // JSON object keys are strings; load() looks metadata up via String(id).
        metaData[String(key)] = value;
      });

      // 1. Prepare vector data and the index describing where each vector lives
      const vectorInfo: Array<{
        id: number | string;
        offset: number;
        length: number;
        dim: number;
      }> = [];
      const vectorBuffers: Buffer[] = [];
      let currentOffset = 0;
      log('info', `[VectorDB] Preparing vector data for saving with ${this.memoryStorage.size} vectors...`);
      for (const [id, vector] of this.memoryStorage.entries()) {
        const vectorBuffer = serializeVector(vector);
        vectorBuffers.push(vectorBuffer);
        vectorInfo.push({
          id: id,
          offset: currentOffset,
          length: vectorBuffer.length, // Byte length
          dim: vector.length, // Element count
        });
        currentOffset += vectorBuffer.length;
      }
      const saveData = {
        version: 1, // Format version checked by load()
        defaultVectorSize: this.defaultVectorSize,
        idCounter: this.idCounter,
        vectors: vectorInfo,
        metadata: metaData,
      };
      log('info', `[VectorDB] Vector data prepared for saving: ${vectorInfo.length} vectors`);

      // 2. Write metadata file
      log('info', `[VectorDB] Writing metadata file to: ${metaFilePath} with ${vectorInfo.length} vectors`);
      let metaContent: string | Buffer = JSON.stringify(saveData);
      if (this.useCompression) {
        metaContent = await gzip(metaContent);
      }
      log('info', '[VectorDB] Writing meta file to:', metaFilePath);
      await fsPromises.writeFile(metaFilePath, metaContent);
      log('info', '[VectorDB] Meta file written successfully.');

      // 3. Write vector data file
      let vectorContent: Buffer = Buffer.concat(vectorBuffers);
      if (this.useCompression) {
        vectorContent = await gzip(vectorContent);
      }
      log('info', `[VectorDB] Writing vector file to: ${vectorFilePath} (${vectorBuffers.length} vectors, ${vectorContent.length} bytes)`);
      await fsPromises.writeFile(vectorFilePath, vectorContent);
      log('info', '[VectorDB] Vector file written successfully.');

      // 4. Emit save event
      this.emit('db:save', {
        path: this.dbPath || 'DB path not set',
        count: this.memoryStorage.size,
      });
      log('info', '[VectorDB] Save event emitted successfully.');
    } catch (error) {
      log('error', `Error saving database to ${this.dbPath}:`, error);
      throw error; // Re-throw to indicate failure
    }
  }

  /**
   * Replaces the in-memory contents with the database persisted at dbPath and
   * emits 'db:load'.
   *
   * Returns without changes when the files do not exist (new database). The
   * files are fully read and decoded before any in-memory state is touched, so
   * a failed load leaves the current contents intact. Index entries whose
   * offset/length fall outside the vector file are logged and skipped.
   *
   * @throws Error if no path is set, the database is closed, the format version
   *   is unsupported, or reading/decoding fails for a reason other than ENOENT.
   */
  async load(): Promise<void> {
    if (!this.dbPath) {
      throw new Error('Database path not specified for loading.');
    }
    if (this.isClosed) {
      throw new Error('Cannot load into a closed database.');
    }
    const metaFilePath = this._getMetaFilePath();
    const vectorFilePath = this._getVectorFilePath();
    // Check if files exist first to avoid unnecessary error logging for new databases
    const metaExists = existsSync(metaFilePath);
    const vecExists = existsSync(vectorFilePath);
    if (!metaExists && !vecExists) {
      log('info', `[VectorDB] Database files not found at ${this.dbPath}. Starting new database.`);
      return;
    }
    try {
      // 1. Read and parse metadata file
      let metaContentBuffer = await fsPromises.readFile(metaFilePath);
      if (this.useCompression) {
        metaContentBuffer = await gunzip(metaContentBuffer);
      }
      const saveData = JSON.parse(metaContentBuffer.toString('utf8'));
      if (saveData.version !== 1) {
        throw new Error(`Unsupported database format version: ${saveData.version}`);
      }

      // 2. Read vector data file
      let vectorDataBuffer = await fsPromises.readFile(vectorFilePath);
      if (this.useCompression) {
        vectorDataBuffer = await gunzip(vectorDataBuffer);
      }

      // 3. Decode everything into staging maps first, so an error part-way
      //    through cannot leave the database half-populated.
      const stagedVectors = new Map<number | string, Float32Array>();
      const stagedMetadata = new Map<number | string, Record<string, any>>();
      for (const vecInfo of saveData.vectors) {
        const { id, offset, length, dim } = vecInfo;
        if (offset < 0 || offset + length > vectorDataBuffer.length) {
          log('error', `Invalid offset/length for vector ${id}. Offset: ${offset}, Length: ${length}, Buffer Size: ${vectorDataBuffer.length}`);
          continue; // Skip corrupted entry
        }
        const vectorSlice = vectorDataBuffer.subarray(offset, offset + length);
        stagedVectors.set(id, deserializeVector(vectorSlice, dim));
        // Metadata was saved under string keys.
        const meta = saveData.metadata?.[String(id)];
        if (meta) {
          stagedMetadata.set(id, meta);
        }
      }

      // 4. Commit: replace the current contents (same Map instances are kept).
      this.memoryStorage.clear();
      this.metadata.clear();
      this.vectorDimensions.clear();
      this.defaultVectorSize = saveData.defaultVectorSize ?? null;
      this.idCounter = saveData.idCounter ?? 1; // Use saved counter or default
      for (const [id, vector] of stagedVectors) {
        this.memoryStorage.set(id, vector);
        this.vectorDimensions.set(id, vector.length);
      }
      for (const [id, meta] of stagedMetadata) {
        this.metadata.set(id, meta);
      }

      this.emit('db:load', {
        path: this.dbPath,
        count: this.memoryStorage.size,
      });
      log('info', `[VectorDB] Loaded ${this.memoryStorage.size} vectors from ${this.dbPath}`);
    } catch (error: any) {
      if (error.code === 'ENOENT') {
        // A missing file is treated like a new database, not as a failure.
        log('info', `Database files not found at ${this.dbPath}. Starting new database.`);
        return;
      }
      log('error', `Error loading database from ${this.dbPath}:`, error);
      throw error;
    }
  }

  // --- Stats and Lifecycle ---

  /**
   * Returns counts, the per-dimension distribution and a rough memory estimate.
   * The `clusters` section is an empty placeholder in this base class.
   */
  getStats(): DBStats {
    const dimensionCounts: Record<number, number> = {};
    for (const dim of this.vectorDimensions.values()) {
      dimensionCounts[dim] = (dimensionCounts[dim] || 0) + 1;
    }
    const baseStats: DBStats = {
      vectorCount: this.memoryStorage.size,
      vectorSize: this.vectorSize(),
      defaultVectorSize: this.defaultVectorSize ?? 0, // Use 0 if null
      metadataCount: this.metadata.size,
      dimensions: {
        counts: dimensionCounts,
        unique: Object.keys(dimensionCounts).length,
      },
      memoryUsage: this._computeMemoryFootprint(),
      // Placeholder for cluster stats
      clusters: { count: 0, avgSize: 0, distribution: [], dimensions: {} },
    };
    return baseStats;
  }

  /** Rough estimate, in bytes, of the memory held by vectors, metadata and bookkeeping maps. */
  protected _estimateMemoryUsage(): number {
    return this._computeMemoryFootprint();
  }

  /**
   * Shared estimator behind getStats() and _estimateMemoryUsage(): exact byte
   * size of all vectors, plus a crude metadata estimate (JSON length x 2), plus
   * fixed per-entry allowances for the three maps (16, 16 and 8 bytes).
   */
  private _computeMemoryFootprint(): number {
    let vectorMemory = 0;
    this.memoryStorage.forEach((vec) => (vectorMemory += vec.byteLength));
    let metadataMemory = 0;
    try {
      this.metadata.forEach((meta) => (metadataMemory += JSON.stringify(meta).length * 2));
    } catch (e) {
      // e.g. circular metadata; the estimate then covers only what was summed so far.
      log('warn', 'Could not estimate metadata size:', e);
    }
    return (
      vectorMemory +
      metadataMemory +
      this.memoryStorage.size * 16 + // Estimate Map overhead for vectors
      this.metadata.size * 16 + // Estimate Map overhead for metadata
      this.vectorDimensions.size * 8 // Estimate Map overhead for dimensions
    );
  }

  /**
   * Gets a list of metadata entries that match specified criteria.
   *
   * @param criteria Can be:
   * - A string: field name to check for existence
   * - An array of strings: multiple field names to check for existence
   * - An object: key-value pairs where each key must exist and strictly equal the specified value
   * @param values Optional value(s) to match against the field(s) when using string/array input.
   *   If fewer values than fields are given, the remaining fields are only checked for existence.
   * @param options `limit` caps the number of returned entries (applies to every criteria form);
   *   0 or undefined means no limit.
   * @returns Array of {id, metadata} objects (metadata is a shallow copy) for entries that match the criteria
   *
   * @example
   * ```typescript
   * // Get all metadata entries that have a 'source' field
   * const allWithSource = db.getMetadataWithField('source');
   *
   * // Get metadata entries where 'category' equals 'article'
   * const articles = db.getMetadataWithField('category', 'article');
   *
   * // Get entries that have both 'author' and 'title' fields
   * const authoredContent = db.getMetadataWithField(['author', 'title']);
   *
   * // Get entries where 'type' is 'book' AND 'published' is true
   * const publishedBooks = db.getMetadataWithField(['type', 'published'], ['book', true]);
   *
   * // Using object syntax (recommended): type='book' AND published=true
   * const publishedBooks = db.getMetadataWithField({ type: 'book', published: true });
   * ```
   */
  getMetadataWithField(
    criteria: string | string[] | Record<string, any>,
    values?: any | any[],
    options?: { limit?: number } // Optional limit for results
  ): Array<{ id: number | string; metadata: Record<string, any> }> {
    let matches: (meta: Record<string, any>) => boolean;

    if (criteria !== null && typeof criteria === 'object' && !Array.isArray(criteria)) {
      // Object form: every key must exist and strictly equal its value.
      const expected = Object.entries(criteria as Record<string, any>);
      matches = (meta) => expected.every(([field, value]) => field in meta && meta[field] === value);
    } else {
      // Legacy string/array form.
      const fieldArray: string[] = Array.isArray(criteria) ? criteria : [criteria as string];
      const valueArray = values !== undefined ? (Array.isArray(values) ? values : [values]) : undefined;
      if (valueArray !== undefined && valueArray.length !== fieldArray.length) {
        log('warn', 'Values array length does not match fields array length. Some value checks will be ignored.');
      }
      matches = (meta) =>
        fieldArray.every(
          (field, i) => field in meta && (valueArray === undefined || i >= valueArray.length || meta[field] === valueArray[i])
        );
    }

    const limit = options?.limit;
    const hasLimit = typeof limit === 'number' && limit > 0;
    const results: Array<{ id: number | string; metadata: Record<string, any> }> = [];
    for (const [id, meta] of this.metadata.entries()) {
      if (!matches(meta)) {
        continue;
      }
      results.push({ id, metadata: { ...meta } }); // Return a copy of metadata
      if (hasLimit && results.length >= (limit as number)) {
        break; // Enough matches collected; no need to scan the rest.
      }
    }
    return hasLimit ? results.slice(0, limit) : results;
  }

  /** Number of vectors currently stored. */
  getVectorCount(): number {
    return this.memoryStorage.size;
  }

  /**
   * Extract relationships between vectors based on distance.
   * Compares every pair of stored vectors (quadratic in the number of vectors);
   * pairs with different dimensions are skipped with a logged warning.
   *
   * @param threshold - The maximum distance between vectors to consider them related.
   * @param metric - Distance metric to use (e.g., 'cosine', 'euclidean').
   * @returns An array of relationships, where each relationship links two vector IDs, their distance, and shallow copies of their metadata (if any).
   */
  extractRelationships(
    threshold: number,
    metric: DistanceMetric = 'euclidean'
  ): Array<{
    vector1: number | string;
    vector2: number | string;
    distance: number;
    metadata1?: Record<string, any>;
    metadata2?: Record<string, any>;
  }> {
    const relationships: Array<{
      vector1: number | string;
      vector2: number | string;
      distance: number;
      metadata1?: Record<string, any>;
      metadata2?: Record<string, any>;
    }> = [];
    const vectorEntries = Array.from(this.memoryStorage.entries());
    for (let i = 0; i < vectorEntries.length; i++) {
      const [id1, vector1] = vectorEntries[i];
      // j starts after i so each unordered pair is visited exactly once.
      for (let j = i + 1; j < vectorEntries.length; j++) {
        const [id2, vector2] = vectorEntries[j];
        if (vector1.length !== vector2.length) {
          log('warn', `Dimension mismatch between vector ${id1} and ${id2}, skipping.`);
          continue;
        }
        const distance = this._calculateDistance(vector1, vector2, metric);
        if (distance <= threshold) {
          const metadata1 = this.metadata.get(id1);
          const metadata2 = this.metadata.get(id2);
          relationships.push({
            vector1: id1,
            vector2: id2,
            distance,
            metadata1: metadata1 ? { ...metadata1 } : undefined,
            metadata2: metadata2 ? { ...metadata2 } : undefined,
          });
        }
      }
    }
    log('info', `[VectorDB] Extracted ${relationships.length} relationships.`);
    return relationships;
  }

  /**
   * Extract communities of related vectors based on distance threshold.
   * A community is a connected component of the graph whose edges join vectors
   * (of equal dimension) at distance <= threshold; singletons are omitted.
   *
   * @param threshold - The maximum distance between vectors to consider them related
   * @param metric - Distance metric to use (e.g., 'cosine', 'euclidean')
   * @returns Array of communities, where each community is an array of related vector information
   */
  extractCommunities(
    threshold: number,
    metric: DistanceMetric = 'euclidean'
  ): Array<Array<{
    id: number | string;
    metadata?: Record<string, any>;
  }>> {
    log('info', `[VectorDB] Extracting vector communities with threshold ${threshold}...`);
    // Build an undirected graph: one node per vector, an edge per related pair.
    const graph = new Map<number | string, Set<number | string>>();
    const vectorEntries = Array.from(this.memoryStorage.entries());
    for (const [id] of vectorEntries) {
      graph.set(id, new Set());
    }
    for (let i = 0; i < vectorEntries.length; i++) {
      const [id1, vector1] = vectorEntries[i];
      for (let j = i + 1; j < vectorEntries.length; j++) {
        const [id2, vector2] = vectorEntries[j];
        if (vector1.length !== vector2.length) {
          continue;
        }
        const distance = this._calculateDistance(vector1, vector2, metric);
        if (distance <= threshold) {
          graph.get(id1)?.add(id2);
          graph.get(id2)?.add(id1);
        }
      }
    }

    // Connected components via depth-first search. An explicit stack is used
    // instead of recursion so large communities cannot overflow the call stack.
    const visited = new Set<number | string>();
    const communities: Array<Array<{
      id: number | string;
      metadata?: Record<string, any>;
    }>> = [];
    for (const startId of graph.keys()) {
      if (visited.has(startId)) {
        continue;
      }
      const community: Array<{
        id: number | string;
        metadata?: Record<string, any>;
      }> = [];
      const stack: Array<number | string> = [startId];
      while (stack.length > 0) {
        const nodeId = stack.pop();
        // A node may be pushed more than once before it is first visited.
        if (nodeId === undefined || visited.has(nodeId)) {
          continue;
        }
        visited.add(nodeId);
        const metadata = this.metadata.get(nodeId);
        community.push({
          id: nodeId,
          metadata: metadata ? { ...metadata } : undefined,
        });
        // Push neighbours in reverse so they are popped in insertion order,
        // giving the same visit order as a recursive pre-order DFS.
        const neighbors = Array.from(graph.get(nodeId) ?? []);
        for (let n = neighbors.length - 1; n >= 0; n--) {
          if (!visited.has(neighbors[n])) {
            stack.push(neighbors[n]);
          }
        }
      }
      // Only include communities with at least 2 vectors
      if (community.length > 1) {
        communities.push(community);
      }
    }
    log('info', `[VectorDB] Found ${communities.length} communities`);
    return communities;
  }

  /**
   * Closes the database: performs a final save when a path is set, then clears
   * all in-memory data and emits 'db:close'. A save failure is logged, not
   * thrown. Calling close() on an already closed database is a no-op.
   */
  async close(): Promise<void> {
    if (this.isClosed) return;
    this.isClosed = true; // Mark as closed immediately
    this.isClosing = true; // ...but let our own final save() through
    try {
      if (this.dbPath) {
        if (this.savePromise) {
          // Let an in-flight save settle first (its own caller handles its error),
          // then save again so changes made after it started are persisted too.
          await this.savePromise.catch(() => undefined);
        }
        await this.save();
      }
    } catch (error) {
      log('error', 'Error saving database during close:', error);
    } finally {
      this.isClosing = false;
      // Clear memory regardless of save success
      this.memoryStorage.clear();
      this.metadata.clear();
      this.vectorDimensions.clear();
      this.emit('db:close', {});
      log('info', 'Database closed.');
    }
  }
}
// --- END OF FILE vector_db.ts ---