/*
 * eizen — Vector database engine for ArchiveNET.
 * Bundled TypeScript declarations (text/typescript, ~698 lines / 27 kB).
 */
import { SetSDK } from "hollowdb";
import { JWKInterface, Warp } from "warp-contracts";
//#region src/types/index.d.ts
/**
* Type definitions for HNSW (Hierarchical Navigable Small World) implementation
*
* These types define the core data structures used throughout the HNSW algorithm.
*/
/**
 * A point in high-dimensional space, represented as an array of numbers.
 *
 * Each number is the coordinate value along one dimension.
 * All points in an HNSW index should have the same dimensionality.
 * Note that this is NOT enforced by the type system — callers are
 * responsible for keeping vector lengths consistent across the index.
 *
 * @example
 * ```typescript
 * // 3-dimensional point
 * const point: Point = [0.1, 0.5, -0.3];
 *
 * // High-dimensional embedding (e.g., from text or image)
 * const embedding: Point = [0.12, -0.34, 0.56, 0.78, ...]; // 512 dimensions
 * ```
 */
type Point = number[];
/**
 * Represents the graph structure for a single layer in the HNSW index.
 *
 * Maps point indices to their neighbor information (LayerNode).
 * Each key is a point index (as assigned by `DBInterface.new_point`), and
 * each value contains that point's connections within this layer.
 * HNSW links are intended to be bidirectional, but symmetry is maintained
 * by the algorithm, not by this type.
 *
 * @example
 * ```typescript
 * // Layer 1 graph with 3 points
 * const layer1: Graph = {
 * 5: { 10: 0.2, 15: 0.3 }, // Point 5 connects to points 10 and 15
 * 10: { 5: 0.2, 15: 0.1 }, // Point 10 connects to points 5 and 15
 * 15: { 5: 0.3, 10: 0.1 } // Point 15 connects to points 5 and 10
 * };
 * ```
 */
type Graph = Record<number, LayerNode>;
/**
 * Represents all neighbors of a single point in a layer.
 *
 * Maps neighbor point indices to their distances from this point.
 * Caching the distance alongside the link avoids recomputing it during
 * neighbor traversal in search and pruning.
 *
 * @example
 * ```typescript
 * // Point connects to three neighbors with their respective distances
 * const neighbors: LayerNode = {
 * 42: 0.15, // Point 42 is distance 0.15 away
 * 18: 0.23, // Point 18 is distance 0.23 away
 * 7: 0.31 // Point 7 is distance 0.31 away
 * };
 * ```
 */
type LayerNode = Record<number, number>;
/**
 * A tuple representing a point with its distance from a query.
 *
 * Used throughout the search algorithms to track candidates and results.
 * The first element is the distance, the second is the point's index.
 * Putting the distance first lets arrays of nodes be sorted by distance
 * with a plain lexicographic comparison.
 *
 * Note: this local alias shadows the global DOM `Node` type within this
 * declaration file; it is unrelated to DOM nodes.
 *
 * @example
 * ```typescript
 * // Point 25 is distance 0.42 from the query
 * const node: Node = [0.42, 25];
 *
 * // Array of nodes sorted by distance (closest first)
 * const candidates: Node[] = [
 * [0.12, 5], // Point 5 is closest
 * [0.18, 12], // Point 12 is second closest
 * [0.25, 8] // Point 8 is third closest
 * ];
 * ```
 */
type Node = [distance: number, id: number];
/**
 * Result object returned by k-nearest neighbor search.
 *
 * Contains the point index, its distance from the query, and any associated metadata.
 * The metadata type is specified via the generic parameter M; `null` means no
 * metadata was stored for that point.
 *
 * @template M The type of metadata associated with points (e.g., string, object, etc.)
 *
 * @example
 * ```typescript
 * // Results from searching for documents
 * type DocMetadata = { filename: string; category: string };
 * const results: KNNResult<DocMetadata>[] = [
 * {
 * id: 42,
 * distance: 0.15,
 * metadata: { filename: 'research.pdf', category: 'science' }
 * },
 * {
 * id: 18,
 * distance: 0.23,
 * metadata: null // No metadata for this point
 * }
 * ];
 * ```
 */
type KNNResult<M = unknown> = {
/** The unique index/ID of the point in the HNSW index */
id: number;
/** The distance from the query point (lower = more similar) */
distance: number;
/** Metadata associated with this point, or null if none was stored */
metadata: M | null;
};
//#endregion
//#region src/db/interfaces/index.d.ts
/**
 * Database interface for HNSW (Hierarchical Navigable Small World) implementation.
 *
 * This interface abstracts the storage layer for the HNSW algorithm, allowing
 * different backends (in-memory, file-based, database, etc.) to be used.
 *
 * Implementations are expected to persist three kinds of state: the per-layer
 * adjacency graphs, the raw vectors, and optional per-point metadata, plus the
 * current entry point and layer count.
 *
 * @template M - Type for point metadata (optional, defaults to unknown)
 */
interface DBInterface<M = unknown> {
/**
 * Initializes a new layer in the HNSW graph structure.
 * Creates an empty neighbor map for a point at the specified index.
 *
 * NOTE(review): the name reads as "add a neighbor", but the contract stated
 * here is "start a new top layer seeded with point `idx` and no connections"
 * — confirm against the implementation.
 */
new_neighbor(idx: number): Promise<void>;
/**
 * Retrieves all neighbors of a specific node in a given layer.
 * The result maps neighbor indices to cached distances.
 */
get_neighbor(layer: number, idx: number): Promise<LayerNode>;
/**
 * Batch retrieval of neighbors for multiple nodes in a layer.
 * More efficient than multiple individual get_neighbor calls.
 * Behavior for indices absent from the layer is implementation-defined
 * (not specified here) — presumably they are omitted from the result;
 * verify against the backend in use.
 */
get_neighbors(layer: number, idxs: number[]): Promise<Graph>;
/**
 * Updates or inserts neighbor connections for a node.
 * Creates the connection if it doesn't exist, updates if it does.
 */
upsert_neighbor(layer: number, idx: number, node: LayerNode): Promise<void>;
/**
 * Batch update/insert of neighbor connections.
 * More efficient than multiple individual upsert_neighbor calls.
 */
upsert_neighbors(layer: number, nodes: Graph): Promise<void>;
/**
 * Returns the total number of layers in the HNSW structure.
 * Each layer represents a different level of the hierarchical graph.
 */
get_num_layers(): Promise<number>;
/**
 * Adds a new vector point to the database.
 * Points are assigned sequential indices starting from 0.
 *
 * @returns The assigned index for the new point
 */
new_point(q: Point): Promise<number>;
/**
 * Retrieves a single point by its index.
 */
get_point(idx: number): Promise<Point>;
/**
 * Batch retrieval of multiple points.
 *
 * @throws Error if any point doesn't exist at the given indices
 */
get_points(idxs: number[]): Promise<Point[]>;
/**
 * Returns the total number of points stored in the database.
 * Equivalent to the next index that would be assigned to a new point.
 */
get_datasize(): Promise<number>;
/**
 * Gets the index of the current entry point for HNSW search.
 * The entry point is typically the node in the highest layer.
 *
 * @returns Entry point index, or null if no points have been added
 */
get_ep(): Promise<number | null>;
/**
 * Sets the entry point for HNSW search operations.
 * This should be a node that exists in the highest layer.
 */
set_ep(ep: number): Promise<void>;
/**
 * Retrieves application-specific metadata for a point.
 * Metadata can be any additional information associated with a vector.
 *
 * @returns Metadata object or null if no metadata exists
 */
get_metadata(idx: number): Promise<M | null>;
/**
 * Batch retrieval of metadata for multiple points.
 * Returns array with same length as input, with null for missing metadata.
 */
get_metadatas(idxs: number[]): Promise<(M | null)[]>;
/**
 * Associates metadata with a point.
 * Overwrites existing metadata if present.
 */
set_metadata(idx: number, data: M): Promise<void>;
}
//#endregion
//#region src/hnsw.d.ts
/**
 * Hierarchical Navigable Small Worlds (HNSW) Implementation
 *
 * HNSW is a graph-based algorithm for approximate nearest neighbor search in high-dimensional spaces.
 * It builds a multi-layer graph structure where:
 * - Layer 0 contains all points and forms the base layer
 * - Higher layers contain progressively fewer points (sampled probabilistically)
 * - Each layer maintains connections between nearby points
 *
 * Key Concepts:
 * - **Multi-layer structure**: Higher layers enable long-range navigation, lower layers provide precision
 * - **Entry point**: A single node in the top layer that serves as the starting point for all searches
 * - **Greedy search**: Navigate by always moving to the closest unvisited neighbor
 * - **Layer selection**: New points are assigned to layers using an exponential decay probability
 *
 * Algorithm Benefits:
 * - Logarithmic search complexity: O(log N) expected for both search and insertion
 * - High recall: Can find very good approximate nearest neighbors
 * - Scalable: Works well with millions of high-dimensional vectors
 *
 * This implementation works over a key-value database interface (DBInterface),
 * allowing different storage backends.
 *
 * @template M Type of the metadata attached to each point (e.g., document IDs, labels, etc.)
 *
 * @see https://arxiv.org/pdf/1603.09320.pdf Original HNSW paper by Malkov & Yashunin
 */
declare class HNSW<M = unknown> {
/** Database interface for storing points, graph connections, and metadata */
db: DBInterface<M>;
/**
 * Maximum number of bi-directional links for each node during construction.
 * This parameter controls the connectivity of the graph:
 * - Higher values = better search quality but slower construction and more memory
 * - Lower values = faster construction but potentially worse search quality
 * - Paper suggests m ∈ [5, 48], with 16 being a good default
 * - Weaviate uses 64 for high-dimensional data
 */
m: number;
/**
 * Maximum number of connections for layer 0 (base layer).
 * Set to 2 * m as recommended in the paper.
 * Layer 0 can have more connections since it contains all points.
 */
m_max0: number;
/**
 * Normalization factor for level generation probability.
 * Used in the exponential decay formula: level = floor(-ln(uniform(0,1)) * ml)
 * Set to 1/ln(m) as per the paper's heuristic.
 */
ml: number;
/**
 * Size of the dynamic candidate list during construction.
 * Controls the search scope when finding neighbors for new points:
 * - Higher values = better graph quality but slower construction
 * - Lower values = faster construction but potentially worse search quality
 * - Common values: 40 (fast), 100 (balanced), 400 (high quality)
 */
ef_construction: number;
/**
 * Size of the dynamic candidate list during search.
 * Controls the search scope when performing kNN queries:
 * - Higher values = better recall but slower search
 * - Lower values = faster search but potentially lower recall
 * - Should be >= k (number of neighbors to return)
 * - Can be adjusted at search time for different speed/quality tradeoffs
 */
ef: number;
/**
 * Constructs a new HNSW index with the specified parameters.
 *
 * @param db Database interface for persistence (must implement DBInterface)
 * @param M Maximum number of connections per node (recommended: 16, range: [5-48])
 * @param ef_construction Size of candidate list during construction (recommended: 200)
 * @param ef_search Size of candidate list during search (recommended: 50, must be >= k)
 *
 * @example
 * ```typescript
 * const hnsw = new HNSW(
 * database, // Your database implementation
 * 16, // M: good balance of speed/quality
 * 200, // ef_construction: high quality graph
 * 50 // ef_search: good search performance
 * );
 * ```
 */
constructor(db: DBInterface<M>, M: number, ef_construction: number, ef_search: number);
/**
 * Retrieves a vector and its associated metadata by index.
 *
 * This is a convenience method that fetches both the vector data and any
 * metadata stored with it in a single operation.
 *
 * @param idx The index of the vector to retrieve
 * @returns Object containing the vector data and metadata (null if no metadata exists)
 *
 * @example
 * ```typescript
 * const result = await hnsw.get_vector(42);
 * console.log('Vector:', result.point); // [0.1, 0.2, 0.3, ...]
 * console.log('Metadata:', result.metadata); // { filename: 'doc.pdf', category: 'research' }
 * ```
 */
get_vector(idx: number): Promise<{
point: Point;
metadata: M | null;
}>;
/**
 * Selects which layer a new point should be inserted into.
 *
 * Uses an exponential decay probability distribution as recommended in the paper.
 * Most points will be inserted into layer 0, with progressively fewer points
 * in higher layers. This creates the hierarchical structure that makes HNSW efficient.
 *
 * With level = floor(-ln(uniform(0,1)) * ml) and ml = 1/ln(m) (see `ml` above),
 * the probability of a point reaching layer L or higher is m^(-L), not (1/2)^L
 * in general. The illustrative figures below hold for m = 2; for other m,
 * replace 1/2 with 1/m:
 * - ~50% of points are only in layer 0
 * - ~25% of points reach layer 1 or higher
 * - ~12.5% of points reach layer 2 or higher
 * - etc.
 *
 * @returns The layer number (0-based) where the new point should be inserted
 *
 * @example
 * ```typescript
 * const layer = hnsw.select_layer(); // Returns 0, 1, 2, 3, ... with decreasing probability
 * ```
 */
select_layer(): number;
/**
 * Inserts a new point into the HNSW index.
 *
 * This is the core method that implements Algorithm 1 from the HNSW paper.
 * The insertion process works in several phases:
 *
 * 1. **Layer Selection**: Randomly determine which layer the new point belongs to
 * 2. **Entry Point Search**: Navigate from top layer down to find the best entry point
 * 3. **Layer-by-layer Insertion**: Insert the point into each layer from selected layer down to 0
 * 4. **Neighbor Selection**: For each layer, find the best neighbors and create bidirectional links
 * 5. **Pruning**: Ensure no node has too many connections by removing the worst ones
 *
 * Time Complexity: O(log N) expected, where N is the number of points
 * Space Complexity: O(M * N) where M is the average number of connections per point
 *
 * @param q The vector to insert (array of numbers representing the point in space)
 * @param metadata Optional metadata to associate with this point (e.g., document ID, labels)
 *
 * @example
 * ```typescript
 * // Insert a simple vector
 * await hnsw.insert([0.1, 0.2, 0.3, 0.4]);
 *
 * // Insert a vector with metadata
 * await hnsw.insert(
 * [0.1, 0.2, 0.3, 0.4],
 * { filename: 'document.pdf', category: 'research' }
 * );
 * ```
 *
 * @see https://arxiv.org/pdf/1603.09320.pdf Algorithm 1 (page 7)
 */
insert(q: Point, metadata?: M): Promise<void>;
/**
 * Performs a greedy search within a single layer of the HNSW graph.
 *
 * This implements Algorithm 2 from the HNSW paper and is the core search primitive
 * used by both insertion and query operations. The algorithm uses a best-first search
 * strategy with two priority queues:
 *
 * - **Candidates (C)**: Min-heap of points to explore next (closest first)
 * - **Dynamic list (W)**: Max-heap of found neighbors (furthest first, for easy removal)
 *
 * The search expands outward from the entry points, visiting the closest unvisited
 * neighbors first, until either:
 * - No more promising candidates remain (all remaining candidates are further than current furthest result)
 * - The desired number of neighbors (ef) has been found
 *
 * @param q The query point to search for
 * @param ep Array of entry points to start the search from (typically 1 point, but can be multiple)
 * @param ef Maximum number of neighbors to return (controls search scope vs speed)
 * @param l_c The layer to search in (0 = base layer with all points, higher = sparser layers)
 *
 * @returns Array of [distance, point_id] pairs representing the closest neighbors found
 *
 * @example
 * ```typescript
 * // Search for 5 closest points starting from entry point 42 in layer 0
 * const entryPoints = [[0.5, 42]]; // [distance_to_query, point_id]
 * const neighbors = await hnsw.search_layer(queryVector, entryPoints, 5, 0);
 * // Returns: [[0.1, 15], [0.2, 23], [0.3, 8], [0.4, 31], [0.5, 42]]
 * ```
 *
 * @see https://arxiv.org/pdf/1603.09320.pdf Algorithm 2 (page 8)
 */
search_layer(q: Point, ep: Node[], ef: number, l_c: number): Promise<Node[]>;
/**
 * Selects the best neighbors from a candidate set using a simple heuristic.
 *
 * This implements Algorithm 4 from the HNSW paper (Simple heuristic for selecting neighbors).
 * The goal is to select diverse, high-quality connections that:
 * 1. Are close to the query point
 * 2. Provide good graph connectivity
 * 3. Don't create redundant paths
 *
 * The algorithm works by:
 * 1. Always preferring closer neighbors first (greedy selection)
 * 2. Optionally keeping some pruned connections to maintain graph connectivity
 *
 * This is a simplified version of the neighbor selection heuristic. The paper also
 * describes a more complex "extended heuristic" (Algorithm 4*) that considers
 * the distance between candidates to avoid clustering, but this implementation
 * uses the simpler approach for performance.
 *
 * @param q The query point (either a new point being inserted or existing point being pruned)
 * @param C Candidate neighbors with their distances: [distance, point_id]
 * @param l_c Current layer (affects maximum number of connections allowed)
 * @param keepPrunedConnections Whether to fill remaining slots with pruned candidates (recommended: true)
 *
 * @returns Array of selected neighbors, up to M (or M_max0 for layer 0) neighbors
 *
 * @example
 * ```typescript
 * // Select best neighbors for a point in layer 1
 * const candidates = [[0.1, 5], [0.2, 10], [0.15, 8], [0.3, 15]];
 * const selected = hnsw.select_neighbors(queryPoint, candidates, 1, true);
 * // Returns: [[0.1, 5], [0.15, 8]] (assuming M=2)
 * ```
 *
 * @see https://arxiv.org/pdf/1603.09320.pdf Algorithm 4 (page 9)
 */
select_neighbors(q: Point, C: Node[], l_c: number, keepPrunedConnections?: boolean): Node[];
/**
 * Performs k-nearest neighbor search to find the closest points to a query.
 *
 * This implements Algorithm 5 from the HNSW paper and is the main query interface.
 * The search works in two phases:
 *
 * 1. **Routing Phase**: Navigate from top layer down to layer 1 using greedy search
 * with ef=1 to quickly find a good entry point in the base layer
 *
 * 2. **Search Phase**: Perform a more thorough search in layer 0 (base layer)
 * using the configured ef parameter to find the k best neighbors
 *
 * The multi-layer approach provides logarithmic search complexity because:
 * - Higher layers have fewer points but longer connections (for fast routing)
 * - Lower layers have more points but shorter connections (for precise search)
 *
 * Time Complexity: O(log N) expected, where N is the number of points
 *
 * @param q The query vector to search for
 * @param K Number of nearest neighbors to return
 *
 * @returns Array of KNNResult objects containing id, distance, and metadata for each neighbor,
 * sorted by distance (closest first). Returns empty array if no points in index.
 *
 * @example
 * ```typescript
 * // Find 5 most similar vectors to query
 * const results = await hnsw.knn_search([0.1, 0.2, 0.3, 0.4], 5);
 *
 * // Results format:
 * // [
 * // { id: 42, distance: 0.1, metadata: { filename: 'doc1.pdf' } },
 * // { id: 15, distance: 0.2, metadata: { filename: 'doc2.pdf' } },
 * // { id: 8, distance: 0.25, metadata: null },
 * // ...
 * // ]
 *
 * for (const result of results) {
 * console.log(`Point ${result.id} with distance ${result.distance}`);
 * if (result.metadata) {
 * console.log(` Metadata:`, result.metadata);
 * }
 * }
 * ```
 *
 * @see https://arxiv.org/pdf/1603.09320.pdf Algorithm 5 (page 10)
 */
knn_search(q: Point, K: number): Promise<KNNResult<M>[]>;
}
/**
* HNSW Usage Guide and Performance Tips
* ===================================
*
* ## Basic Usage Pattern
*
* ```typescript
* // 1. Initialize with your database and parameters
* const hnsw = new HNSW(database, 16, 200, 50);
*
* // 2. Insert vectors with optional metadata
* await hnsw.insert([0.1, 0.2, 0.3], { type: 'document', id: 'doc1' });
* await hnsw.insert([0.4, 0.5, 0.6], { type: 'image', id: 'img1' });
*
* // 3. Search for similar vectors
* const results = await hnsw.knn_search([0.15, 0.25, 0.35], 5);
* ```
*
* ## Parameter Tuning Guide
*
* ### Construction Parameters (set once, affect build quality):
*
* **M (connections per node)**:
* - Low values (4-8): Fast insertion, uses less memory, lower search quality
* - Medium values (12-16): Good balance for most use cases
* - High values (32-64): Slower insertion, more memory, better search quality
* - Rule of thumb: Increase M for higher-dimensional data
*
* **ef_construction (candidate list size during building)**:
* - Low values (40-100): Fast building, lower graph quality
* - Medium values (100-400): Good balance for most use cases
* - High values (400+): Slow building, high graph quality
* - Should be >= M and preferably >= 2*M
*
* ### Search Parameters (can be adjusted per query):
*
* **ef (candidate list size during search)**:
* - Must be >= K (number of results requested)
* - Higher values: Better recall (finds more true neighbors) but slower search
* - Lower values: Faster search but potentially misses some true neighbors
* - Typical range: K to 10*K depending on quality requirements
*
* ## Performance Characteristics
*
* **Time Complexity**:
* - Insertion: O(log N) expected
* - Search: O(log N) expected
* - Memory: O(M * N) where M is avg connections per point
*
* **Scalability**:
* - Works efficiently with millions of vectors
* - Performance degrades gracefully with dimension (unlike tree-based methods)
* - Parallelizable: Multiple insertions/searches can run concurrently
*
* ## Best Practices
*
* 1. **Normalize your vectors** if using cosine distance
* 2. **Insert in batches** for better database performance
* 3. **Start with conservative parameters** and tune based on your data
* 4. **Monitor recall vs latency** tradeoffs during parameter tuning
* 5. **Use metadata** to filter results post-search if needed
*
* ## Common Issues & Solutions
*
* **Poor search quality**:
* - Increase M (more connections per node)
* - Increase ef_construction (better graph during building)
* - Increase ef during search (larger candidate list)
*
* **Slow insertions**:
* - Decrease ef_construction
* - Decrease M
* - Use batch insertions if supported by your database
*
* **High memory usage**:
* - Decrease M (fewer connections per node)
* - Consider dimensionality reduction techniques
* - Use quantization if precision allows
*
* **Slow searches**:
* - Decrease ef (smaller candidate list)
* - Ensure your database is optimized for batch reads
* - Consider caching frequently accessed vectors
*/
//#endregion
//#region src/index.d.ts
/**
 * Compatibility SDK for legacy vector storage contracts.
 *
 * Some legacy contracts use a custom function called `upsertVectorMulti` instead of the
 * standard `setMany` operation. This class provides backwards compatibility by mapping
 * the standard operations to the legacy function names, so the rest of the codebase
 * can keep calling `set`/`setMany` regardless of the contract version.
 *
 * @example
 * ```typescript
 * const compatSdk = new EizenCompatSDK(contractInstance);
 * await compatSdk.setMany(['key1', 'key2'], ['value1', 'value2']);
 * ```
 */
declare class EizenCompatSDK extends SetSDK<string> {
/**
 * Stores multiple key-value pairs using the legacy contract interface
 * (`upsertVectorMulti`). Keys and values are matched by position.
 *
 * @param keys Array of keys to store
 * @param values Array of corresponding values to store
 * @throws {Error} If keys and values arrays have different lengths
 */
setMany(keys: string[], values: string[]): Promise<void>;
/**
 * Stores a single key-value pair using the legacy contract interface.
 *
 * @param key The key to store
 * @param value The value to store
 */
set(key: string, value: string): Promise<void>;
}
/**
 * Main HNSW Vector Database implementation for Arweave.
 *
 * This class provides a complete vector database solution that combines the HNSW
 * algorithm with persistent storage on the Arweave blockchain. It supports:
 * - High-dimensional vector storage and similarity search
 * - Metadata attachment to vectors
 * - Persistent storage via blockchain contracts
 * - Protobuf encoding for efficient serialization
 *
 * @template M Type of metadata associated with each vector
 *
 * @example
 * ```typescript
 * // Initialize with default parameters
 * const vectorDb = new EizenDbVector(contractSDK);
 *
 * // Insert vectors with metadata
 * await vectorDb.insert([0.1, 0.2, 0.3], { type: 'document', id: 'doc1' });
 *
 * // Search for similar vectors
 * const results = await vectorDb.knn_search([0.15, 0.25, 0.35], 5);
 * ```
 */
declare class EizenDbVector<M = unknown> extends HNSW<M> {
/** Database SDK instance for persistent storage operations */
sdk: SetSDK<string>;
/**
 * Creates a new HNSW vector database instance.
 *
 * @param contractSDK A blockchain contract SDK instance with `set` and `setMany` operations
 * - Vectors are encoded using protobuf and stored as base64 strings
 * - Metadata is stored as JSON-stringified values
 * - For legacy contracts using `upsertVectorMulti`, use `EizenCompatSDK`
 *
 * @param options Optional HNSW algorithm parameters:
 * - `m`: Maximum connections per node (default: 5, range: 5-48, higher for better quality)
 * - `efConstruction`: Build-time candidate list size (default: 128, higher for better graph quality)
 * - `efSearch`: Search-time candidate list size (default: 20, higher for better recall)
 *
 * Note that the defaults sit at the conservative (low-cost) end of the tuning
 * ranges documented on {@link HNSW}; raise them for better search quality.
 *
 * @template M Type of metadata associated with each vector
 *
 * @example
 * ```typescript
 * // Basic usage with default parameters
 * const vectorDb = new EizenDbVector(contractSDK);
 *
 * // High-quality configuration for production
 * const vectorDb = new EizenDbVector(contractSDK, {
 * m: 16, // More connections for better quality
 * efConstruction: 200, // Better graph construction
 * efSearch: 50 // Better search recall
 * });
 * ```
 */
constructor(contractSDK: SetSDK<string>, options?: {
/** Maximum number of bidirectional connections per node (default: 5) */
m?: number;
/** Size of candidate list during graph construction (default: 128) */
efConstruction?: number;
/** Size of candidate list during search (default: 20) */
efSearch?: number;
});
/**
 * Deploys a new vector storage contract on Arweave.
 *
 * Creates a blockchain contract with vector storage capabilities including
 * `set` and `setMany` functions for persistent data storage.
 *
 * @param wallet The Arweave wallet (JWK) used to sign the contract deployment
 * @param warp A Warp instance connected to mainnet
 * @returns Object containing the deployed contract transaction ID and source transaction ID
 *
 * @throws {Error} If Warp is not connected to mainnet
 *
 * @example
 * ```typescript
 * import { readFileSync } from "fs";
 *
 * const wallet = JSON.parse(readFileSync("./path/to/wallet.json", "utf-8"));
 * const warp = WarpFactory.forMainnet();
 *
 * const { contractTxId, srcTxId } = await EizenDbVector.deploy(wallet, warp);
 * console.log(`Contract deployed: ${contractTxId}`);
 * ```
 */
static deploy(wallet: JWKInterface, warp: Warp): Promise<{
contractTxId: string;
srcTxId: string;
}>;
}
//#endregion
export { EizenCompatSDK, EizenDbVector };
//# sourceMappingURL=index.d.cts.map