UNPKG

eizen

Version:

Vector database engine for ArchiveNET

698 lines (696 loc) 27 kB
import { SetSDK } from "hollowdb"; import { JWKInterface, Warp } from "warp-contracts"; //#region src/types/index.d.ts /** * Type definitions for HNSW (Hierarchical Navigable Small World) implementation * * These types define the core data structures used throughout the HNSW algorithm. */ /** * A point in high-dimensional space, represented as an array of numbers. * * Each number represents the coordinate value in one dimension. * All points in an HNSW index should have the same dimensionality. * * @example * ```typescript * // 3-dimensional point * const point: Point = [0.1, 0.5, -0.3]; * * // High-dimensional embedding (e.g., from text or image) * const embedding: Point = [0.12, -0.34, 0.56, 0.78, ...]; // 512 dimensions * ``` */ type Point = number[]; /** * Represents the graph structure for a single layer in the HNSW index. * * Maps point indices to their neighbor information (LayerNode). * Each key is a point index, and each value contains that point's connections. * * @example * ```typescript * // Layer 1 graph with 3 points * const layer1: Graph = { * 5: { 10: 0.2, 15: 0.3 }, // Point 5 connects to points 10 and 15 * 10: { 5: 0.2, 15: 0.1 }, // Point 10 connects to points 5 and 15 * 15: { 5: 0.3, 10: 0.1 } // Point 15 connects to points 5 and 10 * }; * ``` */ type Graph = Record<number, LayerNode>; /** * Represents all neighbors of a single point in a layer. * * Maps neighbor point indices to their distances from this point. * The distances are used for efficient neighbor traversal during search. * * @example * ```typescript * // Point connects to three neighbors with their respective distances * const neighbors: LayerNode = { * 42: 0.15, // Point 42 is distance 0.15 away * 18: 0.23, // Point 18 is distance 0.23 away * 7: 0.31 // Point 7 is distance 0.31 away * }; * ``` */ type LayerNode = Record<number, number>; /** * A tuple representing a point with its distance from a query. * * Used throughout the search algorithms to track candidates and results. 
* The first element is the distance, the second is the point's index. * This format allows efficient sorting by distance. * * @example * ```typescript * // Point 25 is distance 0.42 from the query * const node: Node = [0.42, 25]; * * // Array of nodes sorted by distance (closest first) * const candidates: Node[] = [ * [0.12, 5], // Point 5 is closest * [0.18, 12], // Point 12 is second closest * [0.25, 8] // Point 8 is third closest * ]; * ``` */ type Node = [distance: number, id: number]; /** * Result object returned by k-nearest neighbor search. * * Contains the point index, its distance from the query, and any associated metadata. * The metadata can be any type (specified via the generic parameter M). * * @template M The type of metadata associated with points (e.g., string, object, etc.) * * @example * ```typescript * // Results from searching for documents * type DocMetadata = { filename: string; category: string }; * const results: KNNResult<DocMetadata>[] = [ * { * id: 42, * distance: 0.15, * metadata: { filename: 'research.pdf', category: 'science' } * }, * { * id: 18, * distance: 0.23, * metadata: null // No metadata for this point * } * ]; * ``` */ type KNNResult<M = unknown> = { /** The unique index/ID of the point in the HNSW index */ id: number; /** The distance from the query point (lower = more similar) */ distance: number; /** Optional metadata associated with this point */ metadata: M | null; }; //#endregion //#region src/db/interfaces/index.d.ts /** * Database interface for HNSW (Hierarchical Navigable Small World) implementation. * * This interface abstracts the storage layer for the HNSW algorithm, allowing * different backends (in-memory, file-based, database, etc.) to be used. * * @template M - Type for point metadata (optional, defaults to unknown) */ interface DBInterface<M = unknown> { /** * Initializes a new layer in the HNSW graph structure. * Creates an empty neighbor map for a point at the specified index. 
*/ new_neighbor(idx: number): Promise<void>; /** * Retrieves all neighbors of a specific node in a given layer. */ get_neighbor(layer: number, idx: number): Promise<LayerNode>; /** * Batch retrieval of neighbors for multiple nodes in a layer. * More efficient than multiple individual get_neighbor calls. */ get_neighbors(layer: number, idxs: number[]): Promise<Graph>; /** * Updates or inserts neighbor connections for a node. * Creates the connection if it doesn't exist, updates if it does. */ upsert_neighbor(layer: number, idx: number, node: LayerNode): Promise<void>; /** * Batch update/insert of neighbor connections. * More efficient than multiple individual upsert_neighbor calls. */ upsert_neighbors(layer: number, nodes: Graph): Promise<void>; /** * Returns the total number of layers in the HNSW structure. * Each layer represents a different level of the hierarchical graph. */ get_num_layers(): Promise<number>; /** * Adds a new vector point to the database. * Points are assigned sequential indices starting from 0. * * @returns The assigned index for the new point */ new_point(q: Point): Promise<number>; /** * Retrieves a single point by its index. */ get_point(idx: number): Promise<Point>; /** * Batch retrieval of multiple points. * * @throws Error if any point doesn't exist at the given indices */ get_points(idxs: number[]): Promise<Point[]>; /** * Returns the total number of points stored in the database. * Equivalent to the next index that would be assigned to a new point. */ get_datasize(): Promise<number>; /** * Gets the index of the current entry point for HNSW search. * The entry point is typically the node in the highest layer. * * @returns Entry point index, or null if no points have been added */ get_ep(): Promise<number | null>; /** * Sets the entry point for HNSW search operations. * This should be a node that exists in the highest layer. */ set_ep(ep: number): Promise<void>; /** * Retrieves application-specific metadata for a point. 
* Metadata can be any additional information associated with a vector. * * @returns Metadata object or null if no metadata exists */ get_metadata(idx: number): Promise<M | null>; /** * Batch retrieval of metadata for multiple points. * Returns array with same length as input, with null for missing metadata. */ get_metadatas(idxs: number[]): Promise<(M | null)[]>; /** * Associates metadata with a point. * Overwrites existing metadata if present. */ set_metadata(idx: number, data: M): Promise<void>; } //#endregion //#region src/hnsw.d.ts /** * Hierarchical Navigable Small Worlds (HNSW) Implementation * * HNSW is a graph-based algorithm for approximate nearest neighbor search in high-dimensional spaces. * It builds a multi-layer graph structure where: * - Layer 0 contains all points and forms the base layer * - Higher layers contain progressively fewer points (sampled probabilistically) * - Each layer maintains connections between nearby points * * Key Concepts: * - **Multi-layer structure**: Higher layers enable long-range navigation, lower layers provide precision * - **Entry point**: A single node in the top layer that serves as the starting point for all searches * - **Greedy search**: Navigate by always moving to the closest unvisited neighbor * - **Layer selection**: New points are assigned to layers using an exponential decay probability * * Algorithm Benefits: * - Logarithmic search complexity: O(log N) for both search and insertion * - High recall: Can find very good approximate nearest neighbors * - Scalable: Works well with millions of high-dimensional vectors * * This implementation works over a key-value database interface, allowing different storage backends. * * @template M Type of the metadata attached to each point (e.g., document IDs, labels, etc.) 
* * @see https://arxiv.org/pdf/1603.09320.pdf Original HNSW paper by Malkov & Yashunin */ declare class HNSW<M = unknown> { /** Database interface for storing points, graph connections, and metadata */ db: DBInterface<M>; /** * Maximum number of bi-directional links for each node during construction. * This parameter controls the connectivity of the graph: * - Higher values = better search quality but slower construction and more memory * - Lower values = faster construction but potentially worse search quality * - Paper suggests m ∈ [5, 48], with 16 being a good default * - Weaviate uses 64 for high-dimensional data */ m: number; /** * Maximum number of connections for layer 0 (base layer). * Set to 2 * m as recommended in the paper. * Layer 0 can have more connections since it contains all points. */ m_max0: number; /** * Normalization factor for level generation probability. * Used in the exponential decay formula: level = floor(-ln(uniform(0,1)) * ml) * Set to 1/ln(m) as per the paper's heuristic. */ ml: number; /** * Size of the dynamic candidate list during construction. * Controls the search scope when finding neighbors for new points: * - Higher values = better graph quality but slower construction * - Lower values = faster construction but potentially worse search quality * - Common values: 40 (fast), 100 (balanced), 400 (high quality) */ ef_construction: number; /** * Size of the dynamic candidate list during search. * Controls the search scope when performing kNN queries: * - Higher values = better recall but slower search * - Lower values = faster search but potentially lower recall * - Should be >= k (number of neighbors to return) * - Can be adjusted at search time for different speed/quality tradeoffs */ ef: number; /** * Constructs a new HNSW index with the specified parameters. 
* * @param db Database interface for persistence (must implement DBInterface) * @param M Maximum number of connections per node (recommended: 16, range: [5-48]) * @param ef_construction Size of candidate list during construction (recommended: 200) * @param ef_search Size of candidate list during search (recommended: 50, must be >= k) * * @example * ```typescript * const hnsw = new HNSW( * database, // Your database implementation * 16, // M: good balance of speed/quality * 200, // ef_construction: high quality graph * 50 // ef_search: good search performance * ); * ``` */ constructor(db: DBInterface<M>, M: number, ef_construction: number, ef_search: number); /** * Retrieves a vector and its associated metadata by index. * * This is a convenience method that fetches both the vector data and any * metadata stored with it in a single operation. * * @param idx The index of the vector to retrieve * @returns Object containing the vector data and metadata (null if no metadata exists) * * @example * ```typescript * const result = await hnsw.get_vector(42); * console.log('Vector:', result.point); // [0.1, 0.2, 0.3, ...] * console.log('Metadata:', result.metadata); // { filename: 'doc.pdf', category: 'research' } * ``` */ get_vector(idx: number): Promise<{ point: Point; metadata: M | null; }>; /** * Selects which layer a new point should be inserted into. * * Uses an exponential decay probability distribution as recommended in the paper. * Most points will be inserted into layer 0, with progressively fewer points * in higher layers. This creates the hierarchical structure that makes HNSW efficient. * * The probability of a point being in layer L or higher is: (1/2)^L * This means: * - ~50% of points are only in layer 0 * - ~25% of points reach layer 1 or higher * - ~12.5% of points reach layer 2 or higher * - etc. 
* * @returns The layer number (0-based) where the new point should be inserted * * @example * ```typescript * const layer = hnsw.select_layer(); // Returns 0, 1, 2, 3, ... with decreasing probability * ``` */ select_layer(): number; /** * Inserts a new point into the HNSW index. * * This is the core method that implements Algorithm 1 from the HNSW paper. * The insertion process works in several phases: * * 1. **Layer Selection**: Randomly determine which layer the new point belongs to * 2. **Entry Point Search**: Navigate from top layer down to find the best entry point * 3. **Layer-by-layer Insertion**: Insert the point into each layer from selected layer down to 0 * 4. **Neighbor Selection**: For each layer, find the best neighbors and create bidirectional links * 5. **Pruning**: Ensure no node has too many connections by removing the worst ones * * Time Complexity: O(log N) expected, where N is the number of points * Space Complexity: O(M * N) where M is the average number of connections per point * * @param q The vector to insert (array of numbers representing the point in space) * @param metadata Optional metadata to associate with this point (e.g., document ID, labels) * * @example * ```typescript * // Insert a simple vector * await hnsw.insert([0.1, 0.2, 0.3, 0.4]); * * // Insert a vector with metadata * await hnsw.insert( * [0.1, 0.2, 0.3, 0.4], * { filename: 'document.pdf', category: 'research' } * ); * ``` * * @see https://arxiv.org/pdf/1603.09320.pdf Algorithm 1 (page 7) */ insert(q: Point, metadata?: M): Promise<void>; /** * Performs a greedy search within a single layer of the HNSW graph. * * This implements Algorithm 2 from the HNSW paper and is the core search primitive * used by both insertion and query operations. 
The algorithm uses a best-first search * strategy with two priority queues: * * - **Candidates (C)**: Min-heap of points to explore next (closest first) * - **Dynamic list (W)**: Max-heap of found neighbors (furthest first, for easy removal) * * The search expands outward from the entry points, visiting the closest unvisited * neighbors first, until either: * - No more promising candidates remain (all remaining candidates are further than current furthest result) * - The desired number of neighbors (ef) has been found * * @param q The query point to search for * @param ep Array of entry points to start the search from (typically 1 point, but can be multiple) * @param ef Maximum number of neighbors to return (controls search scope vs speed) * @param l_c The layer to search in (0 = base layer with all points, higher = sparser layers) * * @returns Array of [distance, point_id] pairs representing the closest neighbors found * * @example * ```typescript * // Search for 5 closest points starting from entry point 42 in layer 0 * const entryPoints = [[0.5, 42]]; // [distance_to_query, point_id] * const neighbors = await hnsw.search_layer(queryVector, entryPoints, 5, 0); * // Returns: [[0.1, 15], [0.2, 23], [0.3, 8], [0.4, 31], [0.5, 42]] * ``` * * @see https://arxiv.org/pdf/1603.09320.pdf Algorithm 2 (page 8) */ search_layer(q: Point, ep: Node[], ef: number, l_c: number): Promise<Node[]>; /** * Selects the best neighbors from a candidate set using a simple heuristic. * * This implements Algorithm 4 from the HNSW paper (Simple heuristic for selecting neighbors). * The goal is to select diverse, high-quality connections that: * 1. Are close to the query point * 2. Provide good graph connectivity * 3. Don't create redundant paths * * The algorithm works by: * 1. Always preferring closer neighbors first (greedy selection) * 2. Optionally keeping some pruned connections to maintain graph connectivity * * This is a simplified version of the neighbor selection heuristic. 
The paper also * describes a more complex "extended heuristic" (Algorithm 4*) that considers * the distance between candidates to avoid clustering, but this implementation * uses the simpler approach for performance. * * @param q The query point (either a new point being inserted or existing point being pruned) * @param C Candidate neighbors with their distances: [distance, point_id] * @param l_c Current layer (affects maximum number of connections allowed) * @param keepPrunedConnections Whether to fill remaining slots with pruned candidates (recommended: true) * * @returns Array of selected neighbors, up to M (or M_max0 for layer 0) neighbors * * @example * ```typescript * // Select best neighbors for a point in layer 1 * const candidates = [[0.1, 5], [0.2, 10], [0.15, 8], [0.3, 15]]; * const selected = hnsw.select_neighbors(queryPoint, candidates, 1, true); * // Returns: [[0.1, 5], [0.15, 8]] (assuming M=2) * ``` * * @see https://arxiv.org/pdf/1603.09320.pdf Algorithm 4 (page 9) */ select_neighbors(q: Point, C: Node[], l_c: number, keepPrunedConnections?: boolean): Node[]; /** * Performs k-nearest neighbor search to find the closest points to a query. * * This implements Algorithm 5 from the HNSW paper and is the main query interface. * The search works in two phases: * * 1. **Routing Phase**: Navigate from top layer down to layer 1 using greedy search * with ef=1 to quickly find a good entry point in the base layer * * 2. 
**Search Phase**: Perform a more thorough search in layer 0 (base layer) * using the configured ef parameter to find the k best neighbors * * The multi-layer approach provides logarithmic search complexity because: * - Higher layers have fewer points but longer connections (for fast routing) * - Lower layers have more points but shorter connections (for precise search) * * Time Complexity: O(log N) expected, where N is the number of points * * @param q The query vector to search for * @param K Number of nearest neighbors to return * * @returns Array of KNNResult objects containing id, distance, and metadata for each neighbor, * sorted by distance (closest first). Returns empty array if no points in index. * * @example * ```typescript * // Find 5 most similar vectors to query * const results = await hnsw.knn_search([0.1, 0.2, 0.3, 0.4], 5); * * // Results format: * // [ * // { id: 42, distance: 0.1, metadata: { filename: 'doc1.pdf' } }, * // { id: 15, distance: 0.2, metadata: { filename: 'doc2.pdf' } }, * // { id: 8, distance: 0.25, metadata: null }, * // ... * // ] * * for (const result of results) { * console.log(`Point ${result.id} with distance ${result.distance}`); * if (result.metadata) { * console.log(` Metadata:`, result.metadata); * } * } * ``` * * @see https://arxiv.org/pdf/1603.09320.pdf Algorithm 5 (page 10) */ knn_search(q: Point, K: number): Promise<KNNResult<M>[]>; } /** * HNSW Usage Guide and Performance Tips * =================================== * * ## Basic Usage Pattern * * ```typescript * // 1. Initialize with your database and parameters * const hnsw = new HNSW(database, 16, 200, 50); * * // 2. Insert vectors with optional metadata * await hnsw.insert([0.1, 0.2, 0.3], { type: 'document', id: 'doc1' }); * await hnsw.insert([0.4, 0.5, 0.6], { type: 'image', id: 'img1' }); * * // 3. 
Search for similar vectors * const results = await hnsw.knn_search([0.15, 0.25, 0.35], 5); * ``` * * ## Parameter Tuning Guide * * ### Construction Parameters (set once, affect build quality): * * **M (connections per node)**: * - Low values (4-8): Fast insertion, uses less memory, lower search quality * - Medium values (12-16): Good balance for most use cases * - High values (32-64): Slower insertion, more memory, better search quality * - Rule of thumb: Increase M for higher-dimensional data * * **ef_construction (candidate list size during building)**: * - Low values (40-100): Fast building, lower graph quality * - Medium values (100-400): Good balance for most use cases * - High values (400+): Slow building, high graph quality * - Should be >= M and preferably >= 2*M * * ### Search Parameters (can be adjusted per query): * * **ef (candidate list size during search)**: * - Must be >= K (number of results requested) * - Higher values: Better recall (finds more true neighbors) but slower search * - Lower values: Faster search but potentially misses some true neighbors * - Typical range: K to 10*K depending on quality requirements * * ## Performance Characteristics * * **Time Complexity**: * - Insertion: O(log N) expected * - Search: O(log N) expected * - Memory: O(M * N) where M is avg connections per point * * **Scalability**: * - Works efficiently with millions of vectors * - Performance degrades gracefully with dimension (unlike tree-based methods) * - Parallelizable: Multiple insertions/searches can run concurrently * * ## Best Practices * * 1. **Normalize your vectors** if using cosine distance * 2. **Insert in batches** for better database performance * 3. **Start with conservative parameters** and tune based on your data * 4. **Monitor recall vs latency** tradeoffs during parameter tuning * 5. 
**Use metadata** to filter results post-search if needed * * ## Common Issues & Solutions * * **Poor search quality**: * - Increase M (more connections per node) * - Increase ef_construction (better graph during building) * - Increase ef during search (larger candidate list) * * **Slow insertions**: * - Decrease ef_construction * - Decrease M * - Use batch insertions if supported by your database * * **High memory usage**: * - Decrease M (fewer connections per node) * - Consider dimensionality reduction techniques * - Use quantization if precision allows * * **Slow searches**: * - Decrease ef (smaller candidate list) * - Ensure your database is optimized for batch reads * - Consider caching frequently accessed vectors */ //#endregion //#region src/index.d.ts /** * Compatibility SDK for legacy vector storage contracts. * * Some legacy contracts use a custom function called `upsertVectorMulti` instead of the * standard `setMany` operation. This class provides backwards compatibility by mapping * the standard operations to the legacy function names. * * @example * ```typescript * const compatSdk = new EizenCompatSDK(contractInstance); * await compatSdk.setMany(['key1', 'key2'], ['value1', 'value2']); * ``` */ declare class EizenCompatSDK extends SetSDK<string> { /** * Stores multiple key-value pairs using the legacy contract interface. * * @param keys Array of keys to store * @param values Array of corresponding values to store * @throws {Error} If keys and values arrays have different lengths */ setMany(keys: string[], values: string[]): Promise<void>; /** * Stores a single key-value pair using the legacy contract interface. * * @param key The key to store * @param value The value to store */ set(key: string, value: string): Promise<void>; } /** * Main HNSW Vector Database implementation for Arweave. * * This class provides a complete vector database solution that combines HNSW * algorithm with persistent storage on Arweave blockchain. 
It supports: * - High-dimensional vector storage and similarity search * - Metadata attachment to vectors * - Persistent storage via blockchain contracts * - Protobuf encoding for efficient serialization * * @template M Type of metadata associated with each vector * * @example * ```typescript * // Initialize with default parameters * const vectorDb = new EizenDbVector(contractSDK); * * // Insert vectors with metadata * await vectorDb.insert([0.1, 0.2, 0.3], { type: 'document', id: 'doc1' }); * * // Search for similar vectors * const results = await vectorDb.knn_search([0.15, 0.25, 0.35], 5); * ``` */ declare class EizenDbVector<M = unknown> extends HNSW<M> { /** Database SDK instance for persistent storage operations */ sdk: SetSDK<string>; /** * Creates a new HNSW vector database instance. * * @param contractSDK A blockchain contract SDK instance with `set` and `setMany` operations * - Vectors are encoded using protobuf and stored as base64 strings * - Metadata is stored as JSON-stringified values * - For legacy contracts using `upsertVectorMulti`, use `EizenCompatSDK` * * @param options Optional HNSW algorithm parameters: * - `m`: Maximum connections per node (default: 5, range: 5-48, higher for better quality) * - `efConstruction`: Build-time candidate list size (default: 128, higher for better graph quality) * - `efSearch`: Search-time candidate list size (default: 20, higher for better recall) * * @template M Type of metadata associated with each vector * * @example * ```typescript * // Basic usage with default parameters * const vectorDb = new EizenDbVector(contractSDK); * * // High-quality configuration for production * const vectorDb = new EizenDbVector(contractSDK, { * m: 16, // More connections for better quality * efConstruction: 200, // Better graph construction * efSearch: 50 // Better search recall * }); * ``` */ constructor(contractSDK: SetSDK<string>, options?: { /** Maximum number of bidirectional connections per node (default: 5) */ m?: number; 
/** Size of candidate list during graph construction (default: 128) */ efConstruction?: number; /** Size of candidate list during search (default: 20) */ efSearch?: number; }); /** * Deploys a new vector storage contract on Arweave. * * Creates a blockchain contract with vector storage capabilities including * `set` and `setMany` functions for persistent data storage. * * @param wallet User's/ our Arweave wallet for contract deployment * @param warp A Warp instance connected to mainnet * @returns Object containing the deployed contract transaction ID and source transaction ID * * @throws {Error} If Warp is not connected to mainnet * * @example * ```typescript * import { readFileSync } from "fs"; * * const wallet = JSON.parse(readFileSync("./path/to/wallet.json", "utf-8")); * const warp = WarpFactory.forMainnet(); * * const { contractTxId, srcTxId } = await EizenDbVector.deploy(wallet, warp); * console.log(`Contract deployed: ${contractTxId}`); * ``` */ static deploy(wallet: JWKInterface, warp: Warp): Promise<{ contractTxId: string; srcTxId: string; }>; } //#endregion export { EizenCompatSDK, EizenDbVector }; //# sourceMappingURL=index.d.cts.map