UNPKG

autosnippet

Version:

Extract code patterns into a knowledge base for AI coding assistants

599 lines (598 loc) 20.9 kB
/**
 * HnswIndex — a pure-JavaScript HNSW approximate nearest-neighbor index.
 *
 * Reference paper: "Efficient and robust approximate nearest neighbor search
 * using Hierarchical Navigable Small World graphs" (Malkov & Yashunin, 2018)
 *
 * Features:
 *  - Zero external dependencies, pure JavaScript
 *  - Incremental insertion (no full rebuild required)
 *  - Cosine distance (1 - cosineSimilarity)
 *  - Configurable hyper-parameters (M, efConstruct, efSearch)
 *
 * @module infrastructure/vector/HnswIndex
 */

// ── Heap structures ──

/**
 * Binary min-heap of `{ nodeIdx, dist }` entries ordered by `dist`
 * (smallest distance on top).
 */
class MinHeap {
  #data = [];

  /** Number of entries currently stored. */
  get size() {
    return this.#data.length;
  }

  /** @returns {{nodeIdx: number, dist: number} | null} the top entry, or null when empty */
  peek() {
    return this.#data[0] || null;
  }

  /**
   * Insert an entry.
   * @param {number} nodeIdx
   * @param {number} dist
   */
  push(nodeIdx, dist) {
    this.#data.push({ nodeIdx, dist });
    this.#siftUp(this.#data.length - 1);
  }

  /** @returns {{nodeIdx: number, dist: number} | null} the removed top entry, or null when empty */
  pop() {
    if (this.#data.length === 0) {
      return null;
    }
    const top = this.#data[0];
    const last = this.#data.pop();
    // When the popped tail was not the root itself, move it to the root and restore order.
    if (this.#data.length > 0 && last) {
      this.#data[0] = last;
      this.#siftDown(0);
    }
    return top;
  }

  /** @returns {Array<{nodeIdx: number, dist: number}>} shallow copy in heap (not sorted) order */
  toArray() {
    return [...this.#data];
  }

  #siftUp(i) {
    const data = this.#data;
    while (i > 0) {
      const parent = (i - 1) >> 1;
      if (data[parent].dist <= data[i].dist) {
        break;
      }
      [data[parent], data[i]] = [data[i], data[parent]];
      i = parent;
    }
  }

  #siftDown(i) {
    const data = this.#data;
    const n = data.length;
    while (true) {
      let smallest = i;
      const left = 2 * i + 1;
      const right = 2 * i + 2;
      if (left < n && data[left].dist < data[smallest].dist) {
        smallest = left;
      }
      if (right < n && data[right].dist < data[smallest].dist) {
        smallest = right;
      }
      if (smallest === i) {
        break;
      }
      [data[smallest], data[i]] = [data[i], data[smallest]];
      i = smallest;
    }
  }
}

/**
 * Binary max-heap of `{ nodeIdx, dist }` entries ordered by `dist`
 * (largest distance on top).
 */
class MaxHeap {
  #data = [];

  /** Number of entries currently stored. */
  get size() {
    return this.#data.length;
  }

  /** @returns {{nodeIdx: number, dist: number} | null} the top entry, or null when empty */
  peek() {
    return this.#data[0] || null;
  }

  /**
   * Insert an entry.
   * @param {number} nodeIdx
   * @param {number} dist
   */
  push(nodeIdx, dist) {
    this.#data.push({ nodeIdx, dist });
    this.#siftUp(this.#data.length - 1);
  }

  /** @returns {{nodeIdx: number, dist: number} | null} the removed top entry, or null when empty */
  pop() {
    if (this.#data.length === 0) {
      return null;
    }
    const top = this.#data[0];
    const last = this.#data.pop();
    // When the popped tail was not the root itself, move it to the root and restore order.
    if (this.#data.length > 0 && last) {
      this.#data[0] = last;
      this.#siftDown(0);
    }
    return top;
  }

  /** Return all entries sorted by ascending distance (the heap itself is not modified). */
  toSortedArray() {
    return [...this.#data].sort((a, b) => a.dist - b.dist);
  }

  #siftUp(i) {
    const data = this.#data;
    while (i > 0) {
      const parent = (i - 1) >> 1;
      if (data[parent].dist >= data[i].dist) {
        break;
      }
      [data[parent], data[i]] = [data[i], data[parent]];
      i = parent;
    }
  }

  #siftDown(i) {
    const data = this.#data;
    const n = data.length;
    while (true) {
      let largest = i;
      const left = 2 * i + 1;
      const right = 2 * i + 2;
      if (left < n && data[left].dist > data[largest].dist) {
        largest = left;
      }
      if (right < n && data[right].dist > data[largest].dist) {
        largest = right;
      }
      if (largest === i) {
        break;
      }
      [data[largest], data[i]] = [data[i], data[largest]];
      i = largest;
    }
  }
}

// ── HNSW Index ──

export class HnswIndex {
  // ── Hyper-parameters ──
  M; // max neighbors per node on levels above 0
  M0; // max neighbors per node on level 0 (= 2 * M)
  efConstruct; // search width used while building
  efSearch; // search width used while querying
  mL; // level sampling factor = 1 / ln(M) (0 when ln(M) is not positive)

  // ── Storage ──
  /**
   * Node slots; a removed node leaves a `null` slot so indices never shift.
   * @type {Array<{id: *, vector: ArrayLike<number>, level: number, qvector: * } | null>}
   */
  nodes = [];

  /**
   * Per-level adjacency: graphs[level].get(nodeIdx) → Set<neighborIdx>
   * @type {Array<Map<number, Set<number>>>}
   */
  graphs = [];

  entryPoint = -1; // index of the entry node (-1 when the index is empty)
  maxLevel = -1; // current highest level (-1 when the index is empty)

  /** id → nodeIdx */
  idToIndex = new Map();

  // ── Optional custom distance function (e.g. for a quantized space) ──
  #distanceFn = null;

  /**
   * @param {object} [options]
   * @param {number} [options.M=16] max neighbors per node on upper levels
   * @param {number} [options.efConstruct=200] build-time search width
   * @param {number} [options.efSearch=100] query-time search width
   * @param {(a: *, b: *) => number} [options.distanceFn] custom distance function
   * @throws {RangeError} when M is not a number >= 1
   */
  constructor(options = {}) {
    this.M = options.M || 16;
    if (!(this.M >= 1)) {
      // A negative / NaN M would silently yield NaN levels and a broken graph.
      throw new RangeError(`HnswIndex: M must be a number >= 1, got ${String(options.M)}`);
    }
    this.M0 = this.M * 2;
    this.efConstruct = options.efConstruct || 200;
    this.efSearch = options.efSearch || 100;

    // ln(1) === 0 would make mL Infinity and every sampled level Infinity/NaN,
    // which never terminates in #ensureLevel. Fall back to a flat, single-level graph.
    const logM = Math.log(this.M);
    this.mL = logM > 0 ? 1 / logM : 0;

    if (options.distanceFn) {
      this.#distanceFn = options.distanceFn;
    }
  }

  /** Number of node slots, including slots left behind by removed nodes. */
  get size() {
    return this.nodes.length;
  }

  /**
   * Distance between two vectors: the custom distance function when one was
   * supplied, otherwise cosine distance (smaller means more similar).
   */
  distance(a, b) {
    if (this.#distanceFn) {
      return this.#distanceFn(a, b);
    }
    return cosineDistance(a, b);
  }

  /**
   * Sample a level for a new node.
   * Uses 1 - Math.random(), which lies in (0, 1], so log(0) = -Infinity never occurs.
   * Math.max(0, …) also turns a possible -0 into +0.
   */
  #randomLevel() {
    return Math.max(0, Math.floor(-Math.log(1 - Math.random()) * this.mL));
  }

  /** Make sure `graphs` has at least level + 1 layers. */
  #ensureLevel(level) {
    while (this.graphs.length <= level) {
      this.graphs.push(new Map());
    }
  }

  /**
   * Neighbor set of a node on a level (created and stored when missing).
   * When the level itself does not exist, a detached empty set is returned.
   */
  #getNeighbors(level, nodeIdx) {
    const graph = this.graphs[level];
    if (!graph) {
      return new Set();
    }
    let neighbors = graph.get(nodeIdx);
    if (!neighbors) {
      neighbors = new Set();
      graph.set(nodeIdx, neighbors);
    }
    return neighbors;
  }

  /**
   * Insert a vector into the index.
   * @param {*} id document ID; an existing ID is removed first (update semantics)
   * @param {ArrayLike<number>} vector
   * @param {object} [options]
   * @param {*} [options.qvector] pre-quantized vector (SQ8) used to speed up 2-pass search
   */
  addPoint(id, vector, options = {}) {
    // If the id already exists, drop the old entry first (supports updates).
    if (this.idToIndex.has(id)) {
      this.removePoint(id);
    }

    const nodeLevel = this.#randomLevel();
    const nodeIdx = this.nodes.length;
    this.nodes.push({ id, vector, level: nodeLevel, qvector: options.qvector || null });
    this.idToIndex.set(id, nodeIdx);
    this.#ensureLevel(nodeLevel);

    // First node: it simply becomes the entry point.
    if (this.entryPoint === -1) {
      this.entryPoint = nodeIdx;
      this.maxLevel = nodeLevel;
      return;
    }

    // Phase 1: greedy descent from the top level down to nodeLevel + 1.
    let current = this.entryPoint;
    for (let level = this.maxLevel; level > nodeLevel; level--) {
      current = this.#greedySearch(vector, current, level);
    }

    // Phase 2: from min(nodeLevel, maxLevel) downwards, run an efConstruct-wide search per level.
    for (let level = Math.min(nodeLevel, this.maxLevel); level >= 0; level--) {
      const candidates = this.#searchLayer(vector, current, this.efConstruct, level);

      // Pick the M (M0 on level 0) nearest candidates as neighbors.
      const maxNeighbors = level === 0 ? this.M0 : this.M;
      const neighbors = this.#selectNeighborsSimple(candidates, maxNeighbors);

      // Connect in both directions.
      for (const neighbor of neighbors) {
        const neighborsOfNode = this.#getNeighbors(level, nodeIdx);
        neighborsOfNode.add(neighbor.nodeIdx);

        const neighborsOfNeighbor = this.#getNeighbors(level, neighbor.nodeIdx);
        neighborsOfNeighbor.add(nodeIdx);

        // If the neighbor now has too many links, prune its farthest ones.
        const limit = level === 0 ? this.M0 : this.M;
        if (neighborsOfNeighbor.size > limit) {
          this.#pruneConnections(neighbor.nodeIdx, level, limit);
        }
      }

      // Continue the next level from the nearest candidate.
      if (candidates.length > 0) {
        current = candidates[0].nodeIdx;
      }
    }

    // A node above the current top level becomes the new entry point.
    if (nodeLevel > this.maxLevel) {
      this.maxLevel = nodeLevel;
      this.entryPoint = nodeIdx;
    }
  }

  /**
   * Remove a vector (soft delete: all links are cut but the slot is kept).
   * Unknown ids are ignored. Full compaction can be done at persistence time.
   * @param {*} id
   */
  removePoint(id) {
    const nodeIdx = this.idToIndex.get(id);
    if (nodeIdx === undefined) {
      return;
    }
    const node = this.nodes[nodeIdx];
    if (!node) {
      return;
    }

    // Cut the links on every level the node lives on.
    for (let level = 0; level <= node.level; level++) {
      const graph = this.graphs[level];
      if (!graph) {
        continue;
      }
      const neighbors = graph.get(nodeIdx);
      if (neighbors) {
        // Remove the back-references held by the neighbors.
        for (const neighborIdx of neighbors) {
          const neighborSet = graph.get(neighborIdx);
          if (neighborSet) {
            neighborSet.delete(nodeIdx);
          }
        }
        graph.delete(nodeIdx);
      }
    }

    // Mark as deleted (keep the slot so other indices do not shift).
    this.nodes[nodeIdx] = null;
    this.idToIndex.delete(id);

    // If the entry point was removed, elect a new one.
    if (this.entryPoint === nodeIdx) {
      this.#findNewEntryPoint();
    }
  }

  /** Elect the live node with the highest level as entry point (after a removal). */
  #findNewEntryPoint() {
    this.entryPoint = -1;
    this.maxLevel = -1;
    for (let i = 0; i < this.nodes.length; i++) {
      const node = this.nodes[i];
      if (node && node.level > this.maxLevel) {
        this.maxLevel = node.level;
        this.entryPoint = i;
      }
    }
  }

  /**
   * Set the quantized vector of every live node with a non-empty vector.
   * @param {{encode: (vector: ArrayLike<number>) => *}} quantizer
   */
  setQuantizedVectors(quantizer) {
    for (const node of this.nodes) {
      if (node && node.vector.length > 0) {
        node.qvector = quantizer.encode(node.vector);
      }
    }
  }

  /**
   * Search the K nearest neighbors.
   *
   * Supports 2-pass search (SQ8 coarse ranking + Float32 re-ranking):
   *  - enabled when both quantizedQuery and quantizer are passed
   *  - Phase 1-2: graph traversal using SQ8 quantized distances (fast coarse ranking)
   *  - Phase 3: candidates are re-ranked with the exact Float32 cosine distance
   *
   * @param {ArrayLike<number>} queryVector
   * @param {number} [k=10] number of results; a non-positive k yields []
   * @param {object} [options]
   * @param {*} [options.quantizedQuery] SQ8-encoded query vector
   * @param {{distance: (a: *, b: *) => number}} [options.quantizer] SQ8 quantizer
   * @returns {Array<{id: *, nodeIdx: number, dist: number}>} ascending by distance
   */
  searchKnn(queryVector, k = 10, options = {}) {
    if (this.entryPoint === -1 || this.nodes.length === 0) {
      return [];
    }
    // A negative k would make slice(0, k) return almost every candidate.
    if (!(k > 0)) {
      return [];
    }

    const { quantizedQuery, quantizer } = options;
    const use2Pass = !!(quantizedQuery && quantizer);

    // Phase 1: greedy descent from the top level to level 1 (SQ8 distances when available).
    let current = this.entryPoint;
    for (let level = this.maxLevel; level > 0; level--) {
      current = this.#greedySearch(
        queryVector,
        current,
        level,
        use2Pass ? quantizer : null,
        quantizedQuery,
      );
    }

    // Phase 2: efSearch-wide search on level 0 (SQ8 coarse ranking).
    const ef = Math.max(this.efSearch, k);
    const candidates = this.#searchLayer(
      queryVector,
      current,
      ef,
      0,
      use2Pass ? quantizer : null,
      quantizedQuery,
    );

    // Phase 3: 2-pass re-ranking — re-score candidates with the exact Float32 cosine distance.
    if (use2Pass) {
      for (const c of candidates) {
        const node = this.nodes[c.nodeIdx];
        if (node) {
          c.dist = cosineDistance(queryVector, node.vector);
        }
      }
      candidates.sort((a, b) => a.dist - b.dist);
    }

    // Return the first k.
    return candidates.slice(0, k).map((c) => ({
      id: this.nodes[c.nodeIdx]?.id,
      nodeIdx: c.nodeIdx,
      dist: c.dist,
    }));
  }

  /**
   * Greedy search — walk one level towards the node closest to the query.
   * @param quantizer SQ8 quantizer (optional)
   * @param quantizedQuery SQ8-encoded query vector (optional)
   * @returns {number} index of the closest node found
   */
  #greedySearch(query, entryNodeIdx, level, quantizer = null, quantizedQuery = null) {
    let current = entryNodeIdx;
    const currentNode = this.nodes[current];
    if (!currentNode) {
      return current;
    }
    let currentDist = this.#dist(query, currentNode, quantizer, quantizedQuery);

    let improved = true;
    while (improved) {
      improved = false;
      const neighbors = this.#getNeighbors(level, current);
      for (const neighborIdx of neighbors) {
        const neighbor = this.nodes[neighborIdx];
        if (!neighbor) {
          continue; // deleted node
        }
        const dist = this.#dist(query, neighbor, quantizer, quantizedQuery);
        if (dist < currentDist) {
          current = neighborIdx;
          currentDist = dist;
          improved = true;
        }
      }
    }
    return current;
  }

  /**
   * searchLayer — the core best-first search of HNSW on a single level.
   * @param ef search width
   * @returns {Array<{nodeIdx: number, dist: number}>} ascending by distance
   */
  #searchLayer(query, entryNodeIdx, ef, level, quantizer = null, quantizedQuery = null) {
    const entryNode = this.nodes[entryNodeIdx];
    if (!entryNode) {
      return [];
    }

    const visited = new Set([entryNodeIdx]);
    const entryDist = this.#dist(query, entryNode, quantizer, quantizedQuery);

    // candidates: nodes still to explore, MinHeap (closest first)
    const candidates = new MinHeap();
    candidates.push(entryNodeIdx, entryDist);

    // results: current top-ef, MaxHeap (farthest on top, easy to evict)
    const results = new MaxHeap();
    results.push(entryNodeIdx, entryDist);

    while (candidates.size > 0) {
      const nearest = candidates.pop();
      const farthest = results.peek();

      // Stop once the closest candidate is farther than the worst result.
      if (nearest.dist > farthest.dist) {
        break;
      }

      // Explore the neighbors of the closest candidate.
      const neighbors = this.#getNeighbors(level, nearest.nodeIdx);
      for (const neighborIdx of neighbors) {
        if (visited.has(neighborIdx)) {
          continue;
        }
        visited.add(neighborIdx);

        const neighbor = this.nodes[neighborIdx];
        if (!neighbor) {
          continue; // deleted
        }

        const dist = this.#dist(query, neighbor, quantizer, quantizedQuery);
        const currentFarthest = results.peek();

        if (dist < currentFarthest.dist || results.size < ef) {
          candidates.push(neighborIdx, dist);
          results.push(neighborIdx, dist);
          if (results.size > ef) {
            results.pop(); // evict the farthest
          }
        }
      }
    }

    return results.toSortedArray();
  }

  /**
   * Distance computation: prefer the SQ8 quantized distance, fall back to the
   * exact distance on the full vectors.
   * @param node { vector, qvector? }
   */
  #dist(query, node, quantizer, quantizedQuery) {
    if (quantizer && quantizedQuery && node.qvector) {
      return quantizer.distance(quantizedQuery, node.qvector);
    }
    return this.distance(query, node.vector);
  }

  /**
   * Simple neighbor selection — the maxNeighbors closest candidates.
   * Sorts a copy so the caller's array is left untouched.
   * @param {Array<{nodeIdx: number, dist: number}>} candidates
   * @returns {Array<{nodeIdx: number, dist: number}>}
   */
  #selectNeighborsSimple(candidates, maxNeighbors) {
    return [...candidates].sort((a, b) => a.dist - b.dist).slice(0, maxNeighbors);
  }

  /**
   * Prune a node's links on a level down to maxNeighbors,
   * keeping the closest neighbors and dropping the farthest.
   */
  #pruneConnections(nodeIdx, level, maxNeighbors) {
    const node = this.nodes[nodeIdx];
    if (!node) {
      return;
    }

    const neighbors = this.#getNeighbors(level, nodeIdx);
    if (neighbors.size <= maxNeighbors) {
      return;
    }

    // Score every live neighbor by distance; deleted neighbors are dropped.
    const scored = [];
    for (const nIdx of neighbors) {
      const nNode = this.nodes[nIdx];
      if (!nNode) {
        continue;
      }
      scored.push({ nodeIdx: nIdx, dist: this.distance(node.vector, nNode.vector) });
    }
    scored.sort((a, b) => a.dist - b.dist);

    // Rebuild the neighbor set from the closest ones.
    const newNeighbors = new Set(scored.slice(0, maxNeighbors).map((s) => s.nodeIdx));
    this.graphs[level].set(nodeIdx, newNeighbors);

    // Remove the reverse links held by the neighbors that were dropped.
    for (const s of scored.slice(maxNeighbors)) {
      const removedNeighborSet = this.graphs[level]?.get(s.nodeIdx);
      if (removedNeighborSet) {
        removedNeighborSet.delete(nodeIdx);
      }
    }
  }

  // ── Serialization / deserialization (used by BinaryPersistence) ──

  /**
   * Export the index state (for persistence).
   * qvector is not serialized; the custom distance function is not serialized either.
   * @returns {{M: number, M0: number, efConstruct: number, efSearch: number,
   *   entryPoint: number, maxLevel: number,
   *   nodes: Array<{id: *, vector: number[], level: number} | null>,
   *   graphs: Array<Array<[number, number[]]>>}}
   */
  serialize() {
    // Turn the Map<number, Set<number>> graphs into plain arrays.
    const serializedGraphs = this.graphs.map((graph) => {
      const entries = [];
      for (const [nodeIdx, neighbors] of graph) {
        entries.push([nodeIdx, [...neighbors]]);
      }
      return entries;
    });

    // Note: qvector is not serialized (re-encoded by the quantizer at startup to save space).
    return {
      M: this.M,
      M0: this.M0,
      efConstruct: this.efConstruct,
      efSearch: this.efSearch,
      entryPoint: this.entryPoint,
      maxLevel: this.maxLevel,
      nodes: this.nodes.map((n) =>
        n ? { id: n.id, vector: Array.from(n.vector), level: n.level } : null,
      ),
      graphs: serializedGraphs,
    };
  }

  /**
   * Restore an index from serialized data.
   * @param data the value returned by serialize()
   * @returns {HnswIndex}
   */
  static deserialize(data) {
    const index = new HnswIndex({
      M: data.M,
      efConstruct: data.efConstruct,
      efSearch: data.efSearch,
    });
    // Keep the constructor's 2 * M when the payload has no M0; an undefined M0
    // would make every "size > limit" check false and disable level-0 pruning.
    index.M0 = data.M0 ?? index.M0;
    index.entryPoint = data.entryPoint;
    index.maxLevel = data.maxLevel;
    index.nodes = data.nodes.map((n) =>
      n
        ? { id: n.id, vector: new Float32Array(n.vector), level: n.level, qvector: null }
        : null,
    );

    // Restore idToIndex.
    for (let i = 0; i < index.nodes.length; i++) {
      const node = index.nodes[i];
      if (node) {
        index.idToIndex.set(node.id, i);
      }
    }

    // Restore graphs.
    index.graphs = data.graphs.map((entries) => {
      const graph = new Map();
      for (const [nodeIdx, neighbors] of entries) {
        graph.set(nodeIdx, new Set(neighbors));
      }
      return graph;
    });

    return index;
  }

  /**
   * Bulk insert: calls addPoint for every item, forwarding an optional
   * per-item quantized vector.
   * @param {Iterable<{id: *, vector: ArrayLike<number>, qvector?: *}>} items
   */
  addPoints(items) {
    for (const item of items) {
      this.addPoint(item.id, item.vector, { qvector: item.qvector });
    }
  }

  /**
   * Index statistics.
   * `totalEdges` sums the sizes of all neighbor sets, so a bidirectional link counts twice.
   */
  getStats() {
    const activeNodes = this.nodes.filter((n) => n !== null).length;
    let totalEdges = 0;
    for (const graph of this.graphs) {
      for (const neighbors of graph.values()) {
        totalEdges += neighbors.size;
      }
    }
    return {
      totalNodes: activeNodes,
      deletedSlots: this.nodes.length - activeNodes,
      maxLevel: this.maxLevel,
      levels: this.graphs.length,
      totalEdges,
      entryPoint: this.entryPoint,
    };
  }
}

/**
 * Cosine distance = 1 - cosineSimilarity.
 * Returns 1 when either vector is missing, empty or has zero norm.
 * Vectors of different lengths are compared over the shorter length.
 * @param {ArrayLike<number>} a
 * @param {ArrayLike<number>} b
 * @returns {number}
 */
export function cosineDistance(a, b) {
  if (!a || !b || a.length === 0 || b.length === 0) {
    return 1;
  }
  const len = Math.min(a.length, b.length);
  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < len; i++) {
    dot += a[i] * b[i];
    normA += a[i] * a[i];
    normB += b[i] * b[i];
  }
  const denom = Math.sqrt(normA) * Math.sqrt(normB);
  if (denom === 0) {
    return 1;
  }
  const similarity = dot / denom;
  return 1 - similarity;
}

export { MinHeap, MaxHeap };