autosnippet
Version:
Extract code patterns into a knowledge base for AI coding assistants
757 lines (756 loc) • 27.7 kB
JavaScript
/**
* HnswVectorAdapter — 基于 HNSW 的向量存储实现
*
* 实现 VectorStore 接口, 内部使用:
* - HnswIndex: 纯 JS HNSW 近似最近邻索引
* - ScalarQuantizer: SQ8 量化 (文档数 > threshold 时自动启用)
* - BinaryPersistence: .asvec 二进制持久化
*
* 特点:
* - O(log N) 搜索, 替代暴力 O(N)
* - 75% 内存节省 (SQ8 量化)
* - 异步 debounced 持久化
* - 自动从 JSON 旧格式迁移
*
* @module infrastructure/vector/HnswVectorAdapter
*/
import { existsSync, mkdirSync, readFileSync, renameSync } from 'node:fs';
import { join } from 'node:path';
import pathGuard from '../../shared/PathGuard.js';
import { AsyncPersistence, WAL_OP } from './AsyncPersistence.js';
import { BinaryPersistence } from './BinaryPersistence.js';
import { HnswIndex } from './HnswIndex.js';
import { ScalarQuantizer } from './ScalarQuantizer.js';
import { VectorStore } from './VectorStore.js';
export class HnswVectorAdapter extends VectorStore {
/** HNSW index holding the vectors (an HnswIndex instance) */
#index;
/** id → metadata */
#metadata;
/** id → content */
#contents;
/** ScalarQuantizer instance, or null while none has been loaded or trained */
#quantizer;
/** Vector dimension (auto-detected on the first upsert that carries a vector) */
#dimension = 0;
/** Whether in-memory data has been modified since the last successful save */
#dirty = false;
/** Timer handle of the debounced flush (used when no WAL is active) */
#flushTimer = null;
/** Number of operations waiting to be flushed */
#pendingOps = 0;
/** Whether a flush is currently in progress */
#flushing = false;
/** WAL persistence manager (AsyncPersistence), or null when not initialised / disabled */
#wal = null;
// ── Configuration ──
#config;
#indexDir;
#indexPath; // path of the .asvec file
/**
* @param [options.quantize='auto'] 'auto' | 'sq8' | 'none'
* @param [options.walEnabled=true] 启用 WAL 持久化
*/
constructor(projectRoot, options = {}) {
super();
this.#config = {
M: options.M || 16,
efConstruct: options.efConstruct || 200,
efSearch: options.efSearch || 100,
quantize: options.quantize ?? 'auto',
quantizeThreshold: options.quantizeThreshold || 3000,
flushIntervalMs: options.flushIntervalMs || 2000,
flushBatchSize: options.flushBatchSize || 100,
walEnabled: options.walEnabled !== false,
};
this.#indexDir = options.indexDir || join(projectRoot, '.autosnippet/context/index');
this.#indexPath = join(this.#indexDir, 'vector_index.asvec');
this.#metadata = new Map();
this.#contents = new Map();
this.#quantizer = null;
this.#index = new HnswIndex({
M: this.#config.M,
efConstruct: this.#config.efConstruct,
efSearch: this.#config.efSearch,
});
}
/**
* 初始化: 加载已有索引或创建新索引
* 自动检测 JSON 旧索引并迁移
*/
async init() {
// 路径安全检查 — 阻止在开发仓库内创建向量索引目录
pathGuard.assertProjectWriteSafe(this.#indexDir);
// 确保目录存在
if (!existsSync(this.#indexDir)) {
mkdirSync(this.#indexDir, { recursive: true });
}
// 尝试加载二进制索引
if (existsSync(this.#indexPath) && BinaryPersistence.isValid(this.#indexPath)) {
try {
const loaded = BinaryPersistence.load(this.#indexPath);
const { indexData, quantizerData, metadata, contents, dimension } = loaded;
// 恢复 HNSW 索引
this.#index = HnswIndex.deserialize(indexData);
this.#index.efSearch = this.#config.efSearch;
this.#dimension = dimension;
// 恢复量化器
if (quantizerData) {
this.#quantizer = ScalarQuantizer.deserialize(quantizerData);
// 从 quantizer 重新编码量化向量到 HNSW 节点 (qvector 不序列化, 启动时重建)
this.#index.setQuantizedVectors(this.#quantizer);
}
// 恢复 metadata 和 contents
this.#metadata = metadata;
this.#contents = contents;
// 初始化 WAL + replay 崩溃前未刷盘的操作
this.#initWal();
const { replayed } = this.#wal?.recover() || { replayed: 0 };
if (replayed > 0) {
this.#dirty = true;
await this.#persist();
}
return;
}
catch {
// 损坏的文件, 忽略, 重新构建
}
}
// 尝试从 JSON 迁移
const { VectorMigration } = await import('./VectorMigration.js');
const migrationResult = await VectorMigration.migrate(this.#indexDir, this);
if (migrationResult === 'migrated') {
// 迁移完成, 数据已加载到内存
await this.#persist();
}
// 初始化 WAL + replay 未刷盘操作 (即使是空索引也创建, 以便后续操作写 WAL)
this.#initWal();
const { replayed } = this.#wal?.recover() || { replayed: 0 };
if (replayed > 0) {
this.#dirty = true;
await this.#persist();
}
}
/**
* 同步初始化 (兼容 JsonVectorAdapter)
* 注意: 同步路径无法执行 async 迁移, 但会尝试同步加载 JSON
*/
initSync() {
// 路径安全检查 — 阻止在开发仓库内创建向量索引目录
pathGuard.assertProjectWriteSafe(this.#indexDir);
if (!existsSync(this.#indexDir)) {
mkdirSync(this.#indexDir, { recursive: true });
}
// 尝试加载二进制索引
if (existsSync(this.#indexPath) && BinaryPersistence.isValid(this.#indexPath)) {
try {
const loaded = BinaryPersistence.load(this.#indexPath);
const { indexData, quantizerData, metadata, contents, dimension } = loaded;
this.#index = HnswIndex.deserialize(indexData);
this.#index.efSearch = this.#config.efSearch;
this.#dimension = dimension;
if (quantizerData) {
this.#quantizer = ScalarQuantizer.deserialize(quantizerData);
// 从 quantizer 重新编码量化向量到 HNSW 节点 (qvector 不序列化, 启动时重建)
this.#index.setQuantizedVectors(this.#quantizer);
}
this.#metadata = metadata;
this.#contents = contents;
// 初始化 WAL + replay
this.#initWal();
const { replayed } = this.#wal?.recover() || { replayed: 0 };
if (replayed > 0) {
this.#dirty = true;
// 同步 persist (WAL 已 replay 到内存, 需要落盘)
BinaryPersistence.save(this.#indexPath, {
index: this.#index,
quantizer: this.#quantizer,
metadata: this.#metadata,
contents: this.#contents,
});
this.#dirty = false;
}
return;
}
catch {
// 损坏或不兼容, 尝试从 JSON 迁移
}
}
// 同步迁移: 读取 JSON 索引并加载到内存
this.#syncMigrateFromJson();
// 初始化 WAL + replay 未刷盘操作
this.#initWal();
const { replayed } = this.#wal?.recover() || { replayed: 0 };
if (replayed > 0) {
this.#dirty = true;
BinaryPersistence.save(this.#indexPath, {
index: this.#index,
quantizer: this.#quantizer,
metadata: this.#metadata,
contents: this.#contents,
});
this.#dirty = false;
}
}
/** 同步从 JSON 索引迁移 (用于 initSync 路径) */
#syncMigrateFromJson() {
const jsonPath = join(this.#indexDir, 'vector_index.json');
if (!existsSync(jsonPath)) {
return;
}
try {
const raw = readFileSync(jsonPath, 'utf-8');
const items = JSON.parse(raw);
const itemList = Array.isArray(items)
? items
: Object.entries(items).map(([id, item]) => ({ ...item, id }));
for (const item of itemList) {
if (!item?.id) {
continue;
}
const vector = item.vector || [];
if (vector.length > 0 && this.#dimension === 0) {
this.#dimension = vector.length;
}
this.#metadata.set(item.id, {
...(item.metadata || {}),
updatedAt: Date.now(),
});
this.#contents.set(item.id, item.content || '');
if (vector.length > 0) {
this.#index.addPoint(item.id, vector);
}
}
// 同步保存二进制索引
BinaryPersistence.save(this.#indexPath, {
index: this.#index,
quantizer: this.#quantizer,
metadata: this.#metadata,
contents: this.#contents,
});
this.#dirty = false;
// 重命名旧文件
try {
renameSync(jsonPath, `${jsonPath}.bak`);
}
catch {
/* ignore */
}
}
catch {
// JSON 解析失败, 保持空索引
}
}
async upsert(item) {
if (!item?.id) {
throw new Error('Item must have an id');
}
const vector = item.vector || [];
// 自动检测维度 + 维度一致性守卫
if (vector.length > 0) {
if (this.#dimension === 0) {
this.#dimension = vector.length;
}
else if (vector.length !== this.#dimension) {
throw new Error(`Vector dimension mismatch: store has ${this.#dimension}d, ` +
`new vector is ${vector.length}d. ` +
`This usually means the embedding model was changed. ` +
`Run 'asd embed --clear --force' to rebuild with the new model.`);
}
}
// 存储 metadata 和 content
this.#metadata.set(item.id, {
...(item.metadata || {}),
updatedAt: Date.now(),
});
this.#contents.set(item.id, item.content || '');
// 如果有向量, 插入 HNSW 索引
if (vector.length > 0) {
const qvector = this.#quantizer?.trained ? this.#quantizer.encode(vector) : null;
this.#index.addPoint(item.id, vector, { qvector });
}
this.#dirty = true;
this.#pendingOps++;
// 定期检查是否需要训练量化器 (每 500 次 upsert 检查一次)
if (this.#pendingOps % 500 === 0) {
this.#maybeTrainQuantizer();
}
// WAL 追加 + 调度 flush
if (this.#wal) {
this.#wal.appendWal({
t: WAL_OP.UPSERT,
id: item.id,
c: item.content || '',
v: vector.length > 0 ? Array.from(vector) : [],
m: item.metadata || {},
});
}
else {
this.#scheduleFlush();
}
}
async batchUpsert(items) {
const walOps = [];
for (const item of items) {
if (!item?.id) {
continue;
}
const vector = item.vector || [];
// 维度一致性守卫
if (vector.length > 0) {
if (this.#dimension === 0) {
this.#dimension = vector.length;
}
else if (vector.length !== this.#dimension) {
throw new Error(`Vector dimension mismatch: store has ${this.#dimension}d, ` +
`new vector is ${vector.length}d. ` +
`This usually means the embedding model was changed. ` +
`Run 'asd embed --clear --force' to rebuild with the new model.`);
}
}
this.#metadata.set(item.id, {
...(item.metadata || {}),
updatedAt: Date.now(),
});
this.#contents.set(item.id, item.content || '');
if (vector.length > 0) {
const qvector = this.#quantizer?.trained ? this.#quantizer.encode(vector) : null;
this.#index.addPoint(item.id, vector, { qvector });
}
walOps.push({
t: WAL_OP.UPSERT,
id: item.id,
c: item.content || '',
v: vector.length > 0 ? Array.from(vector) : [],
m: item.metadata || {},
});
}
this.#dirty = true;
this.#pendingOps += items.length;
// 检查是否需要训练/重训练量化器
this.#maybeTrainQuantizer();
// WAL 批量追加
if (this.#wal) {
for (const op of walOps) {
this.#wal.appendWal(op);
}
}
else {
this.#scheduleFlush();
}
}
async remove(id) {
this.#index.removePoint(id);
this.#metadata.delete(id);
this.#contents.delete(id);
this.#dirty = true;
this.#pendingOps++;
if (this.#wal) {
this.#wal.appendWal({ t: WAL_OP.REMOVE, id });
}
else {
this.#scheduleFlush();
}
}
async getById(id) {
if (!this.#metadata.has(id) && !this.#contents.has(id)) {
return null;
}
const nodeIdx = this.#index.idToIndex.get(id);
const node = nodeIdx !== undefined ? this.#index.nodes[nodeIdx] : null;
return {
id,
content: this.#contents.get(id) || '',
vector: node ? Array.from(node.vector) : [],
metadata: this.#metadata.get(id) || {},
};
}
/**
* 向量相似度搜索 — HNSW O(log N)
*
* 当量化器已训练时启用 2-pass 搜索:
* - Pass 1 (粗排): SQ8 量化距离在 HNSW 图中遍历, 获取 efSearch 个候选
* - Pass 2 (精排): Float32 精确余弦距离对候选重排, 返回 top-K
*/
async searchVector(queryVector, options = {}) {
const { topK = 10, filter = null, minScore = 0 } = options;
if (!queryVector || queryVector.length === 0) {
return [];
}
// HNSW 搜索 (多召回一些, 后续过滤可能减少)
const rawK = filter ? topK * 3 : topK;
let knnResults;
if (this.#quantizer?.trained && this.#index.size > this.#config.quantizeThreshold) {
// 2-pass: SQ8 粗排 → Float32 精排
const quantizedQuery = this.#quantizer.encode(queryVector);
knnResults = this.#index.searchKnn(queryVector, rawK, {
quantizedQuery,
quantizer: this.#quantizer,
});
}
else {
// 直接 Float32 搜索
knnResults = this.#index.searchKnn(queryVector, rawK);
}
// 转换为标准格式 + 过滤
let results = knnResults
.filter((r) => r.id) // 过滤掉已删除节点
.map((r) => ({
item: {
id: r.id,
content: this.#contents.get(r.id) || '',
vector: this.#index.nodes[r.nodeIdx]
? Array.from(this.#index.nodes[r.nodeIdx].vector)
: [],
metadata: this.#metadata.get(r.id) || {},
},
score: 1 - r.dist, // 距离转相似度
}))
.filter((r) => r.score >= minScore);
// 应用过滤
if (filter) {
results = results.filter((r) => this.#matchFilter(r.item, filter));
}
return results.slice(0, topK);
}
/**
* 混合搜索: HNSW 向量 + 关键词, 使用 RRF (Reciprocal Rank Fusion) 融合
*
* score = α × 1/(k+rank_dense) + (1-α) × 1/(k+rank_sparse)
*
* @deprecated 优先使用 VectorService.hybridSearch() → HybridRetriever.fuse()
* 此方法保留作为 VectorStore 层的本地混合搜索能力
*/
async hybridSearch(queryVector, queryText, options = {}) {
const { topK = 10, filter = null, rrfK = 60, alpha = 0.5 } = options;
const expandedK = topK * 3;
// Dense: HNSW 向量搜索
const vectorResults = queryVector && queryVector.length > 0
? await this.searchVector(queryVector, { topK: expandedK, filter })
: [];
// Sparse: 关键词搜索
const keywordResults = this.#keywordSearch(queryText, expandedK, filter);
// RRF 融合
const scores = new Map();
// Dense RRF 分数
vectorResults.forEach((r, rank) => {
const id = r.item.id;
const entry = scores.get(id) || { item: r.item, rrfScore: 0 };
entry.rrfScore += alpha * (1 / (rrfK + rank + 1));
entry.item = r.item;
scores.set(id, entry);
});
// Sparse RRF 分数
keywordResults.forEach((r, rank) => {
const id = r.id;
const existing = scores.get(id);
if (existing) {
existing.rrfScore += (1 - alpha) * (1 / (rrfK + rank + 1));
}
else {
scores.set(id, {
item: {
id,
content: this.#contents.get(id) || '',
vector: [],
metadata: this.#metadata.get(id) || {},
},
rrfScore: (1 - alpha) * (1 / (rrfK + rank + 1)),
});
}
});
// 按 RRF 分数降序, 归一化到 [0, 1]
const fused = [...scores.values()].sort((a, b) => b.rrfScore - a.rrfScore).slice(0, topK);
const maxScore = fused.length > 0 ? fused[0].rrfScore : 1;
return fused.map((r) => ({
item: r.item,
score: maxScore > 0 ? r.rrfScore / maxScore : 0,
vectorScore: 0,
keywordScore: 0,
}));
}
/**
* 关键词搜索 (token 匹配 + IDF 近似)
* @returns >}
*/
#keywordSearch(queryText, limit, filter) {
if (!queryText) {
return [];
}
const queryLower = queryText.toLowerCase();
const words = queryLower.split(/\s+/).filter((w) => w.length > 0);
if (words.length === 0) {
return [];
}
const results = [];
for (const [id, content] of this.#contents) {
if (filter) {
const item = { metadata: this.#metadata.get(id) || {} };
if (!this.#matchFilter(item, filter)) {
continue;
}
}
const textLower = content.toLowerCase();
const hits = words.filter((w) => textLower.includes(w)).length;
const keywordScore = hits / words.length;
if (keywordScore > 0) {
results.push({ id, score: keywordScore });
}
}
return results.sort((a, b) => b.score - a.score).slice(0, limit);
}
/** query() — SearchEngine 使用的向量搜索别名 */
async query(queryVector, topK = 10) {
const results = await this.searchVector(queryVector, { topK });
return results.map((r) => ({
id: r.item.id,
similarity: r.score,
score: r.score,
content: r.item.content,
metadata: r.item.metadata || {},
}));
}
async searchByFilter(filter) {
const results = [];
for (const [id, meta] of this.#metadata) {
const item = { id, content: this.#contents.get(id) || '', metadata: meta };
if (this.#matchFilter(item, filter)) {
results.push(item);
}
}
return results;
}
async listIds() {
return [...this.#metadata.keys()];
}
async clear() {
this.#index = new HnswIndex({
M: this.#config.M,
efConstruct: this.#config.efConstruct,
efSearch: this.#config.efSearch,
});
this.#metadata.clear();
this.#contents.clear();
this.#quantizer = null;
this.#dimension = 0;
this.#dirty = true;
if (this.#wal) {
this.#wal.appendWal({ t: WAL_OP.CLEAR });
}
else {
this.#scheduleFlush();
}
}
async getStats() {
const stats = this.#index.getStats();
return {
count: this.#metadata.size,
indexSize: 0, // 实际文件大小在 flush 后才知道
indexPath: this.#indexPath,
hasVectors: stats.totalNodes,
hnswLevels: stats.levels,
hnswEdges: stats.totalEdges,
quantized: this.#quantizer?.trained || false,
dimension: this.#dimension,
};
}
// ── 持久化 ──
/** 初始化 WAL (Write-Ahead Log) */
#initWal() {
if (!this.#config.walEnabled) {
return;
}
this.#wal = new AsyncPersistence({
indexPath: this.#indexPath,
enabled: true,
flushIntervalMs: this.#config.flushIntervalMs,
flushBatchSize: this.#config.flushBatchSize,
onPersist: () => this.#persist(),
onReplay: (op) => this.#replayOp(op),
});
}
/**
* 重放 WAL 操作 (启动时恢复崩溃前未刷盘的操作)
* @param op WAL 操作
*/
#replayOp(op) {
switch (op.t) {
case WAL_OP.UPSERT: {
const vector = (op.v || []);
if (vector.length > 0 && this.#dimension === 0) {
this.#dimension = vector.length;
}
this.#metadata.set(op.id, {
...(op.m || {}),
updatedAt: Date.now(),
});
this.#contents.set(op.id, (op.c || ''));
if (vector.length > 0) {
const qvector = this.#quantizer?.trained ? this.#quantizer.encode(vector) : null;
this.#index.addPoint(op.id, vector, { qvector });
}
break;
}
case WAL_OP.REMOVE:
this.#index.removePoint(op.id);
this.#metadata.delete(op.id);
this.#contents.delete(op.id);
break;
case WAL_OP.CLEAR:
this.#index = new HnswIndex({
M: this.#config.M,
efConstruct: this.#config.efConstruct,
efSearch: this.#config.efSearch,
});
this.#metadata.clear();
this.#contents.clear();
this.#quantizer = null;
this.#dimension = 0;
break;
}
}
/** 手动触发持久化 (测试/关闭时使用) */
async flush() {
if (this.#wal) {
await this.#wal.flush();
}
if (this.#dirty) {
await this.#persist();
}
}
#scheduleFlush() {
if (this.#flushing) {
return;
}
// 如果积累了足够操作, 立即 flush
if (this.#pendingOps >= this.#config.flushBatchSize) {
this.#doFlush();
return;
}
// 否则 debounced flush
if (this.#flushTimer) {
return;
}
this.#flushTimer = setTimeout(() => {
this.#flushTimer = null;
this.#doFlush();
}, this.#config.flushIntervalMs);
// unref() 使定时器不阻止 Node 进程退出
if (this.#flushTimer?.unref) {
this.#flushTimer.unref();
}
}
async #doFlush() {
if (this.#flushing || !this.#dirty) {
return;
}
this.#flushing = true;
this.#pendingOps = 0;
try {
await this.#persist();
}
catch {
/* persist failure is non-fatal */
}
finally {
this.#flushing = false;
}
}
async #persist() {
try {
await BinaryPersistence.saveAsync(this.#indexPath, {
index: this.#index,
quantizer: this.#quantizer,
metadata: this.#metadata,
contents: this.#contents,
});
this.#dirty = false;
}
catch {
/* 写入失败暂时忽略, 下次重试 */
}
}
// ── 量化器 ──
/** 检查是否需要训练量化器, 训练后批量设置量化向量到 HNSW 节点 */
#maybeTrainQuantizer() {
if (this.#config.quantize === 'none') {
return;
}
if (this.#config.quantize === 'auto' && this.#index.size < this.#config.quantizeThreshold) {
return;
}
// 已训练则跳过 (除非文档增长 50% 以上需要重训练)
if (this.#quantizer?.trained) {
return;
}
// 收集训练向量
const vectors = [];
for (const node of this.#index.nodes) {
if (node && node.vector.length > 0) {
vectors.push(node.vector);
}
}
if (vectors.length < 100) {
return; // 数据太少不训练
}
this.#quantizer = new ScalarQuantizer(this.#dimension);
this.#quantizer.train(vectors);
// 批量设置量化向量到 HNSW 节点 (用于 2-pass 搜索)
this.#index.setQuantizedVectors(this.#quantizer);
}
// ── 过滤 ──
#matchFilter(item, filter) {
const meta = item.metadata || {};
if (filter.type && meta.type !== filter.type) {
return false;
}
if (filter.category && meta.category !== filter.category) {
return false;
}
if (filter.language && meta.language !== filter.language) {
return false;
}
if (filter.sourcePath &&
!meta.sourcePath?.includes(filter.sourcePath)) {
return false;
}
if (filter.module && meta.module !== filter.module) {
return false;
}
if (filter.tags && Array.isArray(filter.tags)) {
const itemTags = meta.tags || [];
if (!filter.tags.some((t) => itemTags.includes(t))) {
return false;
}
}
if (filter.deprecated === false && meta.deprecated) {
return false;
}
return true;
}
/** 销毁: 清理定时器 */
destroy() {
// 清理 WAL
if (this.#wal) {
this.#wal.destroy();
}
// 清理 legacy 定时器
if (this.#flushTimer) {
clearTimeout(this.#flushTimer);
this.#flushTimer = null;
}
// 同步最后一次 persist
if (this.#dirty) {
try {
BinaryPersistence.save(this.#indexPath, {
index: this.#index,
quantizer: this.#quantizer,
metadata: this.#metadata,
contents: this.#contents,
});
this.#dirty = false;
}
catch {
/* ignore */
}
}
}
}