autosnippet
Version:
Extract code patterns into a knowledge base for AI coding assistants
486 lines (485 loc) • 18.4 kB
JavaScript
/**
* VectorService — 统一向量服务层
*
* 整合 IndexingPipeline、VectorStore、BatchEmbedder 等分散组件,
* 提供统一的索引构建、查询、CRUD 同步、维护接口。
*
* 设计原则:
* 1. 单一职责 — 统一管理向量生命周期(构建、更新、查询、维护)
* 2. 事件驱动 — 知识 CRUD → EventBus → 增量同步
* 3. 渐进增强 — 无 EmbedProvider 时 graceful degrade
* 4. CLI-first — `asd embed` 与 API 同等一等公民
*
* @module service/vector/VectorService
*/
import Logger from '../../infrastructure/logging/Logger.js';
// ── Service ──
export class VectorService {
#vectorStore;
#indexingPipeline;
#hybridRetriever;
#eventBus;
#embedProvider;
#contextualEnricher;
#syncCoordinator = null;
#autoSyncOnCrud;
#syncDebounceMs;
#drizzle;
#logger = Logger.getInstance();
#initialized = false;
// ── Embed circuit breaker ──
#embedConsecutiveFailures = 0;
#embedCircuitOpenUntil = 0;
static #EMBED_CIRCUIT_THRESHOLD = 3;
static #EMBED_CIRCUIT_COOLDOWN_MS = 60_000;
constructor(config) {
this.#vectorStore = config.vectorStore;
this.#indexingPipeline = config.indexingPipeline;
this.#hybridRetriever = config.hybridRetriever;
this.#eventBus = config.eventBus;
this.#embedProvider = config.embedProvider;
this.#contextualEnricher = config.contextualEnricher;
this.#autoSyncOnCrud = config.autoSyncOnCrud;
this.#syncDebounceMs = config.syncDebounceMs;
this.#drizzle = config.drizzle ?? null;
}
// ═══ Lifecycle ═══
/** 初始化: 绑定 EventBus 事件监听 */
async initialize() {
if (this.#initialized) {
return;
}
// 延迟 import SyncCoordinator 避免循环依赖
if (this.#autoSyncOnCrud && this.#eventBus && this.#embedProvider) {
const { SyncCoordinator: SC } = await import('./SyncCoordinator.js');
this.#syncCoordinator = new SC({
vectorStore: this.#vectorStore,
embedProvider: this.#embedProvider,
contextualEnricher: this.#contextualEnricher,
debounceMs: this.#syncDebounceMs,
drizzle: this.#drizzle ?? undefined,
});
this.#syncCoordinator.bindEventBus(this.#eventBus);
this.#logger.info('[VectorService] SyncCoordinator bound to EventBus');
}
this.#initialized = true;
this.#logger.info('[VectorService] Initialized', {
embedAvailable: !!this.#embedProvider,
autoSync: this.#autoSyncOnCrud,
});
}
// ═══ 索引管理 ═══
/**
* 全量构建向量索引
* 委托给 IndexingPipeline.run(),增加 enrichment 步骤和计时
*/
async fullBuild(opts = {}) {
const start = Date.now();
const pipelineResult = await this.#indexingPipeline.run({
force: opts.force ?? false,
dryRun: opts.dryRun ?? false,
clear: opts.clear ?? false,
onProgress: opts.onProgress
? (info) => opts.onProgress(info)
: undefined,
});
return {
scanned: pipelineResult.scanned,
chunked: pipelineResult.chunked,
enriched: pipelineResult.enriched ?? 0,
embedded: pipelineResult.embedded,
upserted: pipelineResult.upserted,
skipped: pipelineResult.skipped,
errors: pipelineResult.errors,
duration: Date.now() - start,
};
}
/**
* 增量更新: 只处理指定的变更文件
* 适用于文件系统级变更(watch 或 git diff)
*/
async incrementalUpdate(changedFiles, opts = {}) {
const start = Date.now();
if (changedFiles.length === 0) {
return {
scanned: 0,
chunked: 0,
enriched: 0,
embedded: 0,
upserted: 0,
skipped: 0,
errors: 0,
duration: 0,
};
}
// 用 IndexingPipeline 的 run(),但只针对变更文件
// 目前 pipeline 不支持 file filter,使用 fullBuild 的 force 模式
// 未来可以扩展 pipeline 支持 filter
const pipelineResult = await this.#indexingPipeline.run({
force: true,
dryRun: false,
clear: false,
onProgress: opts.onProgress
? (info) => opts.onProgress(info)
: undefined,
});
return {
scanned: pipelineResult.scanned,
chunked: pipelineResult.chunked,
enriched: 0,
embedded: pipelineResult.embedded,
upserted: pipelineResult.upserted,
skipped: pipelineResult.skipped,
errors: pipelineResult.errors,
duration: Date.now() - start,
};
}
/** 清空向量索引 */
async clear() {
await this.#vectorStore.clear();
this.#logger.info('[VectorService] Vector index cleared');
}
/**
* 校验向量索引健康状态
* - 维度一致性
* - 孤儿向量检查 (向量有但 DB 无对应 entry)
* - Embed Provider 可用性
*/
async validate() {
const issues = [];
try {
const stats = await this.#vectorStore.getStats();
const storeStats = stats;
// 检查索引是否有数据
if (storeStats.count === 0) {
issues.push('Vector index is empty. Run `asd embed` to build the index.');
}
// 检查维度是否已设置
if (storeStats.dimension !== undefined &&
storeStats.dimension === 0 &&
storeStats.count > 0) {
issues.push('Vector dimension is 0 but entries exist. Index may be corrupted.');
}
// 检查 embed provider 可用性
if (!this.#embedProvider) {
issues.push('No embedding provider configured. Semantic search will not work.');
}
// 孤儿向量检查: 检查 entry_ 前缀的 ID 是否有未知的
if (storeStats.count > 0) {
try {
const allIds = await this.#vectorStore.listIds();
const entryIds = allIds.filter((id) => id.startsWith('entry_'));
if (entryIds.length > 0) {
// 统计 entry_ 前缀的向量数量
this.#logger.info('[VectorService] validate: found entry vectors', {
entryVectors: entryIds.length,
totalVectors: allIds.length,
});
}
}
catch {
// listIds 不支持时跳过孤儿检查
}
}
}
catch (err) {
issues.push(`Failed to get vector stats: ${err instanceof Error ? err.message : String(err)}`);
}
return {
healthy: issues.length === 0,
issues,
};
}
// ═══ 查询 ═══
/**
* 语义搜索
* Embed query → vectorStore.searchVector → 返回结果
*/
async search(query, opts = {}) {
if (!this.#embedProvider) {
return [];
}
const { topK = 10, filter = null, minScore = 0 } = opts;
try {
const t0 = performance.now();
const embedResult = await this.#embedProvider.embed(query);
const tEmbed = performance.now();
const queryVector = Array.isArray(embedResult[0]) ? embedResult[0] : embedResult;
const results = await this.#vectorStore.searchVector(queryVector, {
topK,
filter,
minScore,
});
const tHnsw = performance.now();
this.#logger.info(`[VectorService] search: embed=${Math.round(tEmbed - t0)}ms hnsw=${Math.round(tHnsw - tEmbed)}ms total=${Math.round(tHnsw - t0)}ms results=${results.length}`);
return results;
}
catch (err) {
this.#logger.warn('[VectorService] search failed', {
error: err instanceof Error ? err.message : String(err),
});
return [];
}
}
/**
* 混合搜索 (Dense + Sparse RRF 融合)
* 通过 HybridRetriever 执行向量 + BM25 关键词并行检索
*
* Embed 失败时优雅降级: 跳过 Dense 路, 仅用 Sparse 结果进行 RRF 融合,
* 避免因网络问题导致整个搜索返回空结果。
*/
async hybridSearch(query, opts = {}) {
if (!this.#embedProvider) {
return [];
}
if (!this.#hybridRetriever) {
// 无 hybridRetriever 时降级为纯向量搜索
const results = await this.search(query, { topK: opts.topK });
return results.map((r) => ({
id: r.item.id || '',
score: r.score,
item: r.item,
}));
}
const { topK = 10, alpha = 0.5, sparseSearchFn = null } = opts;
// Embed query — circuit breaker skips embed after repeated failures
let queryVector = null;
const circuitOpen = Date.now() < this.#embedCircuitOpenUntil;
const tEmbedStart = performance.now();
if (circuitOpen) {
this.#logger.debug('[VectorService] embed circuit open, skipping embed');
}
else {
try {
const embedResult = await this.#embedProvider.embed(query);
queryVector = Array.isArray(embedResult[0])
? embedResult[0]
: embedResult;
this.#embedConsecutiveFailures = 0;
}
catch (err) {
this.#embedConsecutiveFailures++;
if (this.#embedConsecutiveFailures >= VectorService.#EMBED_CIRCUIT_THRESHOLD) {
this.#embedCircuitOpenUntil = Date.now() + VectorService.#EMBED_CIRCUIT_COOLDOWN_MS;
this.#logger.warn('[VectorService] embed circuit OPEN — skipping embed for 60s', {
consecutiveFailures: this.#embedConsecutiveFailures,
});
}
else {
this.#logger.warn('[VectorService] embed failed, degrading to sparse-only', {
error: err instanceof Error ? err.message : String(err),
failCount: this.#embedConsecutiveFailures,
});
}
}
}
const tEmbedEnd = performance.now();
try {
const fused = await this.#hybridRetriever.search(query, queryVector, {
topK,
alpha,
sparseSearchFn: sparseSearchFn ?? undefined,
});
const tFuseEnd = performance.now();
this.#logger.info(`[VectorService] hybridSearch: embed=${Math.round(tEmbedEnd - tEmbedStart)}ms fuse=${Math.round(tFuseEnd - tEmbedEnd)}ms total=${Math.round(tFuseEnd - tEmbedStart)}ms hasVector=${!!queryVector} results=${fused.length} alpha=${alpha}`);
return fused.map((r) => ({
id: r.id || '',
score: r.score || 0,
...r,
}));
}
catch (err) {
this.#logger.warn('[VectorService] hybridSearch failed', {
error: err instanceof Error ? err.message : String(err),
});
return [];
}
}
/** 通过 ID 查找相似向量 */
async similarById(id, topK = 10) {
try {
const existing = await this.#vectorStore.getById(id);
if (!existing) {
return [];
}
const vector = existing.vector;
if (!vector || vector.length === 0) {
return [];
}
const results = await this.#vectorStore.searchVector(vector, { topK: topK + 1 });
// 排除自身
return results.filter((r) => r.item.id !== id).slice(0, topK);
}
catch (err) {
this.#logger.warn('[VectorService] similarById failed', {
error: err instanceof Error ? err.message : String(err),
});
return [];
}
}
// ═══ 同步 ═══
/**
* 手动同步单个知识条目到向量索引
* 用于 KnowledgeService CRUD 后的即时同步
*/
async syncEntry(entry) {
if (!this.#embedProvider) {
return;
}
try {
const text = this.#extractText(entry);
if (!text) {
return;
}
const embedResult = await this.#embedProvider.embed(text);
const vector = Array.isArray(embedResult[0]) ? embedResult[0] : embedResult;
await this.#vectorStore.upsert({
id: `entry_${entry.id}`,
content: text,
vector: vector,
metadata: {
entryId: entry.id,
title: entry.title,
kind: entry.kind || 'unknown',
source: 'crud_sync',
updatedAt: Date.now(),
},
});
}
catch (err) {
this.#logger.warn('[VectorService] syncEntry failed', {
entryId: entry.id,
error: err instanceof Error ? err.message : String(err),
});
}
}
/** 从向量索引移除一个条目 */
async removeEntry(entryId) {
try {
await this.#vectorStore.remove(`entry_${entryId}`);
}
catch (err) {
this.#logger.warn('[VectorService] removeEntry failed', {
entryId,
error: err instanceof Error ? err.message : String(err),
});
}
}
/** 批量同步知识条目 */
async batchSync(entries) {
const result = { added: 0, updated: 0, removed: 0, errors: [] };
if (!this.#embedProvider || entries.length === 0) {
return result;
}
// 提取文本
const textsWithIds = [];
for (const entry of entries) {
const text = this.#extractText(entry);
if (text) {
textsWithIds.push({ id: entry.id, text, entry });
}
}
if (textsWithIds.length === 0) {
return result;
}
try {
// 批量 embed
const embedResult = await this.#embedProvider.embed(textsWithIds.map((t) => t.text));
const vectors = Array.isArray(embedResult[0])
? embedResult
: [embedResult];
// 批量 upsert
const batch = textsWithIds.map((t, i) => ({
id: `entry_${t.id}`,
content: t.text,
vector: vectors[i] || [],
metadata: {
entryId: t.id,
title: t.entry.title,
kind: t.entry.kind || 'unknown',
source: 'batch_sync',
updatedAt: Date.now(),
},
}));
await this.#vectorStore.batchUpsert(batch);
result.added = batch.length;
}
catch (err) {
result.errors.push(err instanceof Error ? err.message : String(err));
}
return result;
}
// ═══ 维护 ═══
/** 获取向量索引统计信息 */
async getStats() {
const raw = await this.#vectorStore.getStats();
const stats = raw;
return {
count: stats.count || 0,
dimension: stats.dimension || 0,
indexSize: stats.indexSize || 0,
quantized: stats.quantized || false,
embedProviderAvailable: !!this.#embedProvider,
autoSyncEnabled: this.#autoSyncOnCrud && !!this.#syncCoordinator,
};
}
/**
* 迁移维度: 清空索引并使用新的 EmbedProvider 重建
* 用于 embedding 模型切换场景
*/
async migrateDimension(newProvider, opts = {}) {
this.#logger.info('[VectorService] Starting dimension migration');
// 1. 清空现有索引
await this.clear();
opts.onProgress?.({ phase: 'migrate', detail: 'Old index cleared' });
// 2. 切换 provider
this.#embedProvider = newProvider;
this.#indexingPipeline.setAiProvider(newProvider);
opts.onProgress?.({ phase: 'migrate', detail: 'Provider switched' });
// 3. 全量重建
const result = await this.fullBuild({
force: true,
clear: false, // 已经清过了
onProgress: opts.onProgress,
});
this.#logger.info('[VectorService] Dimension migration complete', {
upserted: result.upserted,
duration: result.duration,
});
return result;
}
// ═══ 生命周期 ═══
/** 销毁: 清理 SyncCoordinator 的定时器和事件监听 */
destroy() {
if (this.#syncCoordinator) {
this.#syncCoordinator.destroy();
this.#syncCoordinator = null;
}
this.#initialized = false;
}
// ═══ Private ═══
/** 从知识条目中提取可嵌入的文本 */
#extractText(entry) {
const parts = [];
if (entry.title) {
parts.push(entry.title);
}
if (typeof entry.content === 'string') {
parts.push(entry.content);
}
else if (entry.content && typeof entry.content === 'object') {
// KnowledgeEntry content 可能是 { body, code, ... } 结构
const c = entry.content;
if (typeof c.body === 'string') {
parts.push(c.body);
}
if (typeof c.code === 'string') {
parts.push(c.code);
}
if (typeof c.description === 'string') {
parts.push(c.description);
}
}
return parts.join('\n\n');
}
}