autosnippet
Version:
Extract code patterns into a knowledge base for AI coding assistants
301 lines (300 loc) • 12 kB
JavaScript
/**
* IndexingPipeline v2 — 索引管线
* scan → chunk (AST / section / fixed) → detect incremental changes (sourceHash) → batch embed → batch upsert
*
* v2 变更:
* - 集成 BatchEmbedder: 批量 embed 替代串行 per-chunk embed, ~50× 加速
* - 集成 Chunker v2: auto 策略自动选择 AST / section / fixed 分块
* - 新增 onProgress 回调支持
* - 新增 chunking 配置透传 (strategy, maxChunkTokens, overlapTokens, useAST)
*/
import { createHash } from 'node:crypto';
import { existsSync, readdirSync, readFileSync } from 'node:fs';
import { extname, join, relative } from 'node:path';
import { LanguageService } from '../../shared/LanguageService.js';
import { KNOWLEDGE_BASE_DIR } from '../config/Defaults.js';
import { BatchEmbedder } from './BatchEmbedder.js';
import { chunk } from './Chunker.js';
const SCANNABLE_EXTENSIONS = new Set([
'.md',
'.markdown',
'.txt',
'.swift',
'.m',
'.h',
'.js',
'.ts',
'.jsx',
'.tsx',
'.py',
'.java',
'.kt',
'.go',
'.rs',
'.rb',
]);
export class IndexingPipeline {
#vectorStore; // VectorStore 实例
#aiProvider; // AiProvider 实例 (可选, 用于 embedding)
#batchEmbedder; // BatchEmbedder 实例 (可选, 自动从 aiProvider 创建)
#scanDirs; // 要扫描的目录
#projectRoot;
#chunkingOptions; // Chunker v2 透传选项
#contextualEnricher; // 上下文增强器 (可选)
constructor(options = {}) {
this.#vectorStore = options.vectorStore || null;
this.#aiProvider = options.aiProvider || null;
this.#scanDirs = options.scanDirs || [
'recipes',
'candidates',
`${KNOWLEDGE_BASE_DIR}/recipes`,
`${KNOWLEDGE_BASE_DIR}/candidates`,
];
this.#projectRoot = options.projectRoot || process.cwd();
this.#chunkingOptions = {
strategy: options.chunking?.strategy ?? 'auto',
maxChunkTokens: options.chunking?.maxChunkTokens ?? 512,
overlapTokens: options.chunking?.overlapTokens ?? 50,
useAST: options.chunking?.useAST ?? true,
};
this.#contextualEnricher = options.contextualEnricher || null;
// 自动创建 BatchEmbedder (如果有 aiProvider)
if (this.#aiProvider) {
this.#batchEmbedder = new BatchEmbedder(this.#aiProvider, {
batchSize: options.batchSize ?? 32,
maxConcurrency: options.maxConcurrency ?? 2,
});
}
}
setVectorStore(store) {
this.#vectorStore = store;
}
setAiProvider(provider) {
this.#aiProvider = provider;
if (provider) {
this.#batchEmbedder = new BatchEmbedder(provider, {
batchSize: 32,
maxConcurrency: 2,
});
}
}
setContextualEnricher(enricher) {
this.#contextualEnricher = enricher;
}
/**
* 运行完整索引管线
* @param options { force: boolean, dryRun: boolean, onProgress: function }
* @returns >}
*/
async run(options = {}) {
const { force = false, dryRun = false, clear = false, onProgress } = options;
const stats = {
scanned: 0,
chunked: 0,
enriched: 0,
embedded: 0,
upserted: 0,
skipped: 0,
errors: 0,
};
if (!this.#vectorStore) {
throw new Error('VectorStore not set');
}
// 0. clear — 清空现有索引后重建
if (clear && !dryRun) {
await this.#vectorStore.clear();
onProgress?.({ phase: 'clear', detail: 'Existing index cleared' });
}
// 1. 扫描文件
const files = this.scan();
stats.scanned = files.length;
// 2. 增量检测 + 分块 (先收集所有 chunks)
const existingIds = new Set(await this.#vectorStore.listIds());
const allChunks = []; // { id, content, metadata }
const staleIds = []; // 需要清理的旧 chunk id
for (const file of files) {
try {
const content = readFileSync(file.absolutePath, 'utf-8');
const hash = this.hashContent(content);
const baseId = relative(this.#projectRoot, file.absolutePath).replace(/\//g, '_');
// 增量检测:hash 未变时跳过
if (!force) {
const existing = await this.#vectorStore.getById(`${baseId}_0`);
if (existing?.metadata?.sourceHash === hash) {
stats.skipped++;
continue;
}
}
// 分块 (使用 Chunker v2 - 支持 AST 策略)
const language = this.#detectLanguage(file.absolutePath);
const chunks = chunk(content, {
type: file.type,
sourcePath: file.relativePath,
sourceHash: hash,
language,
}, this.#chunkingOptions);
stats.chunked += chunks.length;
// 收集 chunks
for (let i = 0; i < chunks.length; i++) {
allChunks.push({
id: `${baseId}_${i}`,
content: chunks[i].content,
metadata: { ...chunks[i].metadata, chunkIndex: i },
});
}
// 标记需要清理的旧 chunk
for (const existId of existingIds) {
if (existId.startsWith(`${baseId}_`)) {
const idx = Number.parseInt(existId.split('_').pop(), 10);
if (idx >= chunks.length) {
staleIds.push(existId);
}
}
}
}
catch (_error) {
stats.errors++;
}
}
// 2.5. Contextual Enrichment (可选, 在 embed 之前)
if (this.#contextualEnricher && allChunks.length > 0) {
onProgress?.({ phase: 'enrich', detail: 'Running contextual enrichment...' });
// 按 sourcePath 分组,每个文档的 chunks 一起 enrich
const chunksBySource = new Map();
for (let i = 0; i < allChunks.length; i++) {
const sourcePath = allChunks[i].metadata.sourcePath || 'unknown';
if (!chunksBySource.has(sourcePath)) {
chunksBySource.set(sourcePath, []);
}
chunksBySource.get(sourcePath).push({ index: i, chunk: allChunks[i] });
}
for (const [sourcePath, group] of chunksBySource) {
try {
// 读取原始文档内容作为上下文
const firstChunk = group[0].chunk;
const docTitle = firstChunk.metadata.sourcePath || sourcePath;
const docKind = firstChunk.metadata.type || 'recipe';
// 拼接所有 chunk 作为文档摘要(enricher 内部会截断)
const docContent = group.map((g) => g.chunk.content).join('\n\n');
const enrichedChunks = await this.#contextualEnricher.enrichChunks({ title: docTitle, content: docContent, kind: docKind, sourcePath }, group.map((g) => ({
content: g.chunk.content,
metadata: g.chunk.metadata,
})));
// 回写 enriched 内容
for (let j = 0; j < enrichedChunks.length; j++) {
const originalIndex = group[j].index;
allChunks[originalIndex] = {
...allChunks[originalIndex],
content: enrichedChunks[j].content,
metadata: { ...allChunks[originalIndex].metadata, ...enrichedChunks[j].metadata },
};
if (enrichedChunks[j].metadata.contextEnriched) {
stats.enriched++;
}
}
}
catch {
// enrichment 失败不阻塞,使用原始 chunks
}
}
onProgress?.({ phase: 'enrich', detail: `Enriched ${stats.enriched} chunks` });
}
// 3. 批量 embed (使用 BatchEmbedder)
let vectorMap = new Map(); // id → vector
if (this.#batchEmbedder && allChunks.length > 0) {
try {
vectorMap = await this.#batchEmbedder.embedAll(allChunks.map((c) => ({ id: c.id, content: c.content })), (embedded, total) => {
stats.embedded = embedded;
onProgress?.({ phase: 'embed', embedded, total });
});
stats.embedded = vectorMap.size;
}
catch {
// embed 全部失败, 继续写入 (无向量)
}
}
// 4. 批量写入
if (!dryRun && allChunks.length > 0) {
const batch = allChunks.map((c) => ({
id: c.id,
content: c.content,
vector: vectorMap.get(c.id) || [],
metadata: c.metadata,
}));
await this.#vectorStore.batchUpsert(batch);
stats.upserted = batch.length;
onProgress?.({ phase: 'upsert', upserted: stats.upserted });
}
// 5. 清理旧 chunks
if (!dryRun) {
for (const staleId of staleIds) {
try {
await this.#vectorStore.remove(staleId);
}
catch {
/* skip cleanup errors */
}
}
}
return stats;
}
/**
* 扫描项目中的可索引文件
* @returns >}
*/
scan() {
const files = [];
for (const dir of this.#scanDirs) {
const absDir = join(this.#projectRoot, dir);
if (!existsSync(absDir)) {
continue;
}
this.#walkDir(absDir, files);
}
// 也扫描根目录的 README
const readmePath = join(this.#projectRoot, 'README.md');
if (existsSync(readmePath)) {
files.push({
absolutePath: readmePath,
relativePath: 'README.md',
type: 'readme',
});
}
return files;
}
/** 计算内容 hash */
hashContent(content) {
return createHash('sha256').update(content).digest('hex').slice(0, 16);
}
#walkDir(dir, files) {
try {
const entries = readdirSync(dir, { withFileTypes: true });
for (const entry of entries) {
const fullPath = join(dir, entry.name);
if (entry.isDirectory()) {
if (entry.name.startsWith('.') || entry.name === 'node_modules') {
continue;
}
this.#walkDir(fullPath, files);
}
else if (entry.isFile()) {
const ext = extname(entry.name).toLowerCase();
if (SCANNABLE_EXTENSIONS.has(ext)) {
files.push({
absolutePath: fullPath,
relativePath: relative(this.#projectRoot, fullPath),
type: ext === '.md' || ext === '.markdown' ? 'recipe' : 'code',
});
}
}
}
}
catch {
/* skip unreadable dirs */
}
}
#detectLanguage(filePath) {
const lang = LanguageService.inferLang(filePath);
return lang === 'unknown' ? 'text' : lang;
}
}