autosnippet
Version:
Extract code patterns into a knowledge base for AI coding assistants
150 lines (149 loc) • 5.63 kB
JavaScript
/**
* ContextualEnricher — 上下文增强管线
*
* 基于 Anthropic Contextual Retrieval 论文 (2024-09) 的实现。
* 对每个 chunk 生成 50-100 token 的上下文前缀,使其嵌入时保留文档层面的语义。
*
* 效果: retrieval failure rate 降低 35-67% (with reranking)
*
* 成本控制:
* - 使用轻量模型 (Haiku/Gemini Flash)
* - Prompt Caching: 同一文档不同 chunk 共享 system prompt 缓存
* - 增量模式: 只对新增/变更 chunk 做 enrichment
* - 配置开关: contextualEnrich = false 时完全跳过
*
* @module service/vector/ContextualEnricher
*/
import Logger from '../../infrastructure/logging/Logger.js';
// ── Enricher ──
/**
 * Prepends a short, document-aware context description to each chunk.
 *
 * For every chunk the AI provider is asked (with the document embedded in the
 * system prompt) for 1-2 sentences that situate the chunk within the document.
 * The answer is placed in square brackets in front of the chunk content.
 * Generated contexts are memoised in an in-memory Map for the lifetime of the
 * instance unless caching is disabled.
 */
export class ContextualEnricher {
  /** Maximum number of document characters embedded in the system prompt. */
  static #MAX_DOCUMENT_CHARS = 8000;
  /** Maximum number of characters kept from a generated context. */
  static #MAX_CONTEXT_CHARS = 500;
  /** `maxTokens` value passed to the AI provider for each context request. */
  static #MAX_CONTEXT_TOKENS = 120;

  #aiProvider;
  #cache;
  #cacheEnabled;
  #logger = Logger.getInstance();

  /**
   * @param {object} config
   * @param {object} config.aiProvider - Provider exposing `name` and an async
   *   `chat(userPrompt, options)` method that resolves to a string.
   * @param {boolean} [config.cacheEnabled] - Caching is on unless this is exactly `false`.
   */
  constructor(config) {
    this.#aiProvider = config.aiProvider;
    this.#cacheEnabled = config.cacheEnabled !== false;
    this.#cache = new Map();
  }

  /**
   * Generate context prefixes for a list of chunks.
   *
   * The whole document is sent as the system prompt and each chunk is sent as
   * a separate user message, one request at a time. A chunk whose enrichment
   * fails (or yields an empty context) is returned unchanged, so one failure
   * never aborts the batch.
   *
   * @param {object} document - Document info; reads `title`, `kind`, `content`
   *   and the optional `sourcePath`.
   * @param {Array<{content: string, metadata?: object}>} chunks - Chunked content.
   * @returns {Promise<Array>} One entry per input chunk, in input order. With a
   *   provider named `'mock'` the input array itself is returned.
   */
  async enrichChunks(document, chunks) {
    if (chunks.length === 0) {
      return [];
    }
    // Mock mode: skip AI enrichment entirely and hand the chunks back as-is.
    if (this.#aiProvider.name === 'mock') {
      return chunks;
    }

    const systemPrompt = this.#buildSystemPrompt(document);
    const cacheScope = document.sourcePath || document.title;
    const enriched = [];

    for (const chunk of chunks) {
      try {
        const context = await this.#resolveContext(systemPrompt, cacheScope, chunk.content);
        enriched.push(context ? this.#withContext(chunk, context) : chunk);
      } catch (err) {
        // A single failed chunk must not block the rest of the pipeline.
        this.#logger.warn('[ContextualEnricher] Failed to enrich chunk', {
          error: err instanceof Error ? err.message : String(err),
        });
        enriched.push(chunk);
      }
    }
    return enriched;
  }

  /** Remove every cached context. */
  clearCache() {
    this.#cache.clear();
  }

  /** Number of contexts currently cached. */
  get cacheSize() {
    return this.#cache.size;
  }

  // ═══ Private ═══

  /**
   * Return the context for a chunk, from the cache when possible.
   * Only non-empty contexts are stored.
   */
  async #resolveContext(systemPrompt, cacheScope, chunkContent) {
    if (!this.#cacheEnabled) {
      return this.#generateContext(systemPrompt, chunkContent);
    }
    const cacheKey = this.#computeCacheKey(cacheScope, chunkContent);
    if (this.#cache.has(cacheKey)) {
      return this.#cache.get(cacheKey);
    }
    const context = await this.#generateContext(systemPrompt, chunkContent);
    if (context) {
      this.#cache.set(cacheKey, context);
    }
    return context;
  }

  /** Build the enriched copy of a chunk; the input chunk is not mutated. */
  #withContext(chunk, context) {
    return {
      content: `[${context}]\n\n${chunk.content}`,
      metadata: {
        ...chunk.metadata,
        contextEnriched: true,
        contextLength: context.length,
      },
    };
  }

  /**
   * Build the system prompt: the (possibly truncated) document wrapped in a
   * `<document>` element, followed by the instructions for the model.
   */
  #buildSystemPrompt(document) {
    // Truncate overly long documents so the prompt stays bounded.
    const fullText = document.content;
    const kept = this.#truncate(fullText, ContextualEnricher.#MAX_DOCUMENT_CHARS);
    const docContent = kept.length < fullText.length
      ? `${kept}\n\n[... document truncated ...]`
      : fullText;
    // Both attribute values are escaped so quotes or angle brackets in the
    // title/kind cannot break out of the opening tag.
    const title = this.#escapeXml(document.title);
    const kind = this.#escapeXml(document.kind);
    return [
      `<document title="${title}" kind="${kind}">`,
      docContent,
      '</document>',
      '',
      'Given the above document, provide 1-2 sentences of context that situate the following chunk within the document.',
      'Focus on: what topic/function/section this chunk belongs to, and any key entities or concepts referenced.',
      'Answer ONLY with the context sentences, nothing else.',
    ].join('\n');
  }

  /**
   * Ask the provider for the context of one chunk and normalise the answer:
   * trimmed, one pair of wrapping double quotes removed, length-limited.
   *
   * @throws {TypeError} When the provider does not resolve to a string.
   */
  async #generateContext(systemPrompt, chunkContent) {
    const userPrompt = `<chunk>\n${chunkContent}\n</chunk>`;
    const response = await this.#aiProvider.chat(userPrompt, {
      system: systemPrompt,
      maxTokens: ContextualEnricher.#MAX_CONTEXT_TOKENS,
      temperature: 0,
    });
    if (typeof response !== 'string') {
      throw new TypeError(`AI provider returned a non-string response (${typeof response})`);
    }
    let cleaned = response.trim();
    if (cleaned.startsWith('"') && cleaned.endsWith('"')) {
      cleaned = cleaned.slice(1, -1).trim();
    }
    // Returned value is fully trimmed so the reported length matches the
    // text that is actually embedded in front of the chunk.
    return this.#truncate(cleaned, ContextualEnricher.#MAX_CONTEXT_CHARS).trimEnd();
  }

  /**
   * Derive a cache key from the document identifier and the full chunk text.
   *
   * Two independent 32-bit hashes plus the content length are combined so
   * that chunks sharing a long common prefix do not map to the same entry.
   * Not cryptographic. The document body is not part of the key.
   */
  #computeCacheKey(sourcePath, content) {
    const input = `${sourcePath}::${content}`;
    let rolling = 0;
    let fnv = 0x811c9dc5;
    for (let i = 0; i < input.length; i++) {
      const unit = input.charCodeAt(i);
      rolling = ((rolling << 5) - rolling + unit) | 0;
      fnv = Math.imul(fnv ^ unit, 0x01000193);
    }
    const parts = [rolling >>> 0, fnv >>> 0, content.length].map((n) => n.toString(36));
    return `ctx_${parts.join('_')}`;
  }

  /**
   * Cut `text` to at most `maxLen` UTF-16 units without leaving a dangling
   * high surrogate at the end.
   */
  #truncate(text, maxLen) {
    if (text.length <= maxLen) {
      return text;
    }
    const lastUnit = text.charCodeAt(maxLen - 1);
    const splitsPair = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
    return text.slice(0, splitsPair ? maxLen - 1 : maxLen);
  }

  /** Escape a value for use inside a double-quoted XML attribute. */
  #escapeXml(value) {
    return String(value ?? '')
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;');
  }
}