UNPKG

autosnippet

Version:

Extract code patterns into a knowledge base for AI coding assistants

147 lines (141 loc) 5.43 kB
/** * CrossEncoderReranker — AI 驱动的语义重排器 * * 替代 Jaccard 相似度,使用 LLM 对 (query, document) 对进行语义相关性评分。 * * 策略: * 1. 将候选文档与 query 组成 pairs,批量送入 AI 评分 * 2. AI 返回每个 pair 的 relevance score (0.0-1.0) * 3. 按 score 降序排列 * * 优化: * - 单次 API 调用批量评分(减少延迟和成本) * - 文档截断至 MAX_DOC_LEN 控制 token 消耗 * - 候选上限 MAX_CANDIDATES,超出部分保留原始顺序 * - AI 不可用时自动降级到 Jaccard */ import { jaccardSimilarity } from '../../shared/similarity.js'; import { tokenize } from './tokenizer.js'; const MAX_CANDIDATES = 40; // 超过此数量截断(控制 prompt 大小) const MAX_DOC_LEN = 300; // 每个文档最大字符数 export class CrossEncoderReranker { #aiProvider; #logger; constructor(opts = {}) { this.#aiProvider = opts.aiProvider || null; this.#logger = opts.logger || console; } /** * 对候选列表进行语义重排 * * @param query 用户查询 * @param candidates Layer 1 输出的候选列表 * @returns 附带 semanticScore 的候选列表(降序) */ async rerank(query, candidates) { if (!candidates || candidates.length === 0) { return []; } if (!query) { return candidates; } // 如果 AI Provider 不可用,降级到 Jaccard if (!this.#aiProvider || typeof this.#aiProvider.chatWithStructuredOutput !== 'function') { return this.#jaccardFallback(query, candidates); } // 截取前 MAX_CANDIDATES 个候选,剩余保持原始顺序 const head = candidates.slice(0, MAX_CANDIDATES); const tail = candidates.slice(MAX_CANDIDATES); try { const scored = await this.#batchScore(query, head); // tail 部分给一个递减的低分以保持稳定排序 const minScore = scored.length > 0 ? Math.min(...scored.map((s) => s.semanticScore || 0)) * 0.5 : 0; const tailScored = tail.map((c, i) => ({ ...c, semanticScore: Math.max(minScore - (i + 1) * 0.001, 0), })); return [...scored, ...tailScored]; } catch (err) { this.#logger.warn?.(`[CrossEncoderReranker] AI scoring failed, falling back to Jaccard: ${err.message}`); return this.#jaccardFallback(query, candidates); } } /** 批量 AI 评分 — 单次 chatWithStructuredOutput 调用 */ async #batchScore(query, candidates) { const pairs = candidates.map((c, i) => { const doc = this.#extractDocText(c); return `[${i}] ${doc.substring(0, MAX_DOC_LEN)}`; }); const prompt = `# Task Score the relevance of each document to the query. Return ONLY a JSON array. # Query ${query} # Documents ${pairs.join('\n')} # Output Format Return a JSON array of objects: [{"i": 0, "s": 0.85}, {"i": 1, "s": 0.3}, ...] - "i": document index (integer) - "s": relevance score (float 0.0-1.0, where 1.0 = perfectly relevant) Score guidelines: - 1.0: exact match or directly answers the query - 0.7-0.9: highly relevant, covers the main topic - 0.4-0.6: partially relevant, related topic - 0.1-0.3: tangentially related - 0.0: completely irrelevant Return ONLY a JSON array, no markdown or explanation.`; const result = await this.#aiProvider.chatWithStructuredOutput(prompt, { openChar: '[', closeChar: ']', temperature: 0.1, maxTokens: 2048, }); if (!Array.isArray(result)) { throw new Error('AI returned non-array result'); } // 构建 index → score 映射 const scoreMap = new Map(); for (const item of result) { const idx = item.i ?? item.index; const score = item.s ?? item.score ?? 0; if (typeof idx === 'number' && idx >= 0 && idx < candidates.length) { scoreMap.set(idx, Math.max(0, Math.min(1, score))); } } // 合并分数,未评分的给 0 return candidates .map((c, i) => ({ ...c, semanticScore: scoreMap.get(i) ?? 0, })) .sort((a, b) => b.semanticScore - a.semanticScore); } /** 从候选对象提取用于评分的文本表示 */ #extractDocText(candidate) { const parts = [ candidate.title, candidate.trigger, candidate.description || candidate.summary, candidate.code, candidate.content, ].filter(Boolean); return parts.join(' | '); } /** Jaccard 降级 — 当 AI 不可用时使用 */ #jaccardFallback(query, candidates) { const queryTokens = new Set(tokenize(query)); if (queryTokens.size === 0) { return candidates; } return candidates .map((candidate) => { const text = this.#extractDocText(candidate); const docTokens = new Set(tokenize(text)); const score = jaccardSimilarity(queryTokens, docTokens); return { ...candidate, semanticScore: score }; }) .sort((a, b) => b.semanticScore - a.semanticScore); } }