UNPKG

autosnippet

Version:

Extract code patterns into a knowledge base for AI coding assistants

978 lines (977 loc) 40.5 kB
/** * SearchEngine - 统一搜索引擎 * * 三级搜索策略: keyword → FieldWeighted ranking → semantic(可选) * 从 V1 SearchServiceV2 迁移,适配 V2 架构 */ import Logger from '../../infrastructure/logging/Logger.js'; import { RawDbKnowledgeAdapter, RawDbSourceRefAdapter, unwrapSearchDb, } from '../../repository/search/SearchRepoAdapter.js'; import { CoarseRanker } from './CoarseRanker.js'; import { contextBoost } from './contextBoost.js'; import { FieldWeightedScorer } from './FieldWeightedScorer.js'; import { MultiSignalRanker } from './MultiSignalRanker.js'; // ── Re-exports for backward compatibility ── export { BM25Scorer } from './BM25Scorer.js'; export { FieldWeightedScorer } from './FieldWeightedScorer.js'; export { groupByKind, slimSearchResult } from './SearchTypes.js'; export { tokenize } from './tokenizer.js'; /** * SearchEngine - 完整搜索服务 * 整合召回评分 + 关键词 + 可选 AI 增强 */ export class SearchEngine { _cache; _cacheMaxAge; _coarseRanker; _crossEncoder; _fusionRecallWeight; _fusionSemanticWeight; _indexed; _lastIndexTime = null; _multiSignalRanker; _signalBus; aiProvider; db; hybridRetriever; #knowledgeRepo; #sourceRefRepo; logger; scorer; vectorService; vectorStore; constructor(db, options = {}) { this.db = unwrapSearchDb(db); const opts = options; this.#knowledgeRepo = opts.knowledgeRepo ?? new RawDbKnowledgeAdapter(this.db); this.#sourceRefRepo = opts.sourceRefRepo ?? new RawDbSourceRefAdapter(this.db); this.logger = Logger.getInstance(); this.aiProvider = options.aiProvider || null; this.vectorStore = options.vectorStore || null; this.vectorService = options.vectorService || null; this.hybridRetriever = options.hybridRetriever || null; this.scorer = new FieldWeightedScorer(); this._coarseRanker = new CoarseRanker(options); this._multiSignalRanker = new MultiSignalRanker(options); this._crossEncoder = options.crossEncoderReranker || null; this._indexed = false; this._cache = new Map(); this._cacheMaxAge = options.cacheMaxAge || 300_000; // 5min // auto 模式 召回+semantic 融合权重(可配置) this._fusionRecallWeight = options.fusionRecallWeight ?? 0.6; this._fusionSemanticWeight = options.fusionSemanticWeight ?? 0.4; this._signalBus = options.signalBus || null; } /** 构建搜索索引 - 从数据库加载所有可搜索实体 */ buildIndex() { this.scorer.clear(); this._cache.clear(); try { let entries = []; try { const rawEntries = this.#knowledgeRepo.findNonDeprecatedSync(); entries = rawEntries.map((e) => ({ ...e, status: e.lifecycle, })); } catch { /* table may not exist */ } for (const r of entries) { const text = this._buildDocText(r); const meta = this._buildDocMeta(r); meta.status = r.status; // buildIndex uses mapped status from lifecycle this.scorer.addDocument(r.id, text, meta); } this._indexed = true; this._lastIndexTime = new Date().toISOString(); this.logger.info('Search index built', { entries: entries.length, total: this.scorer.totalDocs, }); } catch (err) { this.logger.error('Failed to build search index', { error: err.message }); } } /** 确保索引已构建(幂等),supply 给需要准确 stats 的调用方 */ ensureIndex() { if (!this._indexed) { this.buildIndex(); } } /** * 统一搜索入口 * @param query 搜索关键词 * @param options {type, limit, mode, useAI} */ async search(query, options = {}) { const { type = 'all', limit = 20, mode = 'keyword', context } = options; const shouldRank = options.rank ?? mode !== 'keyword'; const tSearchStart = performance.now(); if (!query || !query.trim()) { return { items: [], total: 0, query }; } // 带 sessionHistory 的上下文搜索不缓存(个性化结果) const hasSessionContext = (context?.sessionHistory?.length ?? 0) > 0; const cacheKey = hasSessionContext ? null : `${query}:${type}:${limit}:${mode}:${shouldRank ? 'r' : ''}:${options.groupByKind ? 'g' : ''}`; if (cacheKey) { const cached = this._getCache(cacheKey); if (cached) { return cached; } } // 确保索引已构建 this.ensureIndex(); // 排序阶段需要更多候选,过采样 3x const recallLimit = shouldRank ? limit * 3 : limit; let results; let actualMode = mode; switch (mode) { case 'auto': { // ── Weighted-First + Confidence Gate ── // 先跑 weighted(~40ms),评估是否需要 embed(2-22s) const weightedItems = this._scorerSearch(query, type, recallLimit); const confidence = this.#computeWeightedConfidence(query, weightedItems, limit); if (confidence >= 60 || !this.vectorService) { // 高 confidence: weighted 已足够,跳过 embed results = weightedItems; actualMode = `auto(weighted-only,conf=${confidence})`; this.logger.info(`[QueryRouter] skip-semantic: conf=${confidence} topScore=${weightedItems[0]?.score ?? 0} query="${query}"`); break; } // 低 confidence: 投入 embed,RRF 融合 // 自适应 alpha:confidence 越低 → semantic 权重越高 // conf=0 → alpha=0.75, conf=30 → alpha=0.575, conf=55 → alpha=0.42 const adaptiveAlpha = this._fusionSemanticWeight + (0.75 - this._fusionSemanticWeight) * (1 - confidence / 60); this.logger.info(`[QueryRouter] invoke-semantic: conf=${confidence} alpha=${adaptiveAlpha.toFixed(2)} topScore=${weightedItems[0]?.score ?? 0} query="${query}"`); try { const rrfResults = await this.vectorService.hybridSearch(query, { topK: recallLimit, alpha: adaptiveAlpha, sparseSearchFn: () => weightedItems, }); if (rrfResults.length > 0) { results = rrfResults.map((r) => { const base = r.data?.item || r.data || {}; const baseMeta = (base.metadata || {}); return { id: r.id, title: (base.title || baseMeta.title || r.id), type: (base.type || 'recipe'), kind: (base.kind || baseMeta.kind || 'pattern'), status: (base.status || baseMeta.status || 'active'), score: Math.round(r.score * 1000) / 1000, content: base.content, description: base.description, }; }); this._supplementDetails(results); actualMode = `auto(rrf,conf=${confidence},α=${adaptiveAlpha.toFixed(2)})`; break; } } catch { // VectorService RRF 失败, 降级 } // 降级: embed 失败 → 返回已有的 weighted 结果 results = weightedItems; actualMode = `auto(weighted-fallback,conf=${confidence})`; break; } case 'weighted': case 'bm25': results = this._scorerSearch(query, type, recallLimit); break; case 'semantic': { const semResult = await this._semanticSearch(query, type, recallLimit); results = semResult.items; actualMode = semResult.actualMode || 'semantic'; break; } default: results = this._keywordSearch(query, type, limit); break; } // ── Ranking Pipeline ([CrossEncoder] → CoarseRanker → MultiSignalRanker → ContextBoost) ── if (shouldRank && results.length > 0) { results = await this._applyRanking(results, query, context); } results = results.slice(0, limit); const response = { items: results, total: results.length, query, mode: actualMode, type, ranked: shouldRank && results.length > 0, }; // ── 搜索计时日志 ── const tSearchEnd = performance.now(); this.logger.info(`Search completed: mode=${actualMode} total=${results.length} time=${Math.round(tSearchEnd - tSearchStart)}ms ranked=${response.ranked} query="${query}"`); if (options.groupByKind) { response.byKind = { rule: [], pattern: [], fact: [] }; for (const r of results) { const kind = r.kind || 'pattern'; const bucket = response.byKind[kind] ?? response.byKind.pattern; bucket.push(r); } } if (cacheKey) { this._setCache(cacheKey, response); } // ── Signal emission ── if (this._signalBus && response.total > 0) { this._signalBus.send('search', 'SearchEngine', Math.min(response.total / limit, 1), { metadata: { query, mode: actualMode, total: response.total }, }); } return response; } // ── Ranking Pipeline ──────────────────────────────────────────── /** * 统一排序管线: * 规范化 → [CrossEncoder 语义重排] → CoarseRanker (E-E-A-T 5维) * → MultiSignalRanker (6信号) → 上下文加成 * * CrossEncoder 仅在构造时传入 crossEncoderReranker 且 AI 可用时生效, * 否则自动跳过(零额外开销)。 */ async _applyRanking(items, query, context = {}) { let normalized = this._normalizeForRanking(items); // Optional: Cross-Encoder semantic rerank (AI → Jaccard fallback) if (this._crossEncoder) { normalized = (await this._crossEncoder.rerank(query, normalized)); } let ranked = this._coarseRanker.rank(normalized); ranked = this._multiSignalRanker.rank(ranked, { ...context, query, scenario: context?.intent || 'search', }); if ((context?.sessionHistory?.length ?? 0) > 0) { ranked = contextBoost(ranked, context); } return ranked.map((r) => ({ ...r, recallScore: r.recallScore || 0, score: r.contextScore || r.rankerScore || r.coarseScore || r.recallScore || 0, })); } /** * 将召回结果转换为 Ranker 所需格式(解析 content JSON、映射信号字段) * 保留原始 content 供下游消费者使用 */ _normalizeForRanking(items) { return items.map((item) => { let codeText = ''; if (item.content) { try { const parsed = typeof item.content === 'string' ? JSON.parse(item.content) : item.content; codeText = parsed.pattern || parsed.code || ''; } catch { /* ignore */ } } let tags = item.tags || []; if (typeof tags === 'string') { try { tags = JSON.parse(tags); } catch { tags = []; } } return { ...item, code: codeText || item.code || '', recallScore: item.score || 0, qualityScore: item.qualityScore || (item.status === 'active' ? 70 : 40), usageCount: item.usageCount || 0, authorityScore: item.authorityScore || 0, tags, difficulty: item.difficulty || 'intermediate', }; }); } /** * 关键词搜索 - 直接 SQL LIKE * 返回包含 kind 字段的完整结果,使用 ESCAPE 防止通配符注入 * 当 SQL LIKE 无结果时,降级到 FieldWeighted 搜索以提升自然语言查询的召回率 */ _keywordSearch(query, type, limit) { const results = []; // 转义 LIKE 通配符 (% → \%, _ → \_) const escaped = query.replace(/[%_\\]/g, (ch) => `\\${ch}`); const pattern = `%${escaped}%`; if (type === 'all' || type === 'recipe' || type === 'knowledge' || type === 'rule' || type === 'solution') { try { let rows = []; try { const rawRows = this.#knowledgeRepo.keywordSearchSync(pattern, limit); rows = rawRows.map((r) => ({ ...r, status: r.lifecycle ?? r.status, type: 'knowledge', })); } catch { /* table may not exist */ } // 基础相关性排序:trigger 精确 > 标题匹配 > 描述匹配 > 内容匹配 const lowerQ = query.toLowerCase(); results.push(...rows.map((r) => { let score = 0.5; if (r.trigger?.toLowerCase().includes(lowerQ)) { score = 1.2; } else if (r.title?.toLowerCase().includes(lowerQ)) { score = 1.0; } else if (r.description?.toLowerCase().includes(lowerQ)) { score = 0.8; } return { ...r, trigger: r.trigger || '', kind: r.kind || 'pattern', score: Math.round(score * 1000) / 1000, }; })); results.sort((a, b) => (b.score ?? 0) - (a.score ?? 0)); } catch { /* table may not exist */ } } // 补充排序信号字段(whenClause/doClause/tags 等),与 scorer/semantic 路径一致 this._supplementDetails(results); // 当 SQL LIKE 无结果时,降级到 FieldWeighted 搜索 // 这让自然语言查询(如 "如何处理网络错误")在 keyword 模式下也能返回结果 if (results.length === 0) { this.ensureIndex(); const scorerResults = this._scorerSearch(query, type, limit); return scorerResults; } return results.slice(0, limit); } /** * 加权字段搜索(FieldWeightedScorer) * 增加 Title/Trigger 精确匹配 bonus — 当 query 出现在标题/触发词中时 * 给予额外分数加成,确保精确匹配的条目排名靠前 */ _scorerSearch(query, type, limit) { let results = this.scorer.search(query, limit * 2); if (type !== 'all') { // All types now map to 'recipe' since everything is unified results = results.filter((r) => { if (type === 'rule') { return r.meta.knowledgeType === 'boundary-constraint'; } return r.meta.type === 'recipe'; }); } // ── Title/Trigger exact-match bonus ── // 当 query 精确出现在标题或触发词中时,增加分数 // 这解决了 "BaseRequest" 被 "BD前缀类名命名规范" 排在 "BDBaseRequest 继承请求模式" 前面的问题 const lowerQuery = query.toLowerCase(); const maxScore = results.length > 0 ? results[0].score : 1; for (const r of results) { const meta = r.meta; const title = (meta.title || '').toLowerCase(); const trigger = (meta.trigger || '').toLowerCase(); let bonus = 0; if (title === lowerQuery || trigger === lowerQuery) { // 完全匹配: +50% of max score bonus = maxScore * 0.5; } else if (title.includes(lowerQuery) || trigger.includes(lowerQuery)) { // 子串匹配: +30% of max score bonus = maxScore * 0.3; } else if (lowerQuery.includes(title) && title.length > 3) { // 反向包含 (query 包含 title): +15% of max score bonus = maxScore * 0.15; } r.score += bonus; } // 重新排序 results.sort((a, b) => b.score - a.score); const items = results.slice(0, limit).map((r) => { const meta = r.meta; return { id: r.id, title: meta.title, trigger: meta.trigger || '', type: meta.type, kind: meta.kind || 'pattern', status: meta.status, language: meta.language || '', category: meta.category || '', score: Math.round(r.score * 1000) / 1000, // 排序信号字段(供 CoarseRanker / MultiSignalRanker 使用) updatedAt: meta.updatedAt || null, createdAt: meta.createdAt || null, difficulty: meta.difficulty || 'intermediate', tags: meta.tags || [], usageCount: meta.usageCount || 0, authorityScore: meta.authorityScore || 0, qualityScore: meta.qualityScore || 0, }; }); // 为每个结果补充 content(预览需要)— 批量 IN 查询替代 N+1 this._supplementDetails(items); return items; } /** * 语义搜索 - 需要 AI Provider 的 embed 功能 * 不可用时降级到 FieldWeighted 搜索 * @returns >} */ async _semanticSearch(query, type, limit) { // 优先使用 VectorService (统一向量服务层) if (this.vectorService) { try { const vectorResults = await this.vectorService.search(query, { topK: limit * 2 }); if (vectorResults.length > 0) { let results = vectorResults.map((vr) => { const item = vr.item; const metadata = (item.metadata || {}); const rawId = item.id || ''; // 从 vector ID 提取 DB entryId: "entry_<uuid>" → "<uuid>" const entryId = metadata.entryId || rawId.replace(/^entry_/, ''); return { id: entryId, title: metadata.title || entryId, type: 'recipe', kind: metadata.kind || 'pattern', status: metadata.status || 'active', score: Math.round(vr.score * 1000) / 1000, }; }); // 按 entryId 去重 — 同一 Recipe 的多个 chunk 只保留最高分 results = this.#deduplicateByEntryId(results); if (type !== 'all') { results = results.filter((r) => { if (type === 'rule') { return r.kind === 'rule'; } return r.type === 'recipe'; }); } results = results.slice(0, limit); this._supplementDetails(results); return { items: results, actualMode: 'semantic' }; } } catch (err) { this.logger.warn('VectorService search failed, falling back to legacy path', { error: err.message, }); } } // Legacy fallback: 直接使用 aiProvider embed + vectorStore if (!this.aiProvider) { this.logger.debug('AI provider not available, falling back to FieldWeighted search'); return { items: this._scorerSearch(query, type, limit), actualMode: 'weighted' }; } try { const queryEmbedding = await this.aiProvider.embed(query); if (!queryEmbedding || queryEmbedding.length === 0) { return { items: this._scorerSearch(query, type, limit), actualMode: 'weighted' }; } if (this.vectorStore) { try { let vectorResults; if (typeof this.vectorStore.hybridSearch === 'function') { const hybrid = await this.vectorStore.hybridSearch(queryEmbedding, query, { topK: limit * 2, }); vectorResults = hybrid.map((r) => ({ id: r.item?.id ?? r.id, similarity: r.score, score: r.score, content: r.item?.content, metadata: r.item?.metadata || {}, })); } else { vectorResults = await this.vectorStore.query(queryEmbedding, limit * 2); } if (vectorResults && vectorResults.length > 0) { let results = vectorResults.map((vr) => { const rawId = vr.id || ''; const entryId = vr.metadata?.entryId || rawId.replace(/^entry_/, ''); return { id: entryId, title: vr.metadata?.title || entryId, type: 'recipe', kind: vr.metadata?.kind || 'pattern', status: vr.metadata?.status || 'active', score: Math.round((vr.similarity || vr.score || 0) * 1000) / 1000, }; }); // 按 entryId 去重 results = this.#deduplicateByEntryId(results); if (type !== 'all') { results = results.filter((r) => { if (type === 'rule') { return r.kind === 'rule'; } return r.type === 'recipe'; }); } results = results.slice(0, limit); this._supplementDetails(results); return { items: results, actualMode: 'semantic' }; } } catch (vecErr) { this.logger.warn('Vector store query failed, falling back to FieldWeighted', { error: vecErr.message, }); } } this.logger.debug('Vector search fallback to FieldWeighted'); return { items: this._scorerSearch(query, type, limit), actualMode: 'weighted' }; } catch (err) { this.logger.warn('Semantic search failed, falling back to FieldWeighted', { error: err.message, }); return { items: this._scorerSearch(query, type, limit), actualMode: 'weighted' }; } } /** * 按 entryId 去重 — 同一 Recipe 的多个 chunk 只保留最高分的 * 解决向量搜索返回同一条目的多个 chunk 浪费结果位的问题 */ #deduplicateByEntryId(items) { const seen = new Map(); for (const item of items) { const existing = seen.get(item.id); if (!existing || (item.score ?? 0) > (existing.score ?? 0)) { seen.set(item.id, item); } } return [...seen.values()].sort((a, b) => (b.score ?? 0) - (a.score ?? 0)); } /** * 评估 weighted 搜索结果的 confidence,决定是否需要语义搜索 * 返回 0-100 的分数,>= 60 跳过语义 */ #computeWeightedConfidence(query, items, requestedLimit) { let score = 0; // ── 结果质量信号 ── // FieldWeightedScorer 分数范围约 0-20,归一化后判断 const topScore = items[0]?.score ?? 0; const secondScore = items[1]?.score ?? 0; // top1 与 top2 分差大 → 明确命中 if (items.length >= 2 && topScore > 0) { const relativeGap = (topScore - secondScore) / topScore; if (relativeGap > 0.3) { score += 25; } else if (relativeGap > 0.15) { score += 15; } } // title/trigger 匹配(子串级别) const lq = query.toLowerCase(); const matchLevel = items.slice(0, 3).reduce((best, it) => { const t = (it.title || '').toLowerCase(); const tr = (it.trigger || '').toLowerCase(); if (t === lq || tr === lq || tr === `@${lq}`) { return Math.max(best, 3); // 完全匹配 } if (t.includes(lq) || tr.includes(lq)) { return Math.max(best, 2); // 子串匹配 } if (lq.includes(t) && t.length > 3) { return Math.max(best, 1); // 反向包含 } return best; }, 0); if (matchLevel === 3) { score += 50; } else if (matchLevel === 2) { score += 35; } else if (matchLevel === 1) { score += 15; } // 代码术语检测(CamelCase、snake_case、@trigger) if (/^[A-Z][a-zA-Z0-9]+$/.test(query) || /^[a-z]+(_[a-z]+)+$/.test(query) || query.startsWith('@')) { score += 25; } // 候选充足 if (items.length >= requestedLimit) { score += 10; } // ── 查询特征信号(降低 confidence → 倾向调用语义)── // 中文自然语言疑问句 if (/[如怎什为何哪]么?|是否|有没有|都有哪些|应该|需要/.test(query)) { score -= 40; } // 英文自然语言问句 if (/^(how|what|why|when|where|which|can|does|is|should)\b/i.test(query)) { score -= 40; } // 较长查询(可能是描述性语句) if (query.length > 20) { score -= 20; } else if (query.length > 10) { score -= 10; } return Math.max(0, Math.min(100, score)); } /** * 补充详细字段(content / description / trigger / delivery 字段)— 批量 IN 查询 * 用于向量搜索结果与 FieldWeighted 结果的一致性 */ _supplementDetails(items) { if (!items || items.length === 0) { return; } try { const ids = items.map((it) => it.id); let rows = []; try { rows = this.#knowledgeRepo.findByIdsDetailSync(ids); } catch { /* table may not exist */ } const rowMap = new Map(rows.map((r) => [r.id, r])); for (const item of items) { const row = rowMap.get(item.id); if (row) { item.content = item.content || row.content || undefined; item.description = item.description || row.description || ''; item.trigger = item.trigger || row.trigger || ''; if (row.headers) { item.headers = row.headers; } if (row.moduleName) { item.moduleName = row.moduleName; } // Cursor 交付字段 — 供 Agent 投影生成 actionHint if (!item.whenClause && row.whenClause) { item.whenClause = row.whenClause; } if (!item.doClause && row.doClause) { item.doClause = row.doClause; } // 排序信号补充 — 确保 Funnel/Ranker 有真实数据 if (!item.language && row.language) { item.language = row.language; } if (!item.category && row.category) { item.category = row.category; } if (!item.updatedAt && row.updatedAt) { item.updatedAt = row.updatedAt; } if (!item.createdAt && row.createdAt) { item.createdAt = row.createdAt; } if (!item.difficulty && row.difficulty) { item.difficulty = row.difficulty; } // 解析 tags if (!item.tags || (Array.isArray(item.tags) && item.tags.length === 0)) { try { item.tags = JSON.parse(row.tags || '[]'); } catch { /* ignore */ } } // 解析 quality JSON → qualityScore if (!item.qualityScore) { try { item.qualityScore = JSON.parse(row.quality || '{}').overall || 0; } catch { /* ignore */ } } // 解析 stats JSON → usageCount + authorityScore if (!item.usageCount) { try { const stats = JSON.parse(row.stats || '{}'); item.usageCount = (stats.adoptions || 0) + (stats.applications || 0) + (stats.searchHits || 0); if (!item.authorityScore) { item.authorityScore = stats.authority || 0; } } catch { /* ignore */ } } } } } catch { /* DB may not be available */ } // ── 从 recipe_source_refs 桥接表批量读取已验证的 sourceRefs ── try { const ids = items.map((it) => it.id); if (ids.length === 0) { return; } let refsRows; refsRows = this.#sourceRefRepo.findActiveByRecipeIds(ids); this.logger.debug('recipe_source_refs query', { idCount: ids.length, rowCount: refsRows.length, }); const refsMap = new Map(); for (const row of refsRows) { const recipeId = row.recipeId ?? row.recipe_id; const sourcePath = row.sourcePath ?? row.source_path; const status = row.status; const newPath = row.newPath ?? row.new_path; const refPath = status === 'renamed' && newPath ? newPath : sourcePath; if (!refsMap.has(recipeId)) { refsMap.set(recipeId, []); } refsMap.get(recipeId)?.push(refPath); } for (const item of items) { const refs = refsMap.get(item.id); if (refs && refs.length > 0) { item.sourceRefs = refs; } } } catch { /* recipe_source_refs table may not exist */ } } /** * 刷新索引(增量模式) * * 策略: * 1. 如果尚未构建索引 → 全量 buildIndex() * 2. 否则只加载 updatedAt > lastIndexTime 的条目 + 已删除(deprecated)条目 * - 新增/更新 → scorer.updateDocument() * - 已删除 → scorer.removeDocument() * 3. 清空缓存以确保搜索结果刷新 * * @param [opts] - force=true 强制全量重建 */ refreshIndex(opts = {}) { if (opts.force || !this._indexed || !this._lastIndexTime) { this._indexed = false; this.buildIndex(); return; } this._cache.clear(); try { // 查找自上次索引后更新的条目 const changed = this.#knowledgeRepo.findUpdatedSinceSync(this._lastIndexTime); let added = 0; let removed = 0; for (const r of changed) { if (r.lifecycle === 'deprecated') { // 已废弃 → 从索引中移除 if (this.scorer.removeDocument(r.id)) { removed++; } continue; } // 解析文档文本(复用 buildIndex 逻辑) const text = this._buildDocText(r); const meta = this._buildDocMeta(r); this.scorer.updateDocument(r.id, text, meta); added++; } this._lastIndexTime = new Date().toISOString(); if (added > 0 || removed > 0) { this.logger.info('Search index refreshed (incremental)', { added, removed }); } } catch (err) { // 增量失败 → 降级全量重建 this.logger.warn('Incremental refresh failed, falling back to full rebuild', { error: err.message, }); this._indexed = false; this.buildIndex(); } } /** * 从 DB 行构建索引文本 * * 高价值字段(title, trigger)通过重复出现提升 TF 权重 * — title ×3, trigger ×2, description ×1.5(通过重复 token 实现) * 这确保标题匹配的文档获得显著更高的分数 * 注:此逻辑主要服务于 BM25Scorer,FieldWeightedScorer 内部已有字段权重机制 */ _buildDocText(r) { let contentText = ''; try { const content = JSON.parse(r.content || '{}'); contentText = [content.pattern, content.rationale, content.markdown] .filter(Boolean) .join(' '); } catch { /* ignore */ } let tagText = ''; try { tagText = JSON.parse(r.tags || '[]').join(' '); } catch { /* ignore */ } // Field boosting via token repetition: // title ×2, trigger ×2, description ×1, others ×1 // 使用较温和的 boost 避免长文档 avgLength 膨胀导致 content 匹配被过度稀释 const title = r.title || ''; const trigger = r.trigger || ''; const desc = r.description || ''; const fields = [ title, title, // ×2 boost trigger, trigger, // ×2 boost desc, // ×1 (no boost — description already contributes naturally) r.language, r.category, r.knowledgeType, tagText, contentText, ]; return fields.filter(Boolean).join(' '); } /** * 从 DB 行构建文档 meta */ _buildDocMeta(r) { let parsedTags = []; try { parsedTags = JSON.parse(r.tags || '[]'); } catch { /* ignore */ } let usageCount = 0; let authorityScore = 0; try { const stats = JSON.parse(r.stats || '{}'); usageCount = (stats.adoptions || 0) + (stats.applications || 0) + (stats.searchHits || 0); authorityScore = stats.authority || 0; } catch { /* ignore */ } let qualityOverall = 0; try { qualityOverall = JSON.parse(r.quality || '{}').overall || 0; } catch { /* ignore */ } // 提取 description 和 contentText 供 FieldWeightedScorer 字段级评分使用 let contentText = ''; try { const content = JSON.parse(r.content || '{}'); contentText = [content.pattern, content.rationale, content.markdown] .filter(Boolean) .join(' '); } catch { /* ignore */ } return { type: 'knowledge', title: r.title, trigger: r.trigger || '', description: r.description || '', contentText, status: r.lifecycle, knowledgeType: r.knowledgeType, kind: r.kind || 'pattern', language: r.language || '', category: r.category || '', updatedAt: r.updatedAt || null, createdAt: r.createdAt || null, difficulty: r.difficulty || 'intermediate', tags: parsedTags, usageCount, authorityScore, qualityScore: qualityOverall, }; } /** 获取索引统计(如果尚未构建索引,自动触发构建) */ getStats() { return { indexed: this._indexed, totalDocuments: this.scorer.totalDocs, avgDocLength: Math.round(this.scorer.avgLength * 10) / 10, cacheSize: this._cache.size, uniqueTokens: Object.keys(this.scorer.docFreq).length, hasVectorStore: !!this.vectorStore, hasVectorService: !!this.vectorService, hasAiProvider: !!this.aiProvider, }; } _getCache(key) { const entry = this._cache.get(key); if (!entry) { return null; } if (Date.now() - entry.time > this._cacheMaxAge) { this._cache.delete(key); return null; } // LRU: 重新插入以更新 Map 迭代顺序,使热点 key 不被淘汰 this._cache.delete(key); this._cache.set(key, entry); return entry.data; } _setCache(key, data) { // LRU:超限时批量淘汰最旧的 20% if (this._cache.size > 500) { const toDelete = Math.floor(this._cache.size * 0.2); const keys = this._cache.keys(); for (let i = 0; i < toDelete; i++) { const k = keys.next().value; if (k !== undefined) { this._cache.delete(k); } } } this._cache.set(key, { data, time: Date.now() }); } } export default SearchEngine;