UNPKG

autosnippet

Version:

Extract code patterns into a knowledge base for AI coding assistants

204 lines (203 loc) 7.8 kB
/**
 * RedundancyAnalyzer — multi-dimensional redundancy detection.
 *
 * Extends the title-only Jaccard check of CandidateAggregator to four
 * content-level similarity dimensions:
 *   1. title  — Jaccard similarity of the title topic words
 *   2. clause — Jaccard similarity of the doClause + dontClause topic words
 *   3. code   — character-level similarity of coreCode with whitespace removed
 *   4. guard  — 1 when both guard patterns are present and identical, else 0
 *
 * Combined score: 0.2*title + 0.3*clause + 0.3*code + 0.2*guard. A pair is
 * reported as redundant when the combined score is >= 0.65. Only the combined
 * score is thresholded; the individual dimensions are not gated on their own.
 */
import { CONSUMABLE_LIFECYCLES } from '../../domain/knowledge/Lifecycle.js';
import Logger from '../../infrastructure/logging/Logger.js';
import { ContradictionDetector } from './ContradictionDetector.js';

/* ────────────────────── Constants ────────────────────── */

/** Weight of each dimension in the combined similarity score (sums to 1). */
const WEIGHTS = { title: 0.2, clause: 0.3, code: 0.3, guard: 0.2 };

/** Minimum combined score for a pair to be reported as redundant. */
const REDUNDANCY_THRESHOLD = 0.65;

/** Longest whitespace-stripped code length still compared with the O(n*m) LCS. */
const LCS_MAX_LENGTH = 2000;

/** n-gram size used for code longer than LCS_MAX_LENGTH. */
const NGRAM_SIZE = 4;

/* ────────────────────── Class ────────────────────── */

export class RedundancyAnalyzer {
  #knowledgeRepo;
  #signalBus;
  #reportStore;
  #logger = Logger.getInstance();

  /**
   * @param {object} knowledgeRepo - repository exposing `findAllByLifecycles(lifecycles)`
   * @param {object} [options]
   * @param {object} [options.signalBus] - optional bus; `send(...)` is called once per run that finds pairs
   * @param {object} [options.reportStore] - optional store; `write(...)` is called once per redundant pair
   */
  constructor(knowledgeRepo, options = {}) {
    this.#knowledgeRepo = knowledgeRepo;
    this.#signalBus = options.signalBus ?? null;
    this.#reportStore = options.reportStore ?? null;
  }

  /**
   * Compare every pair of loaded recipes and collect the redundant ones.
   *
   * Side effects when at least one pair is found: one report is written per
   * pair (if a report store was supplied) and one signal is sent (if a signal
   * bus was supplied).
   *
   * @returns {Promise<Array<object>>} the non-null results of `analyzePair`
   */
  async analyzeAll() {
    const recipes = await this.#loadRecipes();
    const results = [];

    for (let i = 0; i < recipes.length; i++) {
      for (let j = i + 1; j < recipes.length; j++) {
        const result = this.analyzePair(recipes[i], recipes[j]);
        if (result) {
          results.push(result);
        }
      }
    }

    if (this.#reportStore && results.length > 0) {
      for (const r of results) {
        this.#writeReport(r);
      }
    }

    if (this.#signalBus && results.length > 0) {
      this.#signalBus.send('lifecycle', 'RedundancyAnalyzer', 1, {
        metadata: { redundantPairCount: results.length },
      });
    }

    this.#logger.debug(`RedundancyAnalyzer: found ${results.length} redundant pairs`);
    return results;
  }

  /**
   * Score the redundancy between two recipes.
   *
   * @param {object} a - recipe with `id`, `title`, `doClause`, `dontClause`, `coreCode`, `guardPattern`
   * @param {object} b - recipe with the same fields as `a`
   * @returns {object|null} `{ recipeA, recipeB, similarity, dimensions }` with scores
   *   rounded to two decimals, or `null` when the combined score is below the threshold
   */
  analyzePair(a, b) {
    const d1 = RedundancyAnalyzer.#titleJaccard(a.title, b.title);
    const d2 = this.#clauseSimilarity(a, b);
    const d3 = RedundancyAnalyzer.#codeSimilarity(a.coreCode, b.coreCode);
    // The guard dimension is all-or-nothing: both patterns must exist and match exactly.
    const d4 = a.guardPattern && b.guardPattern && a.guardPattern === b.guardPattern ? 1.0 : 0;

    const similarity =
      WEIGHTS.title * d1 + WEIGHTS.clause * d2 + WEIGHTS.code * d3 + WEIGHTS.guard * d4;

    // The threshold is applied to the unrounded score.
    if (similarity < REDUNDANCY_THRESHOLD) {
      return null;
    }

    return {
      recipeA: a.id,
      recipeB: b.id,
      similarity: RedundancyAnalyzer.#round2(similarity),
      dimensions: {
        title: RedundancyAnalyzer.#round2(d1),
        clause: RedundancyAnalyzer.#round2(d2),
        code: RedundancyAnalyzer.#round2(d3),
        guard: d4,
      },
    };
  }

  /* ── Internal ── */

  /**
   * Load the entries in the consumable lifecycles and project them onto the
   * fields the analyzer reads. Loading is best-effort: on failure the error is
   * logged and an empty list is returned so the analysis yields no pairs.
   */
  async #loadRecipes() {
    try {
      const entries = await this.#knowledgeRepo.findAllByLifecycles(CONSUMABLE_LIFECYCLES);
      return entries.map((e) => ({
        id: e.id,
        title: e.title,
        doClause: e.doClause || null,
        dontClause: e.dontClause || null,
        coreCode: e.coreCode || null,
        guardPattern: e.content?.pattern || null,
      }));
    } catch (err) {
      this.#logger.debug(
        `RedundancyAnalyzer: failed to load recipes: ${err?.message ?? err}`,
      );
      return [];
    }
  }

  /**
   * Write one redundancy report without awaiting it. The result is wrapped in
   * `Promise.resolve` so that an asynchronous failure is logged instead of
   * surfacing as an unhandled rejection; a non-promise return value is fine too.
   */
  #writeReport(r) {
    const pending = this.#reportStore.write({
      category: 'analysis',
      type: 'redundancy_report',
      producer: 'RedundancyAnalyzer',
      data: {
        recipeA: r.recipeA,
        redundantWith: r.recipeB,
        dimensions: r.dimensions,
        similarity: r.similarity,
      },
      timestamp: Date.now(),
    });
    Promise.resolve(pending).catch((err) => {
      this.#logger.debug(
        `RedundancyAnalyzer: failed to write redundancy report: ${err?.message ?? err}`,
      );
    });
  }

  /** Round to two decimal places. */
  static #round2(value) {
    return Math.round(value * 100) / 100;
  }

  /**
   * Jaccard similarity |A ∩ B| / |A ∪ B| of two Sets.
   * Returns 0 when both sets are empty.
   */
  static #jaccard(setA, setB) {
    let intersection = 0;
    for (const item of setA) {
      if (setB.has(item)) {
        intersection++;
      }
    }
    const union = setA.size + setB.size - intersection;
    return union === 0 ? 0 : intersection / union;
  }

  /** Dimension 1: Jaccard similarity of the topic words of the two titles. */
  static #titleJaccard(titleA, titleB) {
    const wordsA = ContradictionDetector.extractTopicWords(titleA);
    const wordsB = ContradictionDetector.extractTopicWords(titleB);
    return RedundancyAnalyzer.#jaccard(wordsA, wordsB);
  }

  /**
   * Dimension 2: Jaccard similarity of the topic words of doClause + dontClause.
   * Returns 0 when either recipe has no clause text at all.
   */
  #clauseSimilarity(a, b) {
    const textA = [a.doClause, a.dontClause].filter(Boolean).join(' ');
    const textB = [b.doClause, b.dontClause].filter(Boolean).join(' ');
    if (!textA || !textB) {
      return 0;
    }
    const wordsA = ContradictionDetector.extractTopicWords(textA);
    const wordsB = ContradictionDetector.extractTopicWords(textB);
    return RedundancyAnalyzer.#jaccard(wordsA, wordsB);
  }

  /**
   * Dimension 3: character-level similarity of coreCode after removing all
   * whitespace. Returns 0 when either side is missing.
   *
   * Short code is scored as 2 * LCS / (|a| + |b|), where LCS is the longest
   * common subsequence length. When the longer side exceeds LCS_MAX_LENGTH
   * characters, the n-gram Jaccard similarity is used instead to avoid the
   * O(n*m) cost of the LCS table.
   */
  static #codeSimilarity(codeA, codeB) {
    if (!codeA || !codeB) {
      return 0;
    }
    const a = codeA.replace(/\s+/g, '');
    const b = codeB.replace(/\s+/g, '');
    if (a.length === 0 && b.length === 0) {
      return 0;
    }

    const maxLen = Math.max(a.length, b.length);
    if (maxLen > LCS_MAX_LENGTH) {
      return RedundancyAnalyzer.#ngramSimilarity(a, b, NGRAM_SIZE);
    }

    const lcsLen = RedundancyAnalyzer.#lcsLength(a, b);
    return (2 * lcsLen) / (a.length + b.length);
  }

  /**
   * Longest common subsequence length, O(n*m) time with two rows of storage.
   *
   * The rows are Uint16Array: cell values never exceed min(|a|, |b|), and the
   * only caller passes strings of at most LCS_MAX_LENGTH characters, so they
   * stay well inside the 16-bit range.
   */
  static #lcsLength(a, b) {
    const m = a.length;
    const n = b.length;
    let prev = new Uint16Array(n + 1);
    let curr = new Uint16Array(n + 1);

    for (let i = 1; i <= m; i++) {
      for (let j = 1; j <= n; j++) {
        if (a[i - 1] === b[j - 1]) {
          curr[j] = prev[j - 1] + 1;
        } else {
          curr[j] = Math.max(prev[j], curr[j - 1]);
        }
      }
      // The row just computed becomes `prev`; the old row is cleared for reuse.
      [prev, curr] = [curr, prev];
      curr.fill(0);
    }
    return prev[n];
  }

  /** Jaccard similarity of the sets of distinct n-grams of two strings (used for long code). */
  static #ngramSimilarity(a, b, n) {
    const ngramsA = RedundancyAnalyzer.#ngrams(a, n);
    const ngramsB = RedundancyAnalyzer.#ngrams(b, n);
    return RedundancyAnalyzer.#jaccard(ngramsA, ngramsB);
  }

  /** Set of all distinct substrings of length `n`; empty when the text is shorter than `n`. */
  static #ngrams(text, n) {
    const grams = new Set();
    for (let i = 0; i <= text.length - n; i++) {
      grams.add(text.slice(i, i + n));
    }
    return grams;
  }
}