UNPKG

arela

Version:

AI-powered CTO with multi-agent orchestration, code summarization, and visual testing (web + mobile) for blazing-fast development.

124 lines 3.85 kB
import { RelevanceScorer } from "./scorer.js";

/**
 * Clamp a similarity threshold into the inclusive range [0, 1].
 * Shared by the constructor and setThreshold so both entry points agree.
 */
const clampThreshold = (value) => Math.max(0, Math.min(1, value));

/**
 * Sort key for an item's score. A score that does not convert to a number
 * (missing, undefined, non-numeric) ranks below every real score, which keeps
 * the sort comparator consistent instead of letting it return NaN.
 */
const scoreRank = (item) => {
  const numeric = Number(item.score);
  return Number.isNaN(numeric) ? Number.NEGATIVE_INFINITY : numeric;
};

/**
 * Comparator ordering items by score, highest first.
 * Equal ranks compare as 0 explicitly so that (-Infinity) - (-Infinity)
 * never produces NaN.
 */
const byScoreDescending = (a, b) => {
  const rankA = scoreRank(a);
  const rankB = scoreRank(b);
  return rankA === rankB ? 0 : rankB - rankA;
};

/**
 * SemanticDeduplicator - Removes semantically similar/duplicate items
 *
 * Two items are duplicates when the similarity reported by
 * RelevanceScorer.cosineSimilarity for their content strings is at or above
 * the configured threshold. When duplicates are found, the item with the
 * higher score is kept.
 */
export class SemanticDeduplicator {
  scorer;
  threshold;

  /**
   * @param {number} [threshold=0.85] Similarity at or above which two items
   *   count as duplicates. Clamped to [0, 1], matching setThreshold.
   */
  constructor(threshold = 0.85) {
    this.scorer = new RelevanceScorer();
    this.threshold = clampThreshold(threshold); // default: 85% similarity = duplicate
  }

  /**
   * Remove semantically similar items, keeping the highest-scoring item of
   * each duplicate group.
   *
   * @param {Array<{content?: unknown, score?: number}>} items Items to filter.
   *   The input array is not mutated.
   * @returns {Array} Surviving items ordered by score, highest first. Items
   *   without a usable numeric score come last, in their input order.
   */
  deduplicate(items) {
    if (items.length === 0) {
      return [];
    }
    // Highest scores first, so the first member of a duplicate group to be
    // accepted is also the best-scored one.
    const ranked = [...items].sort(byScoreDescending);
    const unique = [];
    for (const candidate of ranked) {
      if (!this.isDuplicate(candidate, unique)) {
        unique.push(candidate);
      }
    }
    return unique;
  }

  /**
   * Check whether an item duplicates any of the already accepted items.
   *
   * @param {object} item Candidate item.
   * @param {Array} uniqueItems Items accepted so far.
   * @returns {boolean} true when some accepted item reaches the threshold.
   *   Items with empty content never match, on either side of the comparison.
   */
  isDuplicate(item, uniqueItems) {
    const candidateText = this.getContent(item);
    if (!candidateText) {
      return false;
    }
    for (const accepted of uniqueItems) {
      const acceptedText = this.getContent(accepted);
      if (!acceptedText) {
        continue;
      }
      if (this.scorer.cosineSimilarity(candidateText, acceptedText) >= this.threshold) {
        return true;
      }
    }
    return false;
  }

  /**
   * Extract comparable text from an item.
   *
   * @param {object} item Item whose `content` property is read.
   * @returns {string} The content itself when it is a string, its JSON
   *   serialization when it is a non-null object, otherwise "".
   */
  getContent(item) {
    const { content } = item;
    if (typeof content === "string") {
      return content;
    }
    if (content && typeof content === "object") {
      return JSON.stringify(content);
    }
    return "";
  }

  /**
   * Set the deduplication threshold.
   *
   * @param {number} threshold New threshold; clamped to [0, 1].
   */
  setThreshold(threshold) {
    this.threshold = clampThreshold(threshold);
  }

  /**
   * Get the current threshold.
   *
   * @returns {number} The threshold currently in effect.
   */
  getThreshold() {
    return this.threshold;
  }

  /**
   * Find duplicate groups in items (for debugging/analysis).
   *
   * Each group is seeded by the earliest unprocessed item and collects every
   * later unprocessed item whose similarity to that seed reaches the
   * threshold. Comparison is against the seed only, not against other group
   * members.
   *
   * @param {Array} items Items to analyse, in any order.
   * @returns {Array<Array>} Groups containing at least two items each.
   */
  findDuplicateGroups(items) {
    const groups = [];
    const processed = new Set();
    for (let i = 0; i < items.length; i++) {
      if (processed.has(i)) {
        continue;
      }
      processed.add(i);
      const seedText = this.getContent(items[i]);
      if (!seedText) {
        // An item without content cannot seed a group.
        continue;
      }
      const group = [items[i]];
      for (let j = i + 1; j < items.length; j++) {
        if (processed.has(j)) {
          continue;
        }
        const otherText = this.getContent(items[j]);
        if (!otherText) {
          continue;
        }
        if (this.scorer.cosineSimilarity(seedText, otherText) >= this.threshold) {
          group.push(items[j]);
          processed.add(j);
        }
      }
      if (group.length > 1) {
        groups.push(group);
      }
    }
    return groups;
  }

  /**
   * Get deduplication statistics.
   *
   * @param {Array} original Items before deduplication.
   * @param {Array} deduplicated Items after deduplication.
   * @returns {{originalCount: number, deduplicatedCount: number,
   *   removedCount: number, deduplicationRate: number}} Counts plus the
   *   fraction removed; the rate is 0 when `original` is empty.
   */
  getStats(original, deduplicated) {
    const originalCount = original.length;
    const deduplicatedCount = deduplicated.length;
    const removedCount = originalCount - deduplicatedCount;
    const deduplicationRate = originalCount > 0 ? removedCount / originalCount : 0;
    return {
      originalCount,
      deduplicatedCount,
      removedCount,
      deduplicationRate,
    };
  }
}
//# sourceMappingURL=dedup.js.map