// arela
// Version:
// AI-powered CTO with multi-agent orchestration, code summarization, visual testing (web + mobile) for blazing fast development.
// 124 lines • 3.85 kB
// JavaScript
import { RelevanceScorer } from "./scorer.js";
/**
 * SemanticDeduplicator - removes semantically similar (near-duplicate) items.
 *
 * Two items count as duplicates when the similarity of their content, as
 * reported by `scorer.cosineSimilarity`, is at or above the configured
 * threshold. From each group of duplicates the highest-scoring item is kept.
 */
export class SemanticDeduplicator {
  scorer;
  threshold;

  /**
   * @param {number} [threshold=0.85] - Similarity at or above which two items
   *   are treated as duplicates. Clamped to [0, 1], exactly like
   *   `setThreshold`.
   */
  constructor(threshold = 0.85) {
    this.scorer = new RelevanceScorer();
    this.threshold = SemanticDeduplicator.#clampThreshold(threshold);
  }

  /**
   * Restrict a threshold to the [0, 1] range.
   *
   * @param {number} value - Requested threshold.
   * @returns {number} The value limited to [0, 1].
   */
  static #clampThreshold(value) {
    return Math.max(0, Math.min(1, value));
  }

  /**
   * Numeric score used to order items before deduplication.
   *
   * A score that does not coerce to a number (e.g. it is missing) ranks
   * lowest, so it can never displace an item that has a real score.
   *
   * @param {{score?: *}} item - Item to rank.
   * @returns {number} The coerced score, or -Infinity when it is not numeric.
   */
  static #rankingScore(item) {
    const value = Number(item.score);
    return Number.isNaN(value) ? -Infinity : value;
  }

  /**
   * Remove semantically similar items, keeping the highest-scoring item of
   * each duplicate group.
   *
   * The input array is not modified. Items whose content is empty are never
   * treated as duplicates and are always kept.
   *
   * @param {Array<{content?: *, score?: *}>} items - Items to deduplicate.
   * @returns {Array} The surviving items, ordered by descending score.
   */
  deduplicate(items) {
    if (items.length === 0) {
      return [];
    }

    // Highest score first, so the first member of a duplicate group we meet is
    // the one to keep. Comparing (rather than subtracting) keeps the comparator
    // consistent even for missing or infinite scores, where subtraction would
    // produce NaN and leave the sort order implementation-defined.
    const sorted = [...items].sort((a, b) => {
      const scoreA = SemanticDeduplicator.#rankingScore(a);
      const scoreB = SemanticDeduplicator.#rankingScore(b);
      if (scoreA === scoreB) {
        return 0;
      }
      return scoreB > scoreA ? 1 : -1;
    });

    const unique = [];
    for (const item of sorted) {
      if (!this.isDuplicate(item, unique)) {
        unique.push(item);
      }
    }
    return unique;
  }

  /**
   * Check whether an item duplicates any of the already accepted items.
   *
   * @param {{content?: *}} item - Candidate item.
   * @param {Array<{content?: *}>} uniqueItems - Items accepted so far.
   * @returns {boolean} True when the similarity to at least one accepted item
   *   reaches the threshold; false when the candidate has no content.
   */
  isDuplicate(item, uniqueItems) {
    const content1 = this.getContent(item);
    if (!content1) {
      return false;
    }
    for (const existing of uniqueItems) {
      const content2 = this.getContent(existing);
      if (!content2) {
        continue;
      }
      const similarity = this.scorer.cosineSimilarity(content1, content2);
      if (similarity >= this.threshold) {
        return true;
      }
    }
    return false;
  }

  /**
   * Extract comparable text from an item.
   *
   * @param {{content?: *}} item - Item to read.
   * @returns {string} The content itself when it is a string, its JSON
   *   serialization when it is a non-null object, otherwise an empty string.
   */
  getContent(item) {
    if (typeof item.content === "string") {
      return item.content;
    }
    if (item.content && typeof item.content === "object") {
      return JSON.stringify(item.content);
    }
    return "";
  }

  /**
   * Set the deduplication threshold.
   *
   * @param {number} threshold - New threshold; clamped to [0, 1].
   */
  setThreshold(threshold) {
    this.threshold = SemanticDeduplicator.#clampThreshold(threshold);
  }

  /**
   * Get the current threshold.
   *
   * @returns {number} The threshold in use.
   */
  getThreshold() {
    return this.threshold;
  }

  /**
   * Find groups of mutually similar items (for debugging/analysis).
   *
   * Items are visited in input order. Each not-yet-grouped item seeds a group
   * and collects every later, not-yet-grouped item whose similarity to the
   * seed reaches the threshold. Only groups with more than one member are
   * reported; items with empty content are never grouped.
   *
   * @param {Array<{content?: *}>} items - Items to analyse.
   * @returns {Array<Array>} Duplicate groups, each starting with its seed item.
   */
  findDuplicateGroups(items) {
    const groups = [];
    const processed = new Set();
    for (let i = 0; i < items.length; i++) {
      if (processed.has(i)) {
        continue;
      }
      const group = [items[i]];
      processed.add(i);
      const content1 = this.getContent(items[i]);
      if (!content1) {
        continue;
      }
      // Collect every later item that is similar enough to the seed.
      for (let j = i + 1; j < items.length; j++) {
        if (processed.has(j)) {
          continue;
        }
        const content2 = this.getContent(items[j]);
        if (!content2) {
          continue;
        }
        const similarity = this.scorer.cosineSimilarity(content1, content2);
        if (similarity >= this.threshold) {
          group.push(items[j]);
          processed.add(j);
        }
      }
      if (group.length > 1) {
        groups.push(group);
      }
    }
    return groups;
  }

  /**
   * Compute deduplication statistics.
   *
   * @param {Array} original - Items before deduplication.
   * @param {Array} deduplicated - Items after deduplication.
   * @returns {{originalCount: number, deduplicatedCount: number,
   *   removedCount: number, deduplicationRate: number}} Counts, plus the
   *   fraction of items removed (0 when `original` is empty).
   */
  getStats(original, deduplicated) {
    const originalCount = original.length;
    const deduplicatedCount = deduplicated.length;
    const removedCount = originalCount - deduplicatedCount;
    const deduplicationRate = originalCount > 0 ? removedCount / originalCount : 0;
    return {
      originalCount,
      deduplicatedCount,
      removedCount,
      deduplicationRate,
    };
  }
}
//# sourceMappingURL=dedup.js.map