UNPKG

double-context

Version:

Intelligently optimize and compress context for LLM prompts to greatly increase effective context window

52 lines 2.11 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.deduplicate = deduplicate;
const embeddings_1 = require("./embeddings");
/**
 * Remove duplicate chunks in two phases: exact-text matching, then optional
 * embedding-based similarity matching.
 *
 * Phase 1 compares chunks after trimming and lower-casing. The first
 * occurrence of each normalized text is kept, in its original (un-normalized)
 * form and original order.
 *
 * Phase 2 runs only when an `embeddingProvider` is supplied and more than one
 * chunk survived phase 1. A chunk is dropped when its similarity to any
 * already-kept chunk is `>= semanticThreshold`. If anything in phase 2 throws
 * (or the provider returns the wrong number of embeddings), a warning is
 * logged and the phase 1 result is returned instead.
 *
 * @param {string[]} chunks - Text chunks to deduplicate. Each element must
 *   support `.trim()`; a falsy or empty list yields `[]`.
 * @param {{ getEmbeddings: Function }} [embeddingProvider] - Object whose
 *   `getEmbeddings(chunks)` is awaited and indexed by chunk position.
 *   Presumably resolves to one numeric vector per chunk, in order; confirm
 *   against the provider implementation.
 * @param {number} [semanticThreshold=0.9] - Minimum similarity score at which
 *   two chunks are treated as duplicates.
 * @returns {Promise<string[]>} The chunks that survived deduplication.
 */
async function deduplicate(chunks, embeddingProvider, semanticThreshold = 0.9) {
    if (!chunks || chunks.length === 0) {
        return [];
    }
    // Phase 1: Simple text-based deduplication
    const seenNormalized = new Set();
    const textDedupedChunks = [];
    for (const chunk of chunks) {
        const normalizedChunk = chunk.trim().toLowerCase();
        if (!seenNormalized.has(normalizedChunk)) {
            seenNormalized.add(normalizedChunk);
            textDedupedChunks.push(chunk);
        }
    }
    // Phase 2: Semantic deduplication with embeddings
    if (!embeddingProvider || textDedupedChunks.length <= 1) {
        return textDedupedChunks;
    }
    try {
        // Generate embeddings for all chunks
        const embeddings = await embeddingProvider.getEmbeddings(textDedupedChunks);
        // The loop below pairs embeddings[i] with textDedupedChunks[i]; a
        // missing or short result would compare against undefined vectors, so
        // fail fast here and let the catch block fall back to phase 1.
        if (embeddings == null || embeddings.length !== textDedupedChunks.length) {
            const received = embeddings == null ? "none" : embeddings.length;
            throw new Error(`Expected ${textDedupedChunks.length} embeddings but received ${received}`);
        }
        const semanticUnique = [];
        const keptEmbeddings = [];
        for (let i = 0; i < textDedupedChunks.length; i++) {
            const currentEmbedding = embeddings[i];
            // Check similarity against every chunk kept so far; `some` stops
            // at the first match.
            const isDuplicate = keptEmbeddings.some((keptEmbedding) => (0, embeddings_1.cosineSimilarity)(currentEmbedding, keptEmbedding) >= semanticThreshold);
            if (!isDuplicate) {
                semanticUnique.push(textDedupedChunks[i]);
                keptEmbeddings.push(currentEmbedding);
            }
        }
        return semanticUnique;
    }
    catch (error) {
        // Fallback to text-based deduplication if embeddings fail
        console.warn('Semantic deduplication failed, falling back to text-based:', error);
        return textDedupedChunks;
    }
}
//# sourceMappingURL=dedupe.js.map