UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications…

307 lines (306 loc) 11.9 kB
/**
 * Semantic Chunker
 *
 * LLM-powered semantic chunking that groups related content together.
 * Uses embedding similarity to determine natural breakpoints.
 * Best for complex documents where meaning should drive segmentation.
 */
import { randomUUID } from "crypto";
import { ProviderFactory } from "../../factories/providerFactory.js";
import { logger } from "../../utils/logger.js";

/** Separator used both to join paragraphs into segments and segments into chunks. */
const PARAGRAPH_SEPARATOR = "\n\n";

/**
 * Size of the placeholder vector used for a failed segment when no successful
 * embedding is available to copy the dimension from.
 * NOTE(review): presumably the output size of the default embedding model — confirm.
 */
const FALLBACK_EMBEDDING_DIMENSION = 1536;

/** How far back from a hard cut `fallbackChunk` looks for a paragraph break. */
const PARAGRAPH_SEARCH_WINDOW = 200;

/**
 * Locate the span of the source text covered by a group of segments.
 *
 * Segments are trimmed paragraphs joined with PARAGRAPH_SEPARATOR, so each
 * paragraph is a verbatim substring of the source text and can be found by
 * scanning forward from the end of the previous one.
 *
 * @param {string} text - The full source text.
 * @param {string[]} group - Segments making up one chunk, in document order.
 * @param {number} cursor - Offset in `text` at which to start searching.
 * @returns {{ start: number, end: number } | null} Offsets of the first character
 *   and one past the last character of the group, or null if any paragraph
 *   cannot be found (e.g. the segments were not derived from `text`).
 */
function locateGroupSpan(text, group, cursor) {
    let start = -1;
    let end = cursor;
    for (const segment of group) {
        for (const paragraph of segment.split(PARAGRAPH_SEPARATOR)) {
            const found = text.indexOf(paragraph, end);
            if (found === -1) {
                return null;
            }
            if (start === -1) {
                start = found;
            }
            end = found + paragraph.length;
        }
    }
    return start === -1 ? null : { start, end };
}

/**
 * Semantic chunker implementation
 * Uses embedding similarity to find natural content boundaries
 */
export class SemanticChunker {
    strategy = "semantic";

    /**
     * Split `text` into chunks whose boundaries follow drops in embedding similarity.
     *
     * The text is first cut into paragraph-based segments; each segment is embedded,
     * adjacent segments whose cosine similarity is below `similarityThreshold` are
     * separated, and the resulting groups are additionally capped at `maxSize`
     * characters. If anything in the embedding path throws, the method logs a
     * warning and returns the result of `fallbackChunk` instead.
     *
     * @param {string} text - Text to chunk.
     * @param {object} [config] - Optional settings.
     * @param {number} [config.maxSize=1000] - Maximum characters per group of segments.
     * @param {number} [config.overlap=0] - Characters of the previous chunk to prepend to each chunk.
     * @param {number} [config.joinThreshold=100] - Paragraphs are merged while the combined length stays below this.
     * @param {string} [config.modelName="text-embedding-3-small"] - Embedding model name passed to the provider factory.
     * @param {string} [config.provider="openai"] - Provider name passed to the provider factory.
     * @param {number} [config.similarityThreshold=0.7] - Similarity below this marks a breakpoint.
     * @param {boolean} [config.trimWhitespace=true] - Trim each chunk's text.
     * @param {object} [config.metadata={}] - Caller metadata copied into `metadata.custom`.
     * @returns {Promise<object[]>} Chunks of the form `{ id, text, metadata }`; empty when
     *   the input is empty or contains nothing but whitespace that gets trimmed away.
     */
    async chunk(text, config) {
        const {
            maxSize = 1000,
            overlap = 0,
            joinThreshold = 100,
            modelName = "text-embedding-3-small",
            provider = "openai",
            similarityThreshold = 0.7,
            trimWhitespace = true,
            metadata = {},
        } = config || {};
        const documentId = randomUUID();
        if (!text || text.length === 0) {
            return [];
        }
        // First, split into initial segments (paragraphs or sentences)
        const segments = this.splitIntoSegments(text, joinThreshold);
        if (segments.length <= 1) {
            // Single segment, no need for semantic analysis
            const singleText = trimWhitespace ? text.trim() : text;
            if (singleText.length === 0) {
                // Whitespace-only input: every other path drops empty chunks, so do the same here.
                return [];
            }
            return [
                {
                    id: randomUUID(),
                    text: singleText,
                    metadata: {
                        documentId,
                        chunkIndex: 0,
                        totalChunks: 1,
                        startPosition: 0,
                        endPosition: text.length,
                        documentType: "text",
                        custom: metadata,
                    },
                },
            ];
        }
        const chunks = [];
        try {
            // Get embeddings for each segment
            const embeddings = await this.getEmbeddings(segments, provider, modelName);
            // Find semantic breakpoints
            const breakpoints = this.findSemanticBreakpoints(embeddings, similarityThreshold);
            // Group segments by semantic similarity
            const groups = this.groupSegments(segments, breakpoints, maxSize);
            // Create chunks from groups, tracking where each group sits in the source text
            let cursor = 0;
            for (const group of groups) {
                const chunkText = group.join(PARAGRAPH_SEPARATOR);
                const finalText = trimWhitespace ? chunkText.trim() : chunkText;
                // Prefer real source offsets; if the group cannot be located, fall back to
                // a running estimate based on the joined text length.
                const span = locateGroupSpan(text, group, cursor) ?? {
                    start: cursor,
                    end: cursor + chunkText.length,
                };
                cursor = span.end;
                if (finalText.length === 0) {
                    continue;
                }
                chunks.push({
                    id: randomUUID(),
                    text: finalText,
                    metadata: {
                        documentId,
                        chunkIndex: chunks.length,
                        startPosition: span.start,
                        endPosition: span.end,
                        documentType: "text",
                        custom: {
                            ...metadata,
                            segmentCount: group.length,
                        },
                    },
                });
            }
            // Handle overlap if configured
            if (overlap > 0) {
                // Snapshot the texts first so each overlap comes from the previous chunk's
                // own content, not from text that already had an overlap prepended.
                const baseTexts = chunks.map((chunk) => chunk.text);
                for (let i = 1; i < chunks.length; i++) {
                    const overlapText = baseTexts[i - 1].slice(-overlap);
                    chunks[i].text = overlapText + "\n" + baseTexts[i];
                }
            }
        } catch (error) {
            // Fallback to simple chunking if embeddings fail
            logger.warn("[SemanticChunker] Embedding failed, falling back to simple chunking", {
                error: error instanceof Error ? error.message : String(error),
            });
            return this.fallbackChunk(text, maxSize, overlap, documentId, metadata, trimWhitespace);
        }
        // Update total chunks count
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }

    /**
     * Split text into initial segments for embedding.
     *
     * Paragraphs (runs separated by two or more newlines) are trimmed, empty ones are
     * skipped, and consecutive paragraphs are merged while the combined length
     * (excluding the separator) stays below `minSize`.
     *
     * @param {string} text - Source text.
     * @param {number} minSize - Merge threshold in characters.
     * @returns {string[]} Segments in document order.
     */
    splitIntoSegments(text, minSize) {
        const segments = [];
        // Split by double newlines (paragraphs)
        const paragraphs = text.split(/\n\n+/);
        let currentSegment = "";
        for (const paragraph of paragraphs) {
            const trimmed = paragraph.trim();
            if (trimmed.length === 0) {
                continue;
            }
            if (currentSegment.length === 0) {
                currentSegment = trimmed;
            } else if (currentSegment.length + trimmed.length < minSize) {
                // Join small paragraphs
                currentSegment += PARAGRAPH_SEPARATOR + trimmed;
            } else {
                // Save current and start new
                segments.push(currentSegment);
                currentSegment = trimmed;
            }
        }
        // Don't forget the last segment
        if (currentSegment.length > 0) {
            segments.push(currentSegment);
        }
        return segments;
    }

    /**
     * Get one embedding per segment, in order.
     *
     * Segments are embedded one at a time. A segment whose embed call throws is
     * logged and replaced by an all-zero vector, which `cosineSimilarity` scores
     * as 0 against any neighbour.
     *
     * @param {string[]} segments - Segments to embed.
     * @param {string} provider - Provider name passed to `ProviderFactory.createProvider`.
     * @param {string} modelName - Model name passed to `ProviderFactory.createProvider`.
     * @returns {Promise<Array>} Embeddings aligned with `segments`.
     * @throws {Error} If the provider has no `embed` method, or if embedding failed
     *   for every segment (so that `chunk` can fall back to simple chunking instead
     *   of treating every boundary as a semantic break).
     */
    async getEmbeddings(segments, provider, modelName) {
        const embeddingProvider = await ProviderFactory.createProvider(provider, modelName);
        // Check if provider has embed method
        if (typeof embeddingProvider.embed !== "function") {
            throw new Error(`Provider ${provider} does not support embeddings`);
        }
        const embeddings = [];
        const failedIndexes = new Set();
        // Requests are issued sequentially to stay gentle on provider rate limits.
        for (const [index, segment] of segments.entries()) {
            try {
                embeddings.push(await embeddingProvider.embed(segment));
            } catch (error) {
                logger.warn("[SemanticChunker] Failed to embed segment", {
                    error: error instanceof Error ? error.message : String(error),
                });
                // Placeholder; replaced by a zero vector once the dimension is known.
                embeddings.push(null);
                failedIndexes.add(index);
            }
        }
        if (failedIndexes.size === 0) {
            return embeddings;
        }
        if (failedIndexes.size === segments.length) {
            throw new Error(`Provider ${provider} failed to embed all ${segments.length} segments`);
        }
        // Use zero vectors as fallback, sized like a successful embedding when possible.
        const reference = embeddings.find((_, index) => !failedIndexes.has(index));
        const referenceLength = reference?.length;
        const dimension = Number.isInteger(referenceLength) && referenceLength > 0
            ? referenceLength
            : FALLBACK_EMBEDDING_DIMENSION;
        for (const index of failedIndexes) {
            embeddings[index] = new Array(dimension).fill(0);
        }
        return embeddings;
    }

    /**
     * Find semantic breakpoints using cosine similarity.
     *
     * @param {Array} embeddings - One vector per segment.
     * @param {number} threshold - Adjacent similarity below this is a breakpoint.
     * @returns {number[]} Ascending indexes `i` such that a break falls between
     *   segment `i - 1` and segment `i`.
     */
    findSemanticBreakpoints(embeddings, threshold) {
        const breakpoints = [];
        for (let i = 1; i < embeddings.length; i++) {
            const similarity = this.cosineSimilarity(embeddings[i - 1], embeddings[i]);
            // If similarity is below threshold, it's a breakpoint
            if (similarity < threshold) {
                breakpoints.push(i);
            }
        }
        return breakpoints;
    }

    /**
     * Group segments based on breakpoints and size limits.
     *
     * A new group starts at every breakpoint index and whenever adding the next
     * segment would push the group's summed segment length past `maxSize`. A single
     * segment longer than `maxSize` still forms its own group.
     *
     * @param {string[]} segments - Segments in document order.
     * @param {number[]} breakpoints - Segment indexes that start a new group (any order).
     * @param {number} maxSize - Maximum summed segment length per group.
     * @returns {string[][]} Non-empty groups of segments.
     */
    groupSegments(segments, breakpoints, maxSize) {
        const breakpointSet = new Set(breakpoints);
        const groups = [];
        let currentGroup = [];
        let currentSize = 0;
        segments.forEach((segment, index) => {
            const exceedsSize = currentSize + segment.length > maxSize;
            if (currentGroup.length > 0 && (exceedsSize || breakpointSet.has(index))) {
                // Save current group
                groups.push(currentGroup);
                currentGroup = [];
                currentSize = 0;
            }
            currentGroup.push(segment);
            currentSize += segment.length;
        });
        // Don't forget the last group
        if (currentGroup.length > 0) {
            groups.push(currentGroup);
        }
        return groups;
    }

    /**
     * Calculate cosine similarity between two vectors.
     *
     * @param {ArrayLike<number>} a - First vector.
     * @param {ArrayLike<number>} b - Second vector.
     * @returns {number} The cosine similarity, or 0 when the lengths differ or
     *   either vector has zero magnitude.
     */
    cosineSimilarity(a, b) {
        if (a.length !== b.length) {
            return 0;
        }
        let dotProduct = 0;
        let normA = 0;
        let normB = 0;
        for (let i = 0; i < a.length; i++) {
            dotProduct += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        const denominator = Math.sqrt(normA) * Math.sqrt(normB);
        return denominator === 0 ? 0 : dotProduct / denominator;
    }

    /**
     * Fallback to simple chunking when embeddings fail.
     *
     * Cuts the text into windows of at most `maxSize` characters, preferring to end
     * a window at a paragraph break found within the last PARAGRAPH_SEARCH_WINDOW
     * characters, and starts each following window `overlap` characters before the
     * previous window's end.
     *
     * @param {string} text - Source text.
     * @param {number} maxSize - Maximum window size (values below 1 are treated as 1).
     * @param {number} overlap - Requested overlap (clamped to `[0, maxSize - 1]`).
     * @param {string} documentId - Identifier stored in each chunk's metadata.
     * @param {object} metadata - Caller metadata copied into `metadata.custom`.
     * @param {boolean} trimWhitespace - Trim each chunk's text.
     * @returns {object[]} Chunks flagged with `custom.fallbackChunking: true`.
     */
    fallbackChunk(text, maxSize, overlap, documentId, metadata, trimWhitespace) {
        const effectiveMaxSize = Math.max(maxSize, 1);
        const effectiveOverlap = Math.min(Math.max(overlap, 0), effectiveMaxSize - 1);
        const chunks = [];
        let start = 0;
        while (start < text.length) {
            let end = Math.min(start + effectiveMaxSize, text.length);
            // Try to break at paragraph boundary
            if (end < text.length) {
                const searchStart = Math.max(start, end - PARAGRAPH_SEARCH_WINDOW);
                const paragraphBreak = text.slice(searchStart, end).lastIndexOf("\n\n");
                const candidateEnd = searchStart + paragraphBreak;
                // Only accept a break that still lets the next window start beyond this
                // one; otherwise the overlap would make the scan crawl one character at
                // a time and emit near-duplicate chunks.
                if (paragraphBreak > 0 && candidateEnd - effectiveOverlap > start) {
                    end = candidateEnd;
                }
            }
            const chunkText = text.slice(start, end);
            const finalText = trimWhitespace ? chunkText.trim() : chunkText;
            if (finalText.length > 0) {
                chunks.push({
                    id: randomUUID(),
                    text: finalText,
                    metadata: {
                        documentId,
                        chunkIndex: chunks.length,
                        startPosition: start,
                        endPosition: end,
                        documentType: "text",
                        custom: {
                            ...metadata,
                            fallbackChunking: true,
                        },
                    },
                });
            }
            if (end >= text.length) {
                // The whole text is covered; stepping back by the overlap again would
                // only produce redundant suffixes of the final chunk.
                break;
            }
            start = Math.max(start + 1, end - effectiveOverlap);
        }
        // Keep metadata consistent with the semantic path.
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }

    /**
     * Validate a chunker configuration.
     *
     * @param {object} [config] - Configuration to check; a missing config is treated as empty.
     * @returns {{ valid: boolean, errors: string[], warnings: string[] }} `valid` is true
     *   when no errors were found. The warnings list always contains the
     *   embedding-provider reminder.
     */
    validateConfig(config) {
        const errors = [];
        const warnings = [];
        const semConfig = config ?? {};
        if (semConfig.maxSize !== undefined && semConfig.maxSize <= 0) {
            errors.push("maxSize must be greater than 0");
        }
        if (semConfig.overlap !== undefined && semConfig.overlap < 0) {
            errors.push("overlap must be non-negative");
        }
        if (semConfig.overlap !== undefined &&
            semConfig.maxSize !== undefined &&
            semConfig.overlap >= semConfig.maxSize) {
            errors.push("overlap must be less than maxSize");
        }
        if (semConfig.similarityThreshold !== undefined) {
            if (semConfig.similarityThreshold < 0 || semConfig.similarityThreshold > 1) {
                errors.push("similarityThreshold must be between 0 and 1");
            }
        }
        if (semConfig.joinThreshold !== undefined && semConfig.joinThreshold < 0) {
            errors.push("joinThreshold must be non-negative");
        }
        warnings.push("Semantic chunking requires an embedding provider. Ensure API credentials are configured.");
        return {
            valid: errors.length === 0,
            errors,
            warnings,
        };
    }
}