UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio

248 lines (247 loc) 8.88 kB
/** * HTML-aware Chunker * * Splits HTML documents based on tag structure while preserving semantics. * Best for web pages, email templates, and structured HTML content. */ import { randomUUID } from "crypto"; /** * HTML-aware chunker implementation * Splits based on HTML structure (tags, elements) */ export class HTMLChunker { strategy = "html"; defaultSplitTags = [ "div", "p", "section", "article", "main", "aside", "header", "footer", "nav", "li", "tr", "td", "th", ]; defaultPreserveTags = [ "pre", "code", "table", "ul", "ol", "blockquote", ]; async chunk(text, config) { const { maxSize = 1000, overlap = 0, splitTags = this.defaultSplitTags, preserveTags = this.defaultPreserveTags, extractTextOnly = false, includeTagMetadata = true, trimWhitespace = true, metadata = {}, } = config || {}; const documentId = randomUUID(); const chunks = []; if (!text || text.length === 0) { return chunks; } // Extract and split by structural tags const sections = this.splitByTags(text, splitTags, preserveTags); let chunkIndex = 0; let currentPosition = 0; for (const section of sections) { const { content, tagName, attributes } = section; // Process content let processedContent = content; if (extractTextOnly) { processedContent = this.extractText(content); } // Split if content is too large const contentChunks = this.splitContent(processedContent, maxSize, overlap); for (const contentChunk of contentChunks) { const finalText = trimWhitespace ? contentChunk.trim() : contentChunk; if (finalText.length > 0) { const chunkMetadata = { ...metadata, }; if (includeTagMetadata && tagName) { chunkMetadata.tagName = tagName; if (attributes && Object.keys(attributes).length > 0) { chunkMetadata.attributes = attributes; } } chunks.push({ id: randomUUID(), text: finalText, metadata: { documentId, chunkIndex, startPosition: currentPosition, endPosition: currentPosition + contentChunk.length, documentType: "html", custom: chunkMetadata, }, }); chunkIndex++; } currentPosition += contentChunk.length; } } // Update total chunks count chunks.forEach((chunk) => { chunk.metadata.totalChunks = chunks.length; }); return chunks; } /** * Split HTML by structural tags */ splitByTags(html, splitTags, preserveTags) { const sections = []; // Create regex pattern for split tags const tagPattern = new RegExp(`<(${splitTags.join("|")})([^>]*)>([\\s\\S]*?)</\\1>`, "gi"); let lastIndex = 0; let match; // Reset regex tagPattern.lastIndex = 0; while ((match = tagPattern.exec(html)) !== null) { // Content before this tag if (match.index > lastIndex) { const beforeContent = html.slice(lastIndex, match.index).trim(); if (beforeContent.length > 0) { sections.push({ content: beforeContent, }); } } const tagName = match[1].toLowerCase(); const attributeString = match[2]; const innerContent = match[3]; // Parse attributes const attributes = this.parseAttributes(attributeString); // Check if this tag should be preserved as a unit const shouldPreserve = preserveTags.some((pt) => innerContent.toLowerCase().includes(`<${pt}`)); if (shouldPreserve) { // Keep the full tag content sections.push({ content: match[0], tagName, attributes, }); } else { // Just the inner content sections.push({ content: innerContent, tagName, attributes, }); } lastIndex = match.index + match[0].length; } // Don't forget content after the last tag if (lastIndex < html.length) { const remaining = html.slice(lastIndex).trim(); if (remaining.length > 0) { sections.push({ content: remaining, }); } } // If no tags found, return entire text as one section if (sections.length === 0 && html.trim()) { sections.push({ content: html.trim(), }); } return sections; } /** * Parse HTML attributes from string */ parseAttributes(attributeString) { const attributes = {}; const attrPattern = /(\w+)(?:=["']([^"']*?)["'])?/g; let match; while ((match = attrPattern.exec(attributeString)) !== null) { const name = match[1]; const value = match[2] || ""; attributes[name] = value; } return attributes; } /** * Extract plain text from HTML */ extractText(html) { return (html // Remove script and style elements .replace(/<script[\s\S]*?<\/script>/gi, "") .replace(/<style[\s\S]*?<\/style>/gi, "") // Remove HTML comments .replace(/<!--[\s\S]*?-->/g, "") // Replace block elements with newlines .replace(/<\/(p|div|br|h[1-6]|li|tr)>/gi, "\n") // Remove remaining tags .replace(/<[^>]+>/g, "") // Decode common HTML entities .replace(/&nbsp;/gi, " ") .replace(/&amp;/gi, "&") .replace(/&lt;/gi, "<") .replace(/&gt;/gi, ">") .replace(/&quot;/gi, '"') .replace(/&#039;/gi, "'") // Normalize whitespace .replace(/\s+/g, " ") .trim()); } /** * Split content that exceeds max size */ splitContent(content, maxSize, overlap) { const effectiveMaxSize = Math.max(maxSize, 1); const effectiveOverlap = Math.min(Math.max(overlap, 0), effectiveMaxSize - 1); if (content.length <= effectiveMaxSize) { return [content]; } const chunks = []; let start = 0; while (start < content.length) { let end = Math.min(start + effectiveMaxSize, content.length); // Try to break at a natural boundary if (end < content.length) { const searchStart = Math.max(start, end - 100); const searchText = content.slice(searchStart, end); // Look for paragraph/sentence break const breakMatch = searchText.match(/[.!?\n]\s+/); if (breakMatch && breakMatch.index !== undefined) { end = searchStart + breakMatch.index + 1; } } chunks.push(content.slice(start, end)); start = Math.max(start + 1, end - effectiveOverlap); } return chunks; } validateConfig(config) { const errors = []; const warnings = []; const htmlConfig = config; if (htmlConfig.maxSize !== undefined && htmlConfig.maxSize <= 0) { errors.push("maxSize must be greater than 0"); } if (htmlConfig.overlap !== undefined && htmlConfig.overlap < 0) { errors.push("overlap must be non-negative"); } if (htmlConfig.overlap !== undefined && htmlConfig.maxSize !== undefined && htmlConfig.overlap >= htmlConfig.maxSize) { errors.push("overlap must be less than maxSize"); } if (htmlConfig.splitTags !== undefined && htmlConfig.splitTags.length === 0) { warnings.push("No split tags specified, using defaults"); } return { valid: errors.length === 0, errors, warnings, }; } }