@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and a professional CLI. 58+ external MCP servers are discoverable, with multimodal file processing and RAG pipelines. Build, test, and deploy AI applications.
248 lines (247 loc) • 8.88 kB
JavaScript
/**
* HTML-aware Chunker
*
* Splits HTML documents based on tag structure while preserving semantics.
* Best for web pages, email templates, and structured HTML content.
*/
import { randomUUID } from "crypto";
/**
 * HTML-aware chunker implementation.
 *
 * Splits an HTML string into sections using a regex over structural tags and
 * then cuts each section into size-bounded chunks. This is a lightweight,
 * regex-based approach, not a real HTML parser: nested elements that share
 * the same tag name are closed at the first matching end tag, and attribute
 * values containing ">" are not supported.
 */
export class HTMLChunker {
  strategy = "html";
  defaultSplitTags = [
    "div",
    "p",
    "section",
    "article",
    "main",
    "aside",
    "header",
    "footer",
    "nav",
    "li",
    "tr",
    "td",
    "th",
  ];
  defaultPreserveTags = [
    "pre",
    "code",
    "table",
    "ul",
    "ol",
    "blockquote",
  ];

  /**
   * Chunk an HTML document.
   *
   * @param {string} text - HTML source to chunk.
   * @param {object} [config] - Optional settings.
   * @param {number} [config.maxSize=1000] - Maximum chunk length in characters.
   * @param {number} [config.overlap=0] - Characters shared between consecutive chunks of one section.
   * @param {string[]} [config.splitTags] - Tag names that delimit sections.
   * @param {string[]} [config.preserveTags] - Tag names that keep their enclosing section's markup intact.
   * @param {boolean} [config.extractTextOnly=false] - Strip markup from each section before chunking.
   * @param {boolean} [config.includeTagMetadata=true] - Record tag name/attributes in `metadata.custom`.
   * @param {boolean} [config.trimWhitespace=true] - Trim each chunk; chunks that end up empty are dropped.
   * @param {object} [config.metadata={}] - Extra fields copied into every chunk's `metadata.custom`.
   * @returns {Promise<object[]>} Chunks shaped as `{ id, text, metadata }`; empty for empty input.
   */
  async chunk(text, config) {
    const {
      maxSize = 1000,
      overlap = 0,
      splitTags = this.defaultSplitTags,
      preserveTags = this.defaultPreserveTags,
      extractTextOnly = false,
      includeTagMetadata = true,
      trimWhitespace = true,
      metadata = {},
    } = config || {};
    const documentId = randomUUID();
    const chunks = [];
    if (!text || text.length === 0) {
      return chunks;
    }

    const sections = this.splitByTags(text, splitTags, preserveTags);
    let chunkIndex = 0;
    // Running total of emitted piece lengths. This is NOT an offset into the
    // original HTML: stripped tags are not counted and overlapping text is
    // counted once per piece.
    let currentPosition = 0;

    for (const { content, tagName, attributes } of sections) {
      const processedContent = extractTextOnly ? this.extractText(content) : content;
      const pieces = this.splitContent(processedContent, maxSize, overlap);

      for (const piece of pieces) {
        const finalText = trimWhitespace ? piece.trim() : piece;
        if (finalText.length > 0) {
          const custom = { ...metadata };
          if (includeTagMetadata && tagName) {
            custom.tagName = tagName;
            if (attributes && Object.keys(attributes).length > 0) {
              custom.attributes = attributes;
            }
          }
          chunks.push({
            id: randomUUID(),
            text: finalText,
            metadata: {
              documentId,
              chunkIndex,
              startPosition: currentPosition,
              endPosition: currentPosition + piece.length,
              documentType: "html",
              custom,
            },
          });
          chunkIndex++;
        }
        // Advance even for dropped (whitespace-only) pieces.
        currentPosition += piece.length;
      }
    }

    // The total is only known once every section has been processed.
    for (const chunk of chunks) {
      chunk.metadata.totalChunks = chunks.length;
    }
    return chunks;
  }

  /**
   * Split HTML into sections delimited by structural tags.
   *
   * Text found between matched elements becomes a section without `tagName`.
   * When an element's inner HTML contains one of `preserveTags`, the section
   * keeps the element's full markup (including its own tags); otherwise only
   * the inner HTML is kept.
   *
   * @param {string} html - HTML source.
   * @param {string[]} splitTags - Tag names to split on; an empty or missing list falls back to the defaults.
   * @param {string[]} preserveTags - Tag names whose presence keeps the enclosing markup intact.
   * @returns {{content: string, tagName?: string, attributes?: object}[]} Sections in document order.
   */
  splitByTags(html, splitTags, preserveTags) {
    const escapeForRegex = (value) => String(value).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const usableNames = (list) =>
      Array.isArray(list)
        ? list.filter((name) => typeof name === "string" && name.length > 0)
        : [];

    // validateConfig() promises that an empty splitTags list means "use the
    // defaults"; honour that here instead of building a pattern with an
    // empty alternation that matches nothing useful.
    let tagNames = usableNames(splitTags);
    if (tagNames.length === 0) {
      tagNames = this.defaultSplitTags;
    }
    const preserveNames = usableNames(preserveTags);

    // `(?![\w:-])` stops a tag name from matching as a mere prefix
    // (e.g. "p" inside "<pre>", "li" inside "<link>", "th" inside "<thead>").
    const nameBoundary = "(?![\\w:-])";
    const tagPattern = new RegExp(
      `<(${tagNames.map(escapeForRegex).join("|")})${nameBoundary}([^>]*)>([\\s\\S]*?)</\\1\\s*>`,
      "gi",
    );
    // Built once, outside the match loop; null means "never preserve".
    const preservePattern =
      preserveNames.length > 0
        ? new RegExp(`<(?:${preserveNames.map(escapeForRegex).join("|")})${nameBoundary}`, "i")
        : null;

    const sections = [];
    let lastIndex = 0;
    for (const match of html.matchAll(tagPattern)) {
      // Loose content sitting between the previous element and this one.
      const before = html.slice(lastIndex, match.index).trim();
      if (before.length > 0) {
        sections.push({ content: before });
      }

      const [fullMatch, rawName, attributeString, innerContent] = match;
      const keepMarkup = preservePattern !== null && preservePattern.test(innerContent);
      sections.push({
        content: keepMarkup ? fullMatch : innerContent,
        tagName: rawName.toLowerCase(),
        attributes: this.parseAttributes(attributeString),
      });
      lastIndex = match.index + fullMatch.length;
    }

    // Trailing content after the last element; when nothing matched at all
    // this is the whole (trimmed) document.
    const remaining = html.slice(lastIndex).trim();
    if (remaining.length > 0) {
      sections.push({ content: remaining });
    }
    return sections;
  }

  /**
   * Parse the attribute portion of a start tag into a name -> value map.
   *
   * Handles double-quoted, single-quoted and unquoted values as well as
   * names containing hyphens or colons (`data-id`, `xlink:href`). Attributes
   * without a value map to the empty string. Names keep their original case.
   *
   * @param {string} attributeString - Text between the tag name and the closing ">".
   * @returns {Object<string, string>} Parsed attributes.
   */
  parseAttributes(attributeString) {
    const attributes = {};
    if (!attributeString) {
      return attributes;
    }
    // name [ = "double" | 'single' | unquoted ]
    const attrPattern = /([^\s"'<>\/=]+)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+)))?/g;
    for (const [, name, doubleQuoted, singleQuoted, unquoted] of attributeString.matchAll(attrPattern)) {
      attributes[name] = doubleQuoted ?? singleQuoted ?? unquoted ?? "";
    }
    return attributes;
  }

  /**
   * Reduce an HTML fragment to plain text.
   *
   * Removes script/style elements and comments, turns line breaks and the
   * end of common block elements into separators, strips remaining tags,
   * decodes a small set of common entities and collapses whitespace.
   *
   * @param {string} html - HTML fragment.
   * @returns {string} Plain text on a single line, trimmed.
   */
  extractText(html) {
    const entityText = {
      nbsp: " ",
      amp: "&",
      lt: "<",
      gt: ">",
      quot: '"',
      apos: "'",
    };
    // Decoding happens in ONE pass so that an escaped entity such as
    // "&" + "amp;lt;" yields the literal text "&" + "lt;" instead of "<".
    const decodeEntity = (whole, name) => {
      const key = name.toLowerCase();
      return key.startsWith("#") ? "'" : entityText[key];
    };

    return html
      // Remove script and style elements
      .replace(/<script[\s\S]*?<\/script>/gi, "")
      .replace(/<style[\s\S]*?<\/style>/gi, "")
      // Remove HTML comments
      .replace(/<!--[\s\S]*?-->/g, "")
      // Line breaks and block-level end tags separate words
      .replace(/<br\s*\/?>/gi, "\n")
      .replace(/<\/(p|div|br|h[1-6]|li|tr)>/gi, "\n")
      // Remove remaining tags
      .replace(/<[^>]+>/g, "")
      // Decode common HTML entities (after tag removal, so a decoded "<"
      // is never mistaken for markup)
      .replace(/&(nbsp|amp|lt|gt|quot|apos|#0*39);/gi, decodeEntity)
      // Normalize whitespace
      .replace(/\s+/g, " ")
      .trim();
  }

  /**
   * Cut content into pieces of at most `maxSize` characters.
   *
   * Where possible a piece ends right after the last sentence or line break
   * (`.`, `!`, `?` or newline followed by whitespace) found in the final 100
   * characters of the candidate piece. Consecutive pieces share up to
   * `overlap` characters.
   *
   * @param {string} content - Text to split.
   * @param {number} maxSize - Maximum piece length; values below 1 are treated as 1.
   * @param {number} overlap - Requested overlap; clamped to `[0, maxSize - 1]`.
   * @returns {string[]} Pieces in order; `[content]` when it already fits.
   */
  splitContent(content, maxSize, overlap) {
    const effectiveMaxSize = Math.max(maxSize, 1);
    const effectiveOverlap = Math.min(Math.max(overlap, 0), effectiveMaxSize - 1);
    if (content.length <= effectiveMaxSize) {
      return [content];
    }

    const pieces = [];
    let start = 0;
    while (start < content.length) {
      let end = Math.min(start + effectiveMaxSize, content.length);

      if (end < content.length) {
        // Only accept breaks that leave more than `effectiveOverlap` new
        // characters in this piece, so the next start always moves forward
        // by a meaningful amount instead of creeping one character at a time.
        const windowStart = Math.max(start + effectiveOverlap + 1, end - 100);
        let lastBreak = -1;
        for (const found of content.slice(windowStart, end).matchAll(/[.!?\n]\s+/g)) {
          lastBreak = found.index;
        }
        if (lastBreak >= 0) {
          // Keep the punctuation with this piece; whitespace starts the next.
          end = windowStart + lastBreak + 1;
        }
      }

      pieces.push(content.slice(start, end));
      if (end >= content.length) {
        // Everything is covered. Stepping back by the overlap here would
        // only emit redundant, ever-shorter copies of the tail.
        break;
      }
      start = Math.max(start + 1, end - effectiveOverlap);
    }
    return pieces;
  }

  /**
   * Validate a chunker configuration without applying it.
   *
   * @param {object} [config] - Configuration to check; a missing config is treated as empty.
   * @returns {{valid: boolean, errors: string[], warnings: string[]}} `valid` is true when there are no errors.
   */
  validateConfig(config) {
    const errors = [];
    const warnings = [];
    const { maxSize, overlap, splitTags } = config ?? {};

    if (maxSize !== undefined && maxSize <= 0) {
      errors.push("maxSize must be greater than 0");
    }
    if (overlap !== undefined && overlap < 0) {
      errors.push("overlap must be non-negative");
    }
    if (overlap !== undefined && maxSize !== undefined && overlap >= maxSize) {
      errors.push("overlap must be less than maxSize");
    }
    if (splitTags !== undefined && splitTags.length === 0) {
      warnings.push("No split tags specified, using defaults");
    }

    return {
      valid: errors.length === 0,
      errors,
      warnings,
    };
  }
}