UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.

383 lines (382 loc) 15.2 kB
/**
 * Markdown-aware Chunker
 *
 * Splits markdown documents based on header structure while preserving formatting.
 * Best for documentation, README files, and structured markdown content.
 */
import { randomUUID } from "crypto";

/**
 * Markdown-aware chunker implementation
 * Splits based on markdown structure (headers, code blocks, etc.)
 */
export class MarkdownChunker {
  strategy = "markdown";

  /**
   * Split a markdown document into chunks, one or more per header section.
   *
   * @param {string} text - Markdown source. Empty or falsy input yields `[]`.
   * @param {object} [config]
   * @param {number} [config.maxSize=1000] - Target maximum chunk length. The header
   *   length is subtracted from the content budget, with a floor of 100 characters.
   * @param {number} [config.overlap=0] - Characters shared between consecutive chunks
   *   of an oversized section.
   * @param {number[]} [config.headerLevels=[1, 2, 3]] - Header levels (1-6) that start
   *   a new section. Only the listed levels split; the list need not be contiguous.
   * @param {boolean} [config.preserveCodeBlocks=true] - Swap fenced and inline code for
   *   short placeholders while splitting, then restore them. A restored chunk can
   *   therefore be longer than `maxSize`.
   * @param {boolean} [config.includeHeader=true] - Prefix every chunk with its section header.
   * @param {boolean} [config.stripFormatting=false] - Run `stripMarkdown` on each chunk.
   * @param {boolean} [config.trimWhitespace=true] - Trim each chunk's text.
   * @param {object} [config.metadata={}] - Stored under `metadata.custom` on every chunk.
   * @returns {Promise<Array<{id: string, text: string, metadata: object}>>}
   */
  async chunk(text, config) {
    const {
      maxSize = 1000,
      overlap = 0,
      headerLevels = [1, 2, 3],
      preserveCodeBlocks = true,
      includeHeader = true,
      stripFormatting = false,
      trimWhitespace = true,
      metadata = {},
    } = config || {};
    const documentId = randomUUID();
    const chunks = [];
    if (!text || text.length === 0) {
      return chunks;
    }

    const headerPattern = this.buildHeaderPattern(headerLevels);
    // Split by headers while preserving them
    const sections = this.splitByHeaders(text, headerPattern, includeHeader);

    let chunkIndex = 0;
    // Running offset built from emitted chunk lengths (before trimming); it is not
    // an exact offset into the original `text`.
    let currentPosition = 0;

    for (const section of sections) {
      const { header, content, level } = section;

      // Hide code behind placeholders so the splitter cannot cut through it.
      let processedContent = content;
      const codeBlocks = [];
      if (preserveCodeBlocks) {
        processedContent = content.replace(/```[\s\S]*?```|`[^`]+`/g, (match) => {
          const placeholder = `__CODE_BLOCK_${codeBlocks.length}__`;
          codeBlocks.push({ placeholder, code: match });
          return placeholder;
        });
      }

      // Split content if too large
      const effectiveMaxSize = Math.max(maxSize - (header?.length || 0), 100);
      const contentChunks = this.splitContent(processedContent, effectiveMaxSize, overlap);

      for (const contentChunk of contentChunks) {
        let chunkText = header && includeHeader ? `${header}\n\n${contentChunk}` : contentChunk;

        // Restore code blocks. A replacer function is required: with a plain string,
        // sequences such as `$&` or `$$` inside the code would be interpreted as
        // replacement patterns and corrupt the restored code.
        for (const { placeholder, code } of codeBlocks) {
          chunkText = chunkText.replaceAll(placeholder, () => code);
        }

        // Strip formatting if requested
        if (stripFormatting) {
          chunkText = this.stripMarkdown(chunkText);
        }

        const finalText = trimWhitespace ? chunkText.trim() : chunkText;
        if (finalText.length > 0) {
          chunks.push({
            id: randomUUID(),
            text: finalText,
            metadata: {
              documentId,
              chunkIndex,
              startPosition: currentPosition,
              endPosition: currentPosition + chunkText.length,
              documentType: "markdown",
              headerLevel: level ?? undefined,
              header: header?.replace(/^#+\s*/, "") ?? undefined,
              custom: metadata,
            },
          });
          chunkIndex++;
        }
        currentPosition += chunkText.length;
      }
    }

    // Update total chunks count
    for (const chunk of chunks) {
      chunk.metadata.totalChunks = chunks.length;
    }
    return chunks;
  }

  /**
   * Build the global, multiline regex that recognises section headers.
   *
   * Each requested level becomes its own alternative, so `[1, 3]` matches `#` and
   * `###` but not `##`. The hashes must be followed by spaces or tabs on the same
   * line, so a bare `#` line cannot absorb the following line as its title.
   *
   * @param {number[]} headerLevels - Levels to match; values outside the integers 1-6 are ignored.
   * @returns {RegExp} Regex whose group 1 is the run of `#` characters. When no
   *   usable level remains, a regex that never matches.
   */
  buildHeaderPattern(headerLevels) {
    const requested = Array.isArray(headerLevels) ? headerLevels : [];
    const levels = [...new Set(requested)]
      .filter((level) => Number.isInteger(level) && level >= 1 && level <= 6)
      .sort((a, b) => a - b);
    if (levels.length === 0) {
      return /(?!)/gm; // empty negative lookahead: never matches
    }
    const alternatives = levels.map((level) => `#{${level}}`).join("|");
    return new RegExp(`^(${alternatives})[ \\t]+(.+)$`, "gm");
  }

  /**
   * Locate closed fenced code blocks (``` or ~~~ fences).
   *
   * A fence opens on a line that starts (after at most three spaces) with three or
   * more backticks or tildes, and closes on a later line that starts with at least
   * as many of the same character and has nothing else on it. A fence that is never
   * closed is not reported.
   *
   * @param {string} text
   * @returns {Array<{start: number, end: number}>} Character ranges, `end` exclusive,
   *   in ascending order.
   */
  findFencedRanges(text) {
    const FENCE_RE = /^ {0,3}(`{3,}|~{3,})/;
    const ranges = [];
    let open = null;
    let offset = 0;
    for (const line of text.split("\n")) {
      const fence = FENCE_RE.exec(line);
      if (fence) {
        const marker = fence[1];
        const rest = line.slice(fence[0].length);
        if (open === null) {
          // A backtick "fence" with more backticks later on the line is an inline
          // code span, not a block opener.
          const isInlineSpan = marker[0] === "`" && rest.includes("`");
          if (!isInlineSpan) {
            open = { start: offset, char: marker[0], length: marker.length };
          }
        } else if (marker[0] === open.char && marker.length >= open.length && rest.trim() === "") {
          ranges.push({ start: open.start, end: offset + line.length });
          open = null;
        }
      }
      offset += line.length + 1; // +1 for the "\n" consumed by split
    }
    return ranges;
  }

  /**
   * Cut `text` into sections at every header match.
   *
   * Header-looking lines inside closed fenced code blocks (for example `# comment`
   * in a shell snippet) are ignored so a code block is never torn apart.
   *
   * @param {string} text
   * @param {RegExp} headerPattern - Global regex; group 1 must capture the `#` run.
   * @param {boolean} _includeHeader - Unused; kept for interface compatibility.
   * @returns {Array<{header: string|null, content: string, level: number|null}>}
   *   Sections with non-blank, trimmed content. Text before the first header has
   *   `header: null`.
   */
  splitByHeaders(text, headerPattern, _includeHeader) {
    const sections = [];
    const fencedRanges = this.findFencedRanges(text);
    let fenceIdx = 0;
    let lastIndex = 0;
    let match;
    let currentHeader = null;
    let currentLevel = null;

    // Reset regex
    headerPattern.lastIndex = 0;
    while ((match = headerPattern.exec(text)) !== null) {
      // Matches and ranges both ascend, so one forward-only cursor is enough.
      while (fenceIdx < fencedRanges.length && fencedRanges[fenceIdx].end <= match.index) {
        fenceIdx++;
      }
      const insideFence = fenceIdx < fencedRanges.length && fencedRanges[fenceIdx].start <= match.index;
      if (insideFence) {
        continue;
      }

      // Content before this header
      if (match.index > lastIndex) {
        const content = text.slice(lastIndex, match.index);
        if (content.trim()) {
          sections.push({
            header: currentHeader,
            content: content.trim(),
            level: currentLevel,
          });
        }
      }
      currentHeader = match[0];
      currentLevel = match[1].length; // Number of # characters
      lastIndex = match.index + match[0].length;
    }

    // Don't forget content after the last header
    if (lastIndex < text.length) {
      const content = text.slice(lastIndex);
      if (content.trim()) {
        sections.push({
          header: currentHeader,
          content: content.trim(),
          level: currentLevel,
        });
      }
    }

    // If no headers found, return entire text as one section
    if (sections.length === 0 && text.trim()) {
      sections.push({
        header: null,
        content: text.trim(),
        level: null,
      });
    }
    return sections;
  }

  /**
   * Split one section's content into pieces of at most `maxSize` characters,
   * keeping markdown tables intact where possible.
   *
   * @param {string} content
   * @param {number} maxSize - Clamped to at least 1.
   * @param {number} overlap - Clamped to the range `[0, maxSize - 1]`.
   * @returns {string[]}
   */
  splitContent(content, maxSize, overlap) {
    const effectiveMaxSize = Math.max(maxSize, 1);
    const effectiveOverlap = Math.min(Math.max(overlap, 0), effectiveMaxSize - 1);
    if (content.length <= effectiveMaxSize) {
      return [content];
    }
    // Use table-aware splitting
    const lines = content.split("\n");
    const tableRanges = this.detectTableRanges(lines);
    if (tableRanges.length > 0) {
      return this.splitContentTableAware(content, lines, tableRanges, effectiveMaxSize, effectiveOverlap);
    }
    return this.splitPlainContent(content, effectiveMaxSize, effectiveOverlap);
  }

  /**
   * Detect contiguous table blocks in lines.
   * Returns array of { start, end } line index ranges (inclusive).
   *
   * A table starts at a pipe-prefixed line immediately followed by a separator
   * line, and extends over every following pipe-prefixed line.
   *
   * @param {string[]} lines
   * @returns {Array<{start: number, end: number}>}
   */
  detectTableRanges(lines) {
    // Simple pipe-prefixed line check (single character class — no backtracking)
    const TABLE_ROW_RE = /^\|[^\r\n]{1,10000}/;
    // Per-cell separator regex applied AFTER splitting on "|" — safe because
    // each cell is short and bounded by pipe delimiters (CodeQL: js/polynomial-redos)
    const SEPARATOR_CELL_RE = /^[\t ]*:?-+:?[\t ]*$/;
    const ranges = [];
    let i = 0;
    while (i < lines.length) {
      const currentLine = lines[i];
      const separatorLine = lines[i + 1];
      if (
        i + 1 < lines.length &&
        currentLine !== undefined &&
        separatorLine !== undefined &&
        TABLE_ROW_RE.test(currentLine) &&
        this.isTableSeparator(separatorLine, SEPARATOR_CELL_RE)
      ) {
        const start = i;
        i += 2;
        while (i < lines.length) {
          const row = lines[i];
          if (row === undefined || !TABLE_ROW_RE.test(row)) {
            break;
          }
          i++;
        }
        ranges.push({ start, end: i - 1 });
      } else {
        i++;
      }
    }
    return ranges;
  }

  /**
   * Check if a line is a markdown table separator (e.g. |---|---|).
   *
   * @param {string} line
   * @param {RegExp} cellRe - Regex every cell between the pipes must satisfy.
   * @returns {boolean}
   */
  isTableSeparator(line, cellRe) {
    const trimmed = line.trimEnd();
    if (!trimmed.startsWith("|")) {
      return false;
    }
    // Split by "|" → ["", "---", "---", ""] for "|---|---|"
    const cells = trimmed.split("|");
    cells.shift(); // remove leading empty element
    const lastCell = cells.at(-1);
    if (cells.length > 0 && lastCell?.trim() === "") {
      cells.pop(); // remove trailing empty element
    }
    if (cells.length === 0) {
      return false;
    }
    return cells.every((cell) => cellRe.test(cell));
  }

  /**
   * Split content while preserving markdown tables.
   *
   * The lines are grouped into alternating prose and table segments. Segments are
   * packed greedily into chunks of at most `maxSize`. A table that fits is never
   * split; one that does not is split on row boundaries.
   *
   * @param {string} content - Original content, returned as-is if nothing was produced.
   * @param {string[]} lines - `content` split on "\n".
   * @param {Array<{start: number, end: number}>} tableRanges - Inclusive line ranges of tables.
   * @param {number} maxSize
   * @param {number} overlap - Applied to prose segments only.
   * @returns {string[]}
   */
  splitContentTableAware(content, lines, tableRanges, maxSize, overlap) {
    // Build segments: alternating non-table and table blocks
    const segments = [];
    let lineIdx = 0;
    for (const range of tableRanges) {
      if (lineIdx < range.start) {
        const text = lines.slice(lineIdx, range.start).join("\n").trim();
        if (text) {
          segments.push({ text, isTable: false });
        }
      }
      const tableText = lines.slice(range.start, range.end + 1).join("\n");
      segments.push({ text: tableText, isTable: true });
      lineIdx = range.end + 1;
    }
    if (lineIdx < lines.length) {
      const text = lines.slice(lineIdx).join("\n").trim();
      if (text) {
        segments.push({ text, isTable: false });
      }
    }

    const result = [];
    let current = "";
    for (const seg of segments) {
      if (!seg.isTable) {
        const pieces = this.splitPlainContent(seg.text, maxSize, overlap);
        for (const piece of pieces) {
          if (current.length === 0) {
            current = piece;
          } else if (current.length + 1 + piece.length <= maxSize) {
            current += "\n" + piece;
          } else {
            result.push(current);
            current = piece;
          }
        }
      } else if (seg.text.length <= maxSize) {
        if (current.length === 0) {
          current = seg.text;
        } else if (current.length + 2 + seg.text.length <= maxSize) {
          current += "\n\n" + seg.text;
        } else {
          result.push(current);
          current = seg.text;
        }
      } else {
        // Oversized table: flush what is pending, then split the table by rows.
        if (current) {
          result.push(current);
          current = "";
        }
        const tableChunks = this.splitTableByRows(seg.text, maxSize);
        result.push(...tableChunks);
      }
    }
    if (current) {
      result.push(current);
    }
    return result.length > 0 ? result : [content];
  }

  /**
   * Split a table on row boundaries, repeating header + separator in each chunk.
   *
   * @param {string} tableText - Table source: header row, separator row, data rows.
   * @param {number} maxSize
   * @returns {string[]} Table chunks. A single data row that cannot fit is emitted
   *   on its own (with the header) and may exceed `maxSize`.
   */
  splitTableByRows(tableText, maxSize) {
    const rows = tableText.split("\n");
    if (rows.length < 3) {
      return [tableText];
    }
    const headerRow = rows[0] ?? "";
    const separatorRow = rows[1] ?? "";
    const headerBlock = headerRow + "\n" + separatorRow;
    const dataRows = rows.slice(2);
    if (headerBlock.length > maxSize) {
      // Even the header does not fit: fall back to plain character splitting.
      return this.splitPlainContent(tableText, maxSize, 0);
    }

    const chunks = [];
    let currentChunk = headerBlock;
    for (const row of dataRows) {
      // Guard: single row exceeds budget — flush and emit as standalone chunk
      const singleRowChunk = `${headerBlock}\n${row}`;
      if (singleRowChunk.length > maxSize) {
        if (currentChunk.length > headerBlock.length) {
          chunks.push(currentChunk);
        }
        chunks.push(singleRowChunk);
        currentChunk = headerBlock;
        continue;
      }
      const candidate = currentChunk + "\n" + row;
      if (candidate.length <= maxSize) {
        currentChunk = candidate;
      } else {
        if (currentChunk.length > headerBlock.length) {
          chunks.push(currentChunk);
        }
        currentChunk = headerBlock + "\n" + row;
      }
    }
    if (currentChunk.length > headerBlock.length) {
      chunks.push(currentChunk);
    }
    return chunks.length > 0 ? chunks : [tableText];
  }

  /**
   * Split text into windows of at most `maxSize` characters, preferring to end a
   * window at a paragraph break, or failing that a sentence break, found in the
   * window's last 200 characters.
   *
   * @param {string} content
   * @param {number} maxSize
   * @param {number} overlap - Characters of the previous window repeated at the
   *   start of the next one.
   * @returns {string[]}
   */
  splitPlainContent(content, maxSize, overlap) {
    if (content.length <= maxSize) {
      return [content];
    }
    const chunks = [];
    let start = 0;
    while (start < content.length) {
      let end = Math.min(start + maxSize, content.length);
      // Try to break at a paragraph or sentence boundary
      if (end < content.length) {
        const searchStart = Math.max(start, end - 200);
        const searchText = content.slice(searchStart, end);
        // A boundary at or before this offset lies inside the overlap region, so
        // using it would advance the window by only one character.
        const minEnd = start + overlap;
        // Look for paragraph break first
        const paragraphBreak = searchText.lastIndexOf("\n\n");
        if (paragraphBreak > 0 && searchStart + paragraphBreak > minEnd) {
          end = searchStart + paragraphBreak;
        } else {
          // Look for sentence break
          const sentenceBreak = searchText.search(/[.!?]\s+[A-Z]/);
          if (sentenceBreak > 0 && searchStart + sentenceBreak + 1 > minEnd) {
            end = searchStart + sentenceBreak + 1;
          }
        }
      }
      chunks.push(content.slice(start, end));
      // Stop once the tail has been emitted. Stepping back by `overlap` from the
      // very end would otherwise re-emit ever-shorter duplicates of the tail.
      if (end >= content.length) {
        break;
      }
      start = Math.max(start + 1, end - overlap);
    }
    return chunks;
  }

  /**
   * Remove common markdown syntax, keeping the visible text.
   *
   * Fenced code blocks are dropped entirely. Headers, bold, italic, inline code,
   * images and links are reduced to their text.
   *
   * @param {string} text
   * @returns {string}
   */
  stripMarkdown(text) {
    return text
      // Fenced blocks must go before inline code, or the inline pattern eats the fences.
      .replace(/```[\s\S]*?```/g, "") // Code blocks
      .replace(/^#+\s+/gm, "") // Headers
      .replace(/\*\*(.+?)\*\*/g, "$1") // Bold
      .replace(/\*(.+?)\*/g, "$1") // Italic
      .replace(/__(.+?)__/g, "$1") // Bold (underscore)
      .replace(/_(.+?)_/g, "$1") // Italic (underscore)
      .replace(/`(.+?)`/g, "$1") // Inline code
      // Images must go before links, or the link pattern leaves a stray "!".
      .replace(/!\[([^\]]*)\]\([^)]+\)/g, "$1") // Images
      .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1"); // Links
  }

  /**
   * Validate chunker options without running the chunker.
   *
   * @param {object} [config] - Options as accepted by `chunk`; a missing config is valid.
   * @returns {{valid: boolean, errors: string[], warnings: string[]}}
   */
  validateConfig(config) {
    const errors = [];
    const warnings = [];
    const mdConfig = config ?? {};
    if (mdConfig.maxSize !== undefined && mdConfig.maxSize <= 0) {
      errors.push("maxSize must be greater than 0");
    }
    if (mdConfig.headerLevels !== undefined) {
      if (mdConfig.headerLevels.length === 0) {
        errors.push("headerLevels must not be empty");
      }
      for (const level of mdConfig.headerLevels) {
        if (!Number.isInteger(level) || level < 1 || level > 6) {
          errors.push(`Invalid header level: ${level}. Must be between 1 and 6`);
        }
      }
    }
    if (mdConfig.overlap !== undefined && mdConfig.overlap < 0) {
      errors.push("overlap must be non-negative");
    }
    if (
      mdConfig.overlap !== undefined &&
      mdConfig.maxSize !== undefined &&
      mdConfig.overlap >= mdConfig.maxSize
    ) {
      errors.push("overlap must be less than maxSize");
    }
    return {
      valid: errors.length === 0,
      errors,
      warnings,
    };
  }
}