UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio

315 lines 12.5 kB
/** * Markdown Chunker * * Splits markdown content by headers and structural elements. * Preserves markdown tables by detecting table boundaries and splitting * on row boundaries when a table exceeds the max chunk size. */ import { BaseChunker, DEFAULT_CHUNKER_CONFIG } from "./BaseChunker.js"; /** Matches a markdown table separator row like |---|---| or |:--:|---:| */ const TABLE_SEPARATOR_RE = /^\|[\s:]*-+[\s:]*(\|[\s:]*-+[\s:]*)*\|?\s*$/; /** Matches a line that looks like a table row (starts with |) */ const TABLE_ROW_RE = /^\|.+\|?\s*$/; /** * Detect contiguous table blocks in text. * Returns an array of { start, end } line index ranges (inclusive). * A table is a sequence of lines where the second line is a separator. */ function detectTableRanges(lines) { const ranges = []; let i = 0; while (i < lines.length) { const currentLine = lines[i]; const separatorLine = lines[i + 1]; // A table needs at least a header row + separator if (i + 1 < lines.length && currentLine !== undefined && separatorLine !== undefined && TABLE_ROW_RE.test(currentLine) && TABLE_SEPARATOR_RE.test(separatorLine)) { const start = i; // Advance past header + separator i += 2; // Consume remaining data rows while (i < lines.length) { const row = lines[i]; if (row === undefined || !TABLE_ROW_RE.test(row)) { break; } i++; } ranges.push({ start, end: i - 1 }); } else { i++; } } return ranges; } /** * Markdown Chunker */ export class MarkdownChunker extends BaseChunker { strategy = "markdown"; getDefaultConfig() { return { ...DEFAULT_CHUNKER_CONFIG, maxSize: 1000, overlap: 50, }; } async doChunk(content, config) { const maxSize = config.maxSize ?? 1000; // Split by headers const headerPattern = /^(#{1,6})\s+(.+)$/gm; const sections = []; let lastIndex = 0; let match = headerPattern.exec(content); while (match !== null) { // Add content before this header if (match.index > lastIndex) { const prevContent = content.slice(lastIndex, match.index).trim(); if (prevContent && sections.length > 0) { const lastSection = sections[sections.length - 1]; if (lastSection) { lastSection.content += "\n\n" + prevContent; } } else if (prevContent) { sections.push({ header: "", content: prevContent, level: 0 }); } } sections.push({ header: match[0], content: "", level: match[1]?.length ?? 1, }); lastIndex = match.index + match[0].length; match = headerPattern.exec(content); } // Add remaining content if (lastIndex < content.length) { const remaining = content.slice(lastIndex).trim(); if (remaining) { if (sections.length > 0) { const lastSection = sections[sections.length - 1]; if (lastSection) { lastSection.content += remaining; } } else { sections.push({ header: "", content: remaining, level: 0 }); } } } // Convert sections to chunks const chunks = []; let offset = 0; for (let i = 0; i < sections.length; i++) { const section = sections[i]; if (!section) { continue; } const fullContent = section.header ? section.header + "\n\n" + section.content.trim() : section.content.trim(); if (!fullContent) { continue; } // Split if too large — use table-aware splitting if (fullContent.length > maxSize) { const subChunks = this.splitContentTableAware(fullContent, maxSize); for (const sub of subChunks) { const startOffset = content.indexOf(sub, offset); chunks.push(this.createChunk(sub, chunks.length, startOffset >= 0 ? startOffset : offset, startOffset >= 0 ? startOffset + sub.length : offset + sub.length, "unknown", { sectionContext: section.header })); if (startOffset >= 0) { offset = startOffset + sub.length; } } } else { const startOffset = content.indexOf(fullContent, offset); chunks.push(this.createChunk(fullContent, chunks.length, startOffset >= 0 ? startOffset : offset, startOffset >= 0 ? startOffset + fullContent.length : offset + fullContent.length, "unknown", { sectionContext: section.header })); if (startOffset >= 0) { offset = startOffset + fullContent.length; } } } return chunks; } /** * Split content while preserving markdown tables. * * Strategy: * 1. Identify table blocks in the content. * 2. Split content into segments: non-table text and table blocks. * 3. Non-table text is split using paragraph/sentence boundaries (existing logic). * 4. Tables that fit in a chunk are kept intact. * 5. Oversized tables are split on row boundaries, repeating the header row. */ splitContentTableAware(content, maxSize) { const lines = content.split("\n"); const tableRanges = detectTableRanges(lines); // If no tables, fall back to existing splitting logic if (tableRanges.length === 0) { return this.splitPlainContent(content, maxSize, this.config.overlap ?? 0); } // Build segments: alternating non-table and table blocks const segments = []; let lineIdx = 0; for (const range of tableRanges) { // Non-table text before this table if (lineIdx < range.start) { const text = lines.slice(lineIdx, range.start).join("\n").trim(); if (text) { segments.push({ text, isTable: false }); } } // The table itself const tableText = lines.slice(range.start, range.end + 1).join("\n"); segments.push({ text: tableText, isTable: true }); lineIdx = range.end + 1; } // Trailing non-table text if (lineIdx < lines.length) { const text = lines.slice(lineIdx).join("\n").trim(); if (text) { segments.push({ text, isTable: false }); } } // Now produce chunks, trying to pack segments together up to maxSize const result = []; let current = ""; for (const seg of segments) { if (!seg.isTable) { // Non-table text: try to append, split if needed const pieces = this.splitPlainContent(seg.text, maxSize, this.config.overlap ?? 0); for (const piece of pieces) { if (current.length === 0) { current = piece; } else if (current.length + 1 + piece.length <= maxSize) { current += "\n" + piece; } else { result.push(current); current = piece; } } } else { // Table block if (seg.text.length <= maxSize) { // Table fits — try to append to current chunk if (current.length === 0) { current = seg.text; } else if (current.length + 2 + seg.text.length <= maxSize) { current += "\n\n" + seg.text; } else { result.push(current); current = seg.text; } } else { // Oversized table — flush current, then split table on row boundaries if (current) { result.push(current); current = ""; } const tableChunks = this.splitTableByRows(seg.text, maxSize); result.push(...tableChunks); } } } if (current) { result.push(current); } return result.length > 0 ? result : [content]; } /** * Split a table on row boundaries, repeating header + separator in each chunk. */ splitTableByRows(tableText, maxSize) { const rows = tableText.split("\n"); if (rows.length < 3) { // Not a proper table (need header + separator + at least 1 data row) return [tableText]; } const headerRow = rows[0] ?? ""; const separatorRow = rows[1] ?? ""; const headerBlock = headerRow + "\n" + separatorRow; const dataRows = rows.slice(2); // If even the header doesn't fit, fall back to size-based split if (headerBlock.length > maxSize) { return this.splitPlainContent(tableText, maxSize, this.config.overlap ?? 0); } const chunks = []; let currentChunk = headerBlock; for (const row of dataRows) { // Guard: single row exceeds budget — flush and emit as standalone chunk const singleRowChunk = `${headerBlock}\n${row}`; if (singleRowChunk.length > maxSize) { if (currentChunk.length > headerBlock.length) { chunks.push(currentChunk); } chunks.push(singleRowChunk); currentChunk = headerBlock; continue; } const candidate = currentChunk + "\n" + row; if (candidate.length <= maxSize) { currentChunk = candidate; } else { // Flush current chunk (skip if it only contains the header) if (currentChunk.length > headerBlock.length) { chunks.push(currentChunk); } // Start new chunk with header repeated currentChunk = headerBlock + "\n" + row; } } if (currentChunk.length > headerBlock.length) { chunks.push(currentChunk); } return chunks.length > 0 ? chunks : [tableText]; } /** * Split non-table text using paragraph and sentence boundaries. * This is the original splitContent logic extracted for reuse. */ splitPlainContent(content, maxSize, overlap = 0) { if (content.length <= maxSize) { return [content]; } const chunks = []; let start = 0; while (start < content.length) { let end = Math.min(start + maxSize, content.length); if (end < content.length) { const searchStart = Math.max(start, end - 200); const searchText = content.slice(searchStart, end); // Look for paragraph break first const paragraphBreak = searchText.lastIndexOf("\n\n"); if (paragraphBreak > 0) { end = searchStart + paragraphBreak; } else { // Look for sentence break const sentenceBreak = searchText.search(/[.!?]\s+[A-Z]/); if (sentenceBreak > 0) { end = searchStart + sentenceBreak + 1; } } } chunks.push(content.slice(start, end)); start = Math.max(start + 1, end - overlap); } return chunks; } } //# sourceMappingURL=MarkdownChunker.js.map