UNPKG

chunkdown

Version:

A tree-based markdown text splitter that understands document structure to create semantically meaningful chunks for RAG applications

1,478 lines (1,467 loc) 42.4 kB
import { fromMarkdown } from "mdast-util-from-markdown"; import { gfmFromMarkdown, gfmToMarkdown } from "mdast-util-gfm"; import { toMarkdown } from "mdast-util-to-markdown"; import { gfm } from "micromark-extension-gfm"; import { visit } from "unist-util-visit"; import { toString } from "mdast-util-to-string"; //#region src/markdown.ts const fromMarkdown$1 = (value) => { return fromMarkdown(value, { extensions: [gfm()], mdastExtensions: [gfmFromMarkdown()] }); }; const toMarkdown$1 = (tree) => { return toMarkdown(tree, { resourceLink: true, extensions: [gfmToMarkdown({ tablePipeAlign: false })] }); }; /** * Apply transform functions to nodes in the tree based on configured rules. * Transforms are applied in a single pass through the tree. */ function applyTransformations(tree, rules) { const transformerMap = /* @__PURE__ */ new Map(); for (const [type, rule] of Object.entries(rules)) if (rule?.transform) transformerMap.set(type, rule.transform); if (transformerMap.size === 0) return tree; if (transformerMap.has("formatting")) { const formatting = transformerMap.get("formatting"); transformerMap.set("strong", transformerMap.get("strong") ?? formatting); transformerMap.set("emphasis", transformerMap.get("emphasis") ?? formatting); transformerMap.set("delete", transformerMap.get("delete") ?? formatting); transformerMap.delete("formatting"); } visit(tree, Array.from(transformerMap.keys()), (node, index, parent) => { const transform = transformerMap.get(node.type); if (transform && parent && typeof index === "number") { const result = transform(node, { parent, index, root: tree }); if (result) parent.children[index] = result; if (result === null) { parent.children.splice(index, 1); return index; } } }); return tree; } /** * Preprocess a markdown tree based on the provided splitter options. */ function preprocessMarkdown(tree, options) { let normalizedTree = tree; if (options.rules) { normalizedTree = normalizeReferences(normalizedTree, options.rules); normalizedTree = applyTransformations(normalizedTree, options.rules); } return normalizedTree; } /** * Normalize reference-style links and images to inline style. */ function normalizeReferences(tree, rules) { const nodeTypes = []; if (rules.link?.style === "inline") nodeTypes.push("linkReference"); if (rules.image?.style === "inline") nodeTypes.push("imageReference"); if (nodeTypes.length === 0) return tree; const definitions = /* @__PURE__ */ new Map(); visit(tree, "definition", (node) => { const id = node.identifier.toLowerCase(); definitions.set(id, node); }); if (definitions.size === 0) return tree; const usedDefinitions = /* @__PURE__ */ new Set(); visit(tree, nodeTypes, (node, index, parent) => { if (!parent || typeof index !== "number") return; if (node.type === "linkReference") { const linkRef = node; const id = linkRef.identifier.toLowerCase(); const def = definitions.get(id); if (def && rules.link?.style === "inline") { const link = { type: "link", url: def.url, title: def.title, children: linkRef.children, position: linkRef.position }; parent.children[index] = link; usedDefinitions.add(id); } } else if (node.type === "imageReference") { const imageRef = node; const id = imageRef.identifier.toLowerCase(); const def = definitions.get(id); if (def && rules.image?.style === "inline") { const image = { type: "image", url: def.url, title: def.title, alt: imageRef.alt, position: imageRef.position }; parent.children[index] = image; usedDefinitions.add(id); } } }); if (usedDefinitions.size > 0) tree.children = tree.children.filter((node) => { if (node.type !== "definition") return true; const id = node.identifier.toLowerCase(); return !usedDefinitions.has(id); }); return tree; } //#endregion //#region src/ast.ts /** * Transform a flat mdast AST into a hierarchical structure where headings * contain their associated content and nested subsections. */ const createHierarchicalAST = (root) => { /** * Transform nodes into hierarchical sections using a simple iterative approach * Groups consecutive non-section children into orphaned sections (depth 0, heading undefined) */ const transform = (nodes) => { const result = []; let i = 0; while (i < nodes.length) { const node = nodes[i]; if (node.type === "heading") { /** * Start a new section */ const section = { type: "section", depth: node.depth, heading: node, children: [] }; /** * Move past the heading */ i++; /** * Collect all content until we hit a heading of same or higher level or a thematic break */ while (i < nodes.length) { const nextNode = nodes[i]; if (nextNode.type === "heading" && nextNode.depth <= node.depth) /** * Found a heading at same or higher level - stop collecting */ break; if (nextNode.type === "thematicBreak") { /** * Found a thematic break - include it at the end of this section * The thematic break logically closes the section */ section.children.push(nextNode); i++; break; } /** * Add this content to the section */ section.children.push(nextNode); i++; } section.children = transform(section.children.filter((child) => !isSection(child))); result.push(section); } else { /** * Regular non-heading content (including orphaned thematic breaks) */ result.push(node); i++; } } return result; }; const sections = transform(root.children); /** * Group consecutive non-section children into orphaned sections */ const groupedSections = []; let orphanedContent = []; for (const child of sections) if (isSection(child)) { /** * If we have accumulated orphaned content, create an orphaned section */ if (orphanedContent.length > 0) { const orphanedSection = { type: "section", depth: 0, heading: void 0, children: orphanedContent }; groupedSections.push(orphanedSection); orphanedContent = []; } /** * Add the regular section */ groupedSections.push(child); } else /** * Add orphaned content */ orphanedContent.push(child); /** * Add any remaining orphaned content to the grouped sections */ if (orphanedContent.length > 0) { const orphanedSection = { type: "section", depth: 0, heading: void 0, children: orphanedContent }; groupedSections.push(orphanedSection); } return { type: "root", children: groupedSections }; }; /** * Check if a node is a Section */ const isSection = (node) => { return node?.type === "section"; }; /** * Create a root node from a single node or an array of nodes. * If the node is already a root node, it is returned as is. * If the node is an array of nodes, a root node is created with the nodes as children. */ const createTree = (nodes) => { if (Array.isArray(nodes)) return { type: "root", children: nodes }; if (nodes.type === "root") return nodes; return createTree([nodes]); }; /** * Create a section node from a partial section. */ const createSection = (section) => { return { type: "section", depth: section.depth ?? 0, heading: section.heading, children: section.children ?? [] }; }; /** * Convert hierarchical AST back to flat structure */ const flattenHierarchicalAST = (ast) => { const flatten = (nodes) => { const result = []; for (const node of nodes) if (isSection(node)) { if (node.heading) result.push(node.heading); result.push(...flatten(node.children)); } else result.push(node); return result; }; return { type: "root", children: flatten(ast.children) }; }; //#endregion //#region src/size.ts /** * Calculate the content size of markdown content or AST node * Uses the actual text content without markdown formatting characters * * @param input - The markdown text or AST node to measure * @returns The size of the actual text content (without formatting) */ const getContentSize = (input) => { if (!input) return 0; if (typeof input === "string") return getContentSize(fromMarkdown$1(input)); return toString(input).length; }; const getSectionSize = (section) => { let totalLength = 0; if (section.heading) totalLength = getContentSize(section.heading); for (const child of section.children) if (isSection(child)) totalLength += getSectionSize(child); else totalLength += getContentSize(child); return totalLength; }; /** * Split chunks by maxRawSize limit as a hard constraint on raw markdown length * Splits chunks that exceed the limit, preferring whitespace boundaries * * @param chunks - Array of markdown chunks * @param maxRawSize - Maximum raw character length per chunk * @returns Generator yielding chunks with no chunk exceeding maxRawSize */ function* splitByMaxRawSize(chunks, maxRawSize) { for (const chunk of chunks) { if (chunk.length <= maxRawSize) { yield chunk; continue; } let remaining = chunk; while (remaining.length > maxRawSize) { let splitPos = maxRawSize; let foundWhitespace = false; for (let i = maxRawSize - 1; i >= Math.floor(maxRawSize * .8); i--) if (/\s/.test(remaining[i])) { splitPos = i; foundWhitespace = true; break; } const splitChunk = remaining.substring(0, splitPos); remaining = foundWhitespace ? remaining.substring(splitPos).trim() : remaining.substring(splitPos); const trimmedChunk = splitChunk.trim(); if (trimmedChunk.length > 0) yield trimmedChunk; } if (remaining.length > 0) yield remaining; } } //#endregion //#region src/splitters/base.ts /** * Abstract base class for node splitters */ var AbstractNodeSplitter = class { options; chunkSize; maxOverflowRatio; maxAllowedSize; maxRawSize; splitRules; constructor(options) { this.options = options; this.chunkSize = options.chunkSize; this.maxOverflowRatio = Math.max(1, options.maxOverflowRatio ?? 1); this.maxAllowedSize = this.chunkSize * this.maxOverflowRatio; this.maxRawSize = options.maxRawSize; /** * Normalize all split rules from SimpleSplitRule to ComplexSplitRule */ this.splitRules = {}; if (options.rules) for (const nodeType in options.rules) { const key = nodeType; const nodeRule = options.rules[key]; if (nodeRule?.split) if (nodeRule.split === "never-split") this.splitRules[key] = { rule: "never-split" }; else if (nodeRule.split === "allow-split") this.splitRules[key] = { rule: "allow-split" }; else this.splitRules[key] = nodeRule.split; } } /** * Check if a node can be split based on its split rule */ canSplitNode(node) { let splitRule = this.splitRules[node.type]; /** * Formatting nodes can also be configured with the formatting split rule */ if (!splitRule && (node.type === "strong" || node.type === "emphasis" || node.type === "delete")) splitRule = this.splitRules.formatting; /** * No rule defaults to allow splitting */ if (!splitRule) return true; /** * Never split the node even if it exceeds the size limit */ if (splitRule.rule === "never-split") return false; /** * Allow splitting the node */ if (splitRule.rule === "allow-split") return true; /** * Protected the node up to the size limit */ if (splitRule.rule === "size-split") return getContentSize(node) > splitRule.size; /** * Default to allow splitting */ return true; } }; //#endregion //#region src/splitters/blockquote.ts /** * Blockquote splitter */ var BlockquoteSplitter = class extends AbstractNodeSplitter { splitRule; constructor(options) { super(options); this.splitRule = this.splitRules.blockquote; } splitText(text) { const blockquote = fromMarkdown$1(text).children[0]; if (blockquote.type !== "blockquote") throw new Error("Text is not a blockquote"); return this.splitNode(blockquote).map((chunk) => toMarkdown$1(chunk).trim()); } splitNode(blockquote) { const subBlockquotes = []; for (const subBlockquote of this.splitBlockquote(blockquote)) subBlockquotes.push(subBlockquote); return subBlockquotes; } *splitBlockquote(blockquote) { if (!this.canSplitNode(blockquote)) { yield blockquote; return; } let subBlockquote = { ...blockquote, children: [] }; let subBlockquoteSize = 0; for (const block of blockquote.children) { const blockSize = getContentSize({ ...blockquote, children: [block] }); /** * If the current sub-blockquote is too large, yield it and start a new sub-blockquote */ if (subBlockquoteSize + blockSize > this.maxAllowedSize) { if (subBlockquote.children.length > 0) yield subBlockquote; subBlockquote = { ...blockquote, children: [] }; subBlockquoteSize = 0; } /** * If the current block is too large, split it and yield the chunks */ if (blockSize > this.maxAllowedSize) { for (const subBlock of this.splitBlock(blockquote, block)) yield subBlock; subBlockquote = { ...blockquote, children: [] }; subBlockquoteSize = 0; continue; } /** * If the current block fits, add it to the sub-blockquote */ subBlockquote.children.push(block); subBlockquoteSize += blockSize; } /** * If there are any remaining blocks in the sub-blockquote, yield it */ if (subBlockquote.children.length > 0) yield subBlockquote; } *splitBlock(blockquote, block) { /** * Convert the block to a tree */ const blockTree = { type: "root", children: [block] }; const blockChunks = new TreeSplitter(this.options).splitNode(blockTree); /** * Wrap each chunk back into blockquote and yield it */ for (const chunk of blockChunks) { if (!("children" in chunk) || chunk.children.length === 0) continue; yield { ...blockquote, children: chunk.children }; } } }; //#endregion //#region src/splitters/list.ts /** * List splitter */ var ListSplitter = class extends AbstractNodeSplitter { splitRule; constructor(options) { super(options); this.splitRule = this.splitRules.list; } splitText(text) { const list = fromMarkdown$1(text).children[0]; if (list.type !== "list") throw new Error("Text is not a list"); return this.splitNode(list).map((chunk) => toMarkdown$1(chunk).trim()); } splitNode(node) { const nodes = []; for (const subList of this.splitList(node)) nodes.push(subList); return nodes; } *splitList(list) { if (!this.canSplitNode(list)) { yield list; return; } let subList = { ...list, children: [] }; let subListSize = 0; const listOriginalStart = list.start || 1; let listItemIndex = 0; for (const listItem of list.children) { const listItemSize = getContentSize(listItem); /** * If the current sublist is too large, yield it and start a new sublist */ if (subListSize + listItemSize > this.maxAllowedSize) { if (subList.children.length > 0) { if (list.ordered) subList.start = listOriginalStart + listItemIndex; yield subList; /** * Sub list items are all added to the same sub list, so we need to increment the list item index by the number of items in the sub list */ listItemIndex += subList.children.length; } subList = { ...list, children: [] }; subListSize = 0; } /** * If the current list item is too large, split it and yield the chunks */ if (listItemSize > this.maxAllowedSize) { subList = { ...list, children: [] }; subListSize = listItemSize; if (list.ordered) subList.start = listOriginalStart + listItemIndex; for (const subListItem of this.splitListItem(subList, listItem)) yield subListItem; subList = { ...list, children: [] }; subListSize = 0; /** * Sub list items are all added to the same sub list, so we need to increment the list item index by 1 */ listItemIndex += 1; continue; } /** * If the current list item fits, add it to the sublist */ subList.children.push(listItem); subListSize += listItemSize; } /** * If there are any remaining items in the sublist, yield it */ if (subList.children.length > 0) { if (list.ordered) subList.start = listOriginalStart + listItemIndex; yield subList; } } *splitListItem(list, listItem) { /** * Convert the list item to a tree */ const listItemTree = { type: "root", children: listItem.children }; const listItemChunks = new TreeSplitter(this.options).splitNode(listItemTree); for (let i = 0; i < listItemChunks.length; i++) { const chunk = listItemChunks[i]; if (!("children" in chunk) || chunk.children.length === 0) continue; /** * Wrap the first chunk back into list item and yield it. * The remaining chunks are yielded as is. */ if (i === 0) { const subListItem = { ...listItem, children: chunk.children }; yield { ...list, children: [subListItem] }; } else yield createTree(chunk); } } }; //#endregion //#region src/splitters/table.ts /** * Table splitter */ var TableSplitter = class extends AbstractNodeSplitter { splitRule; constructor(options) { super(options); this.splitRule = this.splitRules.table; } splitText(text) { const table = fromMarkdown$1(text).children[0]; if (table.type !== "table") throw new Error("Text is not a table"); return this.splitNode(table).map((chunk) => toMarkdown$1(chunk).trim()); } splitNode(table) { const nodes = []; for (const node of this.splitTable(table)) nodes.push(node); return nodes; } *splitTable(table) { if (!this.canSplitNode(table)) { yield table; return; } if (table.children.length === 0) return; const headerRow = table.children[0]; let subTable = { ...table, children: [headerRow] }; let subTableSize = 0; for (let i = 1; i < table.children.length; i++) { const row = table.children[i]; const rowSize = getContentSize(row); /** * If the current sub-table is too large, yield it and start a new sub-table */ if (subTableSize + rowSize > this.maxAllowedSize) { if (subTable.children.length > 1) yield subTable; subTable = { ...table, children: [headerRow] }; subTableSize = 0; } /** * If the current row is too large, split it by cells and yield mini-tables */ if (rowSize > this.maxAllowedSize) { for (const node of this.splitTableRow(table, headerRow, row)) yield node; subTable = { ...table, children: [headerRow] }; subTableSize = 0; continue; } /** * If the current row fits, add it to the sub-table */ subTable.children.push(row); subTableSize += rowSize; } /** * If there are any remaining rows in the sub-table, yield it */ if (subTable.children.length > 1) yield subTable; } *splitTableRow(table, headerRow, row) { /** * Create mini-tables for each cell, pairing it with its corresponding header cell */ for (let cellIndex = 0; cellIndex < row.children.length; cellIndex++) { const cell = row.children[cellIndex]; const headerCell = headerRow.children[cellIndex]; /** * Create a mini-table with one column: header cell + data cell */ const miniHeaderRow = { type: "tableRow", children: [headerCell] }; const miniDataRow = { type: "tableRow", children: [cell] }; const miniTable = { ...table, children: [miniHeaderRow, miniDataRow] }; /** * If the mini-table (single cell + header) is still too large, * split the cell content using the tree splitter */ if (getContentSize(miniDataRow) > this.maxAllowedSize) for (const node of this.splitTableCell(table, headerCell, cell)) yield node; else yield miniTable; } } *splitTableCell(table, headerCell, cell) { /** * Convert the cell content to a tree structure */ const cellTree = { type: "root", children: cell.children }; const cellChunks = new TreeSplitter(this.options).splitNode(cellTree); /** * Create a mini-table for each chunk with the header cell */ for (const chunk of cellChunks) { if (!("children" in chunk) || chunk.children.length === 0) continue; const miniHeaderRow = { type: "tableRow", children: [headerCell] }; const miniDataRow = { type: "tableRow", children: [{ ...cell, children: chunk.children }] }; yield { ...table, children: [miniHeaderRow, miniDataRow] }; } } }; //#endregion //#region src/splitters/text.ts var TextSplitter = class extends AbstractNodeSplitter { patterns; constructor(options) { super(options); let priority = 0; this.patterns = [ { regex: /\.(?=\n)/g, type: "period_before_newline", priority: priority++ }, { regex: /(?<!^\s*(?:\d+|[a-zA-Z]+|[ivxlcdmIVXLCDM]+))\.\s+(?=[A-Z])/g, type: "period_before_uppercase", priority: priority++ }, { regex: /[?!]+(?=\s|$)/g, type: "question_exclamation", priority: priority++ }, { regex: /(?<!^\s*(?:\d+|[a-zA-Z]+|[ivxlcdmIVXLCDM]+))\.(?!\s*[a-z])(?!\s*\.)(?!\s*\d)/g, type: "period_safe", priority: priority++ }, { regex: /[:;](?=\s)/g, type: "colon_semicolon", priority: priority++ }, { regex: /\([^)]*\)|\[[^\]]*\]|\{[^}]*\}/g, type: "bracket_pairs", priority: priority++ }, { regex: /"[^"]*"|'[^']*'|`[^`]*`|´[^´]*´|'[^']*'|'[^']*'/g, type: "quote_pairs", priority: priority++ }, { regex: /\n/g, type: "line_break", priority: priority++ }, { regex: /,(?=\s)/g, type: "comma", priority: priority++ }, { regex: /\s[–—-]\s/g, type: "dashes", priority: priority++ }, { regex: /\.{3,}/g, type: "ellipsis", priority: priority++ }, { regex: /\./g, type: "period_fallback", priority: priority++ }, { regex: /\s+/g, type: "whitespace", priority: priority++ } ]; } splitText(text) { const ast = fromMarkdown$1(text); return this.splitNode(ast).map((chunk) => toMarkdown$1(chunk).trim()).filter((chunk) => chunk.length > 0); } splitNode(node) { const text = toMarkdown$1(node); const ast = fromMarkdown$1(text); const protectedRanges = this.extractProtectedRangesFromAST(ast); const boundaries = this.extractSemanticBoundaries(text, protectedRanges); const nodes = []; for (const textChunk of this.splitRecursive(text, boundaries, protectedRanges)) { const root = { type: "root", children: [{ type: "html", value: textChunk }] }; nodes.push(root); } return nodes; } /** * Extract protected ranges from markdown AST nodes * Uses mdast position information to identify constructs that should never be split * * @param ast - Parsed mdast AST with position information * @returns Array of protected ranges that must stay together */ extractProtectedRangesFromAST(ast) { const ranges = []; /** * Recursively traverse AST nodes to find inline constructs that need protection */ const traverse = (node) => { /** * Only protect nodes that have position information */ if (!node.position?.start?.offset || node.position?.end?.offset === void 0) { /** * Still traverse children even if this node lacks position info */ if ("children" in node && Array.isArray(node.children)) node.children.forEach(traverse); return; } const start = node.position.start.offset; const end = node.position.end.offset; /** * Protect inline markdown constructs that should never be split */ switch (node.type) { case "link": case "linkReference": case "image": case "imageReference": case "inlineCode": case "emphasis": case "strong": case "delete": case "heading": if (!this.canSplitNode(node)) ranges.push({ start, end, type: node.type }); break; } /** * Recursively traverse children */ if ("children" in node && Array.isArray(node.children)) node.children.forEach(traverse); }; traverse(ast); /** * Sort by start position and merge only truly overlapping ranges */ const sortedRanges = ranges.sort((a, b) => a.start - b.start); const mergedRanges = []; for (const range of sortedRanges) { const lastMerged = mergedRanges[mergedRanges.length - 1]; if (lastMerged && range.start < lastMerged.end) { /** * Only merge truly overlapping ranges (not adjacent ones) */ lastMerged.end = Math.max(lastMerged.end, range.end); lastMerged.type = `${lastMerged.type}+${range.type}`; } else /** * Non-overlapping range - add it as separate range */ mergedRanges.push(range); } return mergedRanges; } /** * Adjust protected ranges for a substring operation * When working with substrings, the protected ranges need to be recalculated * * @param protectedRanges - Original protected ranges * @param substringStart - Start position of the substring in the original text * @param substringEnd - End position of the substring in the original text * @returns Adjusted protected ranges for the substring */ adjustProtectedRangesForSubstring(protectedRanges, substringStart, substringEnd) { const adjustedRanges = []; for (const range of protectedRanges) /** * Only include ranges that intersect with the substring */ if (range.end > substringStart && range.start < substringEnd) { /** * Adjust the range positions relative to the substring */ const adjustedRange = { start: Math.max(0, range.start - substringStart), end: Math.min(substringEnd - substringStart, range.end - substringStart), type: range.type }; /** * Only include valid ranges (where start < end) */ if (adjustedRange.start < adjustedRange.end) adjustedRanges.push(adjustedRange); } return adjustedRanges; } /** * Find all semantic boundaries with text-based pattern matching * Since structural boundaries are handled by hierarchical AST processing, * this function only identifies semantic text boundaries for fine-grained splitting * * @param text - The text to analyze * @param protectedRanges - Ranges that should not be split * @returns Array of boundaries sorted by priority (desc), then position (asc) */ extractSemanticBoundaries(text, protectedRanges) { const boundaries = []; /** * Find all semantic boundaries for each pattern */ for (const pattern of this.patterns) { /** * Reset lastIndex to ensure the regex starts from the beginning * This is important because the regex objects are reused across calls */ pattern.regex.lastIndex = 0; let match; while ((match = pattern.regex.exec(text)) !== null) { const position = match.index + match[0].length; /** * Only add boundary if not protected */ if (!this.isPositionProtected(position, protectedRanges)) boundaries.push({ position, type: pattern.type, priority: pattern.priority }); } } /** * Sort by priority (ascending), then by position (ascending) * This gives us the highest priority boundaries first, in positional order */ return boundaries.sort((a, b) => a.priority !== b.priority ? a.priority - b.priority : a.position - b.position); } /** * Check if a position falls within any protected range using binary search * Protected ranges are sorted by start position, so we can use binary search * * @param position - Position to check * @param protectedRanges - Sorted array of protected ranges * @returns True if position is within any protected range */ isPositionProtected(position, protectedRanges) { /** * For small arrays, linear search is faster */ if (protectedRanges.length < 10) return protectedRanges.some((range) => position > range.start && position < range.end); /** * Binary search for larger arrays */ let left = 0; let right = protectedRanges.length - 1; while (left <= right) { const mid = Math.floor((left + right) / 2); const range = protectedRanges[mid]; if (position > range.start && position < range.end) return true; if (position <= range.start) right = mid - 1; else left = mid + 1; } return false; } /** * Adjust boundary positions for a substring operation * @param boundaries - Original boundaries * @param substringStart - Start position of substring in original text * @param substringEnd - End position of substring in original text * @returns Boundaries adjusted for the substring */ adjustBoundariesForSubstring(boundaries, substringStart, substringEnd) { return boundaries.filter((b) => b.position > substringStart && b.position <= substringEnd).map((b) => ({ ...b, position: b.position - substringStart })); } /** * Recursively split text using boundary priority hierarchy * Iterates through distinct priority levels (each semantic boundary type has unique priority) * Each recursive call uses only boundaries with lower or equal priority than current level * * @param text - The text to split * @param boundaries - Available boundaries sorted by priority desc, position asc * @param protectedRanges - Pre-computed protected ranges from AST * @param originalOffset - Offset of this text in the original document * @returns Generator yielding text chunks */ *splitRecursive(text, boundaries, protectedRanges, originalOffset = 0) { /** * Text fits within limits */ if (getContentSize(text) <= this.maxAllowedSize) { yield text; return; } /** * If no boundaries available, yield as single chunk (protected) */ if (boundaries.length === 0) { yield text; return; } for (const boundary of boundaries) { /** * Get positions within current text bounds (exclude start and end positions) */ const validPositions = boundaries.filter((b) => b.priority === boundary.priority).map((b) => b.position).filter((pos) => pos > 0 && pos < text.length).sort((a, b) => a - b); if (validPositions.length === 0) continue; /** * Generalized boundary selection strategy: * Length=1 => [0], Length=2 => [0,1], Length=3 => [1], Length=4 => [1,2], etc. */ const mid = Math.floor(validPositions.length / 2); /** * Pick the best candidate from the position candidates */ const { position, firstPart, secondPart, firstPartSize, secondPartSize } = (validPositions.length % 2 === 1 ? [mid] : [mid - 1, mid]).map((index) => { const position$1 = validPositions[index]; const firstPart$1 = text.substring(0, position$1); const secondPart$1 = text.substring(position$1); const firstPartSize$1 = getContentSize(firstPart$1); const secondPartSize$1 = getContentSize(secondPart$1); return { position: position$1, firstPart: firstPart$1, secondPart: secondPart$1, firstPartSize: firstPartSize$1, secondPartSize: secondPartSize$1, bothWithinLimits: firstPartSize$1 <= this.maxAllowedSize && secondPartSize$1 <= this.maxAllowedSize, distance: Math.abs(firstPartSize$1 - secondPartSize$1) }; }).sort((a, b) => { /** * Primary: bothWithinLimits */ if (a.bothWithinLimits && !b.bothWithinLimits) return -1; if (!a.bothWithinLimits && b.bothWithinLimits) return 1; /** * Secondary: distance (smaller is better) */ return a.distance - b.distance; })[0]; /** * Calculate actual positions for boundary adjustments */ const firstPartActualStart = 0; const firstPartActualEnd = position; const secondPartActualStart = position; const secondPartActualEnd = text.length; /** * Priority is ascending, so lower or equal priority boundaries for next level */ const lowerPriorityBoundaries = boundaries.filter((b) => b.priority >= boundary.priority); /** * Recursively process first part if needed */ if (firstPartSize <= this.maxAllowedSize) yield firstPart; else { const firstPartRanges = this.adjustProtectedRangesForSubstring(protectedRanges, originalOffset, originalOffset + position); const firstPartBoundaries = this.adjustBoundariesForSubstring(lowerPriorityBoundaries, firstPartActualStart, firstPartActualEnd); yield* this.splitRecursive(firstPart, firstPartBoundaries, firstPartRanges, originalOffset); } /** * Recursively process second part if needed */ if (secondPartSize <= this.maxAllowedSize) yield secondPart; else { const secondPartRanges = this.adjustProtectedRangesForSubstring(protectedRanges, originalOffset + position, originalOffset + text.length); const secondPartBoundaries = this.adjustBoundariesForSubstring(lowerPriorityBoundaries, secondPartActualStart, secondPartActualEnd); yield* this.splitRecursive(secondPart, secondPartBoundaries, secondPartRanges, originalOffset + secondPartActualStart); } /** * Return after yielding chunks from this valid split */ return; } /** * Yield text as single chunk */ yield text; } }; //#endregion //#region src/splitters/tree.ts var TreeSplitter = class extends AbstractNodeSplitter { nodeSplitters; textSplitter; constructor(options) { super(options); /** * Initialize node splitters */ this.nodeSplitters = new Map([ ["list", new ListSplitter(options)], ["table", new TableSplitter(options)], ["blockquote", new BlockquoteSplitter(options)] ]); /** * Text splitter for inline content */ this.textSplitter = new TextSplitter(options); } splitText(text) { const node = fromMarkdown$1(text); return this.splitNode(node).map((chunk) => toMarkdown$1(chunk).trim()).filter((chunk) => chunk.length > 0); } splitNode(node) { /** * Create a hierarchical AST from the root node */ const hierachicalRoot = createHierarchicalAST(createTree(node)); /** * Split the hierarchical AST into chunks */ const chunks = []; for (const chunk of this.splitTree(hierachicalRoot)) /** * If the chunk is a section, flatten it to a root node * Otherwise, return the chunk as is */ if (isSection(chunk)) chunks.push(flattenHierarchicalAST({ type: "root", children: [chunk] })); else chunks.push(chunk); return chunks; } /** * Main generator that splits hierarchical AST into chunks * All children are sections (including orphaned sections created by createHierarchicalAST) */ *splitTree(hierarchicalAST) { for (const section of hierarchicalAST.children) { /** * If the section fits within the allowed size, yield it and continue to the next section */ if (getSectionSize(section) <= this.maxAllowedSize) { yield section; continue; } /** * If the section is too large, split it down intelligently */ yield* this.splitHierarchicalSection(section); } } /** * Splits a hierarchical section, deciding whether to keep it together or break it down. * Uses hierarchical approach with merging optimization to maximize chunk utilization */ *splitHierarchicalSection(section) { /** * Separate immediate content from nested sections */ const immediateContent = []; const nestedSections = []; for (const child of section.children) if (isSection(child)) nestedSections.push(child); else immediateContent.push(child); /** * Create parent section with immediate content if it exists */ const parentSection = immediateContent.length > 0 || section.heading ? createSection({ depth: section.depth, heading: section.heading, children: immediateContent }) : null; /** * If no nested sections, just process the parent */ if (nestedSections.length === 0) { if (parentSection) yield* this.splitSection(parentSection); return; } /** * Try to merge parent with as many child sections as possible */ const parentSize = parentSection ? getSectionSize(parentSection) : 0; if (parentSection && parentSize <= this.maxAllowedSize) { /** * Find consecutive child sections that can merge with parent */ let accumulatedSize = parentSize; let mergeCount = 0; for (const childSection of nestedSections) { const childSize = getSectionSize(childSection); if (accumulatedSize + childSize <= this.maxAllowedSize) { mergeCount++; accumulatedSize += childSize; } else break; } /** * If we can merge some children with parent, do it */ if (mergeCount > 0) { yield createSection({ ...parentSection, children: [...parentSection.children, ...nestedSections.slice(0, mergeCount)] }); /** * Process remaining child sections */ const remainingSections = nestedSections.slice(mergeCount); if (remainingSections.length > 0) yield* this.mergeSiblingSections(remainingSections); return; } } /** * Parent couldn't be merged with children - process separately */ if (parentSection) yield* this.splitSection(parentSection); /** * Process all child sections through sibling merging */ yield* this.mergeSiblingSections(nestedSections); } /** * Splits section content with grouping to maximize chunk utilization * Works for both regular sections (with heading) and orphaned sections (without heading) */ *splitSection(section) { /** * Extract immediate content (non-section children) */ const contentItems = []; for (const child of section.children) if (!isSection(child)) contentItems.push(child); /** * Handle empty sections */ if (contentItems.length === 0) { if (section.heading) /** * Process only heading */ yield* this.splitSubNode(section.heading); return; } let currentItems = []; let currentItemsSize = 0; /** * Start with heading if it exists */ if (section.heading) { currentItems.push(section.heading); currentItemsSize = getContentSize(section.heading); } for (const item of contentItems) { /** * Calculate item size once */ const itemSize = getContentSize(item); const potentialSize = currentItemsSize + itemSize; if (potentialSize <= this.maxAllowedSize) { /** * Item fits - add to current group to maximize utilization */ currentItems.push(item); currentItemsSize = potentialSize; } else { /** * Item doesn't fit - yield current group and handle this item */ if (currentItems.length > 0) { yield createTree(currentItems); currentItems = []; currentItemsSize = 0; } if (itemSize <= this.maxAllowedSize) { /** * Item fits alone - start new group with it */ currentItems = [item]; currentItemsSize = itemSize; } else /** * Item too large even alone - needs further splitting */ yield* this.splitSubNode(item); } } /** * Yield final group */ if (currentItems.length > 0) yield createTree(currentItems); } /** * Splits individual nodes, delegating to specialized splitters when needed */ *splitSubNode(node) { if (getContentSize(node) <= this.maxAllowedSize) yield node; else { /** * Get the appropriate splitter for the node type */ const splitter = this.nodeSplitters.get(node.type); /** * If the splitter exists, split the node and yield the result. * Otherwise, split the node using the text splitter. */ if (splitter) yield* splitter.splitNode(node); else yield* this.textSplitter.splitNode(node); } } /** * Merges sibling sections by grouping consecutive sections that fit within allowed size * Groups siblings at the same hierarchical level to maximize chunk utilization */ *mergeSiblingSections(sections) { let siblings = []; let siblingsSize = 0; /** * Depth of the siblings' parent section. * Use -1 because we are merging sections at the same hierarchical level. */ const siblingsDepth = Math.max(1, sections[0].depth) - 1; for (const section of sections) { const sectionSize = getSectionSize(section); /** * If section is too large by itself, yield current group and process section separately */ if (sectionSize > this.maxAllowedSize) { /** * Yield accumulated group if any */ if (siblings.length > 0) yield createSection({ depth: siblingsDepth, children: siblings }); /** * Process oversized section */ yield* this.splitHierarchicalSection(section); /** * Reset group */ siblings = []; siblingsSize = 0; continue; } /** * If adding this section would exceed limit, yield current group first */ const combinedSize = siblingsSize + sectionSize; if (siblings.length > 0 && combinedSize > this.maxAllowedSize) { yield createSection({ depth: siblingsDepth, children: siblings }); /** * Reset group */ siblings = []; siblingsSize = 0; } /** * Add section to current group */ siblings.push(section); siblingsSize += sectionSize; } /** * Yield remaining group */ if (siblings.length > 0) yield createSection({ depth: siblingsDepth, children: siblings }); } }; //#endregion //#region src/chunkdown.ts /** * Default node rules: * - Links * - Never split * - Normalize to inline style * - Images * - Never split * - Normalize to inline style */ const defaultNodeRules = { link: { split: "never-split", style: "inline" }, image: { split: "never-split", style: "inline" } }; var Chunkdown = class { options; splitter; constructor(options) { this.options = { ...options, maxOverflowRatio: Math.max(1, options.maxOverflowRatio ?? 1) }; this.splitter = new TreeSplitter(this.options); } get chunkSize() { return this.options.chunkSize; } get maxOverflowRatio() { return this.options.maxOverflowRatio ?? 1; } get maxRawSize() { return this.options.maxRawSize; } splitText(text) { const root = fromMarkdown$1(text); const chunks = this.splitNode(root).map((node) => toMarkdown$1(node).trim()).filter((chunk) => chunk.length > 0); if (this.options.maxRawSize !== void 0) return Array.from(splitByMaxRawSize(chunks, this.options.maxRawSize)); return chunks; } splitNode(root) { const preparedRoot = preprocessMarkdown(root, this.options); return this.splitter.splitNode(preparedRoot); } }; /** * Create a new Chunkdown instance. * Applies default node rules if no custom rules are provided. */ const chunkdown = (options) => { const rules = options.rules ?? defaultNodeRules; return new Chunkdown({ ...options, rules }); }; //#endregion export { chunkdown };