chunkdown
Version:
A tree-based markdown text splitter that understands document structure to create semantically meaningful chunks for RAG applications
1,478 lines (1,467 loc) • 42.4 kB
JavaScript
import { fromMarkdown } from "mdast-util-from-markdown";
import { gfmFromMarkdown, gfmToMarkdown } from "mdast-util-gfm";
import { toMarkdown } from "mdast-util-to-markdown";
import { gfm } from "micromark-extension-gfm";
import { visit } from "unist-util-visit";
import { toString } from "mdast-util-to-string";
//#region src/markdown.ts
/**
 * Parse markdown source into an mdast tree with GitHub Flavored Markdown enabled.
 *
 * @param value - Markdown source handed to `fromMarkdown`
 * @returns Whatever `fromMarkdown` returns for the given value and GFM extensions
 */
const fromMarkdown$1 = (value) => {
  const parseOptions = {
    extensions: [gfm()],
    mdastExtensions: [gfmFromMarkdown()]
  };
  return fromMarkdown(value, parseOptions);
};
/**
 * Serialize an mdast tree back to markdown text.
 * Links are always written in resource form (`resourceLink: true`) and GFM
 * tables are emitted without pipe alignment padding.
 *
 * @param tree - mdast node handed to `toMarkdown`
 * @returns Whatever `toMarkdown` returns for the tree and these options
 */
const toMarkdown$1 = (tree) => {
  const serializeOptions = {
    resourceLink: true,
    extensions: [gfmToMarkdown({ tablePipeAlign: false })]
  };
  return toMarkdown(tree, serializeOptions);
};
/**
 * Run the `transform` callbacks configured in `rules` over matching nodes.
 *
 * A rule keyed `formatting` acts as a fallback for `strong`, `emphasis` and
 * `delete` nodes that have no transform of their own. Each transform receives
 * the node plus `{ parent, index, root }`:
 * - a truthy return value replaces the node in its parent,
 * - `null` removes the node (traversal resumes at the same index),
 * - anything else leaves the node untouched.
 *
 * @param tree - Tree to transform; mutated in place
 * @param rules - Map of node type to rule object (only `transform` is read here)
 * @returns The same `tree` instance
 */
function applyTransformations(tree, rules) {
  const transformerMap = /* @__PURE__ */ new Map();
  Object.entries(rules).forEach(([type, rule]) => {
    if (rule?.transform) transformerMap.set(type, rule.transform);
  });
  if (transformerMap.size === 0) return tree;

  const formatting = transformerMap.get("formatting");
  if (formatting) {
    // Expand the umbrella rule without overriding type-specific transforms.
    for (const type of ["strong", "emphasis", "delete"]) {
      transformerMap.set(type, transformerMap.get(type) ?? formatting);
    }
    transformerMap.delete("formatting");
  }

  visit(tree, [...transformerMap.keys()], (node, index, parent) => {
    const transform = transformerMap.get(node.type);
    if (!transform || !parent || typeof index !== "number") return;
    const replacement = transform(node, { parent, index, root: tree });
    if (replacement === null) {
      parent.children.splice(index, 1);
      // Tell the visitor to continue with the sibling that slid into this slot.
      return index;
    }
    if (replacement) parent.children[index] = replacement;
  });
  return tree;
}
/**
 * Prepare a markdown tree for splitting.
 * When `options.rules` is set, references are normalized first and the
 * configured transforms are applied afterwards; otherwise the tree is
 * returned as is.
 *
 * @param tree - Parsed markdown tree
 * @param options - Splitter options; only `rules` is read here
 * @returns The preprocessed tree
 */
function preprocessMarkdown(tree, options) {
  const { rules } = options;
  if (!rules) return tree;
  const withInlineReferences = normalizeReferences(tree, rules);
  return applyTransformations(withInlineReferences, rules);
}
/**
 * Normalize reference-style links and images to inline style.
 *
 * `linkReference` nodes are rewritten to `link` nodes when
 * `rules.link.style === "inline"`, and `imageReference` nodes to `image`
 * nodes when `rules.image.style === "inline"`. References without a matching
 * definition are left untouched.
 *
 * Top-level definitions that were consumed are removed afterwards, unless a
 * reference that was not converted (for example an image reference while only
 * links are inlined) still points at the same identifier.
 *
 * @param tree - Root of the markdown tree; mutated in place
 * @param rules - Splitter rules; only `link.style` and `image.style` are read
 * @returns The same `tree` instance
 */
function normalizeReferences(tree, rules) {
  const nodeTypes = [];
  if (rules.link?.style === "inline") nodeTypes.push("linkReference");
  if (rules.image?.style === "inline") nodeTypes.push("imageReference");
  if (nodeTypes.length === 0) return tree;

  const definitions = /* @__PURE__ */ new Map();
  visit(tree, "definition", (node) => {
    const id = node.identifier.toLowerCase();
    // When several definitions share a label, the first one takes precedence.
    if (!definitions.has(id)) definitions.set(id, node);
  });
  if (definitions.size === 0) return tree;

  const usedDefinitions = /* @__PURE__ */ new Set();
  // Only the types collected in nodeTypes are visited, so every node seen here is to be inlined.
  visit(tree, nodeTypes, (node, index, parent) => {
    if (!parent || typeof index !== "number") return;
    const id = node.identifier.toLowerCase();
    const def = definitions.get(id);
    if (!def) return;
    parent.children[index] = node.type === "linkReference" ? {
      type: "link",
      url: def.url,
      title: def.title,
      children: node.children,
      position: node.position
    } : {
      type: "image",
      url: def.url,
      title: def.title,
      alt: node.alt,
      position: node.position
    };
    usedDefinitions.add(id);
  });
  if (usedDefinitions.size === 0) return tree;

  // References that survived the conversion still need their definitions.
  const stillReferenced = /* @__PURE__ */ new Set();
  visit(tree, ["linkReference", "imageReference"], (node) => {
    stillReferenced.add(node.identifier.toLowerCase());
  });

  tree.children = tree.children.filter((node) => {
    if (node.type !== "definition") return true;
    const id = node.identifier.toLowerCase();
    return !usedDefinitions.has(id) || stillReferenced.has(id);
  });
  return tree;
}
//#endregion
//#region src/ast.ts
/**
 * Turn a flat mdast root into a tree of `section` nodes.
 *
 * Every heading opens a section that owns the nodes following it until a
 * heading of the same or a shallower depth appears, or until a thematic break
 * is reached (the break itself is kept as the section's last child). Deeper
 * headings become nested sections. Top-level runs of nodes that belong to no
 * heading are wrapped in "orphaned" sections (`depth: 0`, `heading: undefined`).
 *
 * @param root - Root node whose `children` are read
 * @returns A new root whose children are all sections
 */
const createHierarchicalAST = (root) => {
  /** Wrap loose nodes into a heading-less section. */
  const toOrphanedSection = (children) => ({
    type: "section",
    depth: 0,
    heading: void 0,
    children
  });

  /** Nest a flat node list under its headings; non-heading nodes pass through. */
  const nest = (nodes) => {
    const nested = [];
    let cursor = 0;
    while (cursor < nodes.length) {
      const current = nodes[cursor];
      cursor += 1;
      if (current.type !== "heading") {
        nested.push(current);
        continue;
      }
      const body = [];
      while (cursor < nodes.length) {
        const candidate = nodes[cursor];
        // A heading at the same or a shallower depth belongs to an outer level.
        if (candidate.type === "heading" && candidate.depth <= current.depth) break;
        body.push(candidate);
        cursor += 1;
        // A thematic break closes the section and stays inside it.
        if (candidate.type === "thematicBreak") break;
      }
      nested.push({
        type: "section",
        depth: current.depth,
        heading: current,
        children: nest(body.filter((child) => !isSection(child)))
      });
    }
    return nested;
  };

  const grouped = [];
  let pendingOrphans = [];
  const flushOrphans = () => {
    if (pendingOrphans.length === 0) return;
    grouped.push(toOrphanedSection(pendingOrphans));
    pendingOrphans = [];
  };
  for (const entry of nest(root.children)) {
    if (isSection(entry)) {
      flushOrphans();
      grouped.push(entry);
    } else {
      pendingOrphans.push(entry);
    }
  }
  flushOrphans();

  return {
    type: "root",
    children: grouped
  };
};
/**
 * Tell whether a value is a `section` node.
 *
 * @param node - Any value, including `null` / `undefined`
 * @returns `true` only when `node.type` is exactly "section"
 */
const isSection = (node) => node != null && node.type === "section";
/**
 * Build a root node from a node or a list of nodes.
 * - An existing root is returned unchanged.
 * - An array becomes the root's children.
 * - Any other single node is wrapped as the only child.
 *
 * @param nodes - A node or an array of nodes
 * @returns A node of type "root"
 */
const createTree = (nodes) => {
  const isList = Array.isArray(nodes);
  if (!isList && nodes.type === "root") return nodes;
  return {
    type: "root",
    children: isList ? nodes : [nodes]
  };
};
/**
 * Build a complete section node from a partial description.
 * A missing `depth` falls back to 0 and missing `children` to an empty array;
 * `heading` is copied as given.
 *
 * @param section - Partial section (`depth`, `heading`, `children` are read)
 * @returns A new node of type "section"
 */
const createSection = (section) => {
  const { depth, heading, children } = section;
  return {
    type: "section",
    depth: depth ?? 0,
    heading,
    children: children ?? []
  };
};
/**
 * Collapse a hierarchical (section-based) tree back into a flat root.
 * Each section is replaced by its heading (when it has one) followed by its
 * flattened children; other nodes are kept in place.
 *
 * @param ast - Root whose children may be sections
 * @returns A new root with no section nodes among its direct children
 */
const flattenHierarchicalAST = (ast) => {
  const unwrap = (node) => {
    if (!isSection(node)) return [node];
    const inner = node.children.flatMap((child) => unwrap(child));
    return node.heading ? [node.heading, ...inner] : inner;
  };
  return {
    type: "root",
    children: ast.children.flatMap((child) => unwrap(child))
  };
};
//#endregion
//#region src/size.ts
/**
 * Measure the text content of markdown, ignoring formatting characters.
 * Strings are parsed to a tree first; the size is the length of the tree's
 * plain-text rendering.
 *
 * @param input - Markdown text or an AST node; falsy input counts as 0
 * @returns Length of the plain text content
 */
const getContentSize = (input) => {
  if (!input) return 0;
  if (typeof input !== "string") return toString(input).length;
  return getContentSize(fromMarkdown$1(input));
};
/**
 * Measure the text content of a section: its heading (if any) plus all of its
 * children, descending into nested sections.
 *
 * @param section - Section node with optional `heading` and a `children` array
 * @returns Sum of the content sizes
 */
const getSectionSize = (section) => {
  const headingSize = section.heading ? getContentSize(section.heading) : 0;
  return section.children.reduce(
    (sum, child) => sum + (isSection(child) ? getSectionSize(child) : getContentSize(child)),
    headingSize
  );
};
/**
 * Split chunks by maxRawSize limit as a hard constraint on raw markdown length.
 *
 * Chunks within the limit are yielded unchanged. Longer chunks are cut at the
 * last whitespace found in the final 20% of the allowed window; when there is
 * none, they are cut hard at `maxRawSize`. Pieces cut at whitespace are
 * trimmed and empty pieces are skipped.
 *
 * @param chunks - Iterable of markdown chunks (strings)
 * @param maxRawSize - Maximum raw character length per chunk
 * @returns Generator yielding chunks with no chunk exceeding maxRawSize
 * @throws {RangeError} When a chunk has to be split but `maxRawSize` is below 1;
 *   such a limit can never be satisfied and used to make the loop spin forever
 */
function* splitByMaxRawSize(chunks, maxRawSize) {
  const whitespace = /\s/;
  for (const chunk of chunks) {
    if (chunk.length <= maxRawSize) {
      yield chunk;
      continue;
    }
    // A limit below one character makes no progress per iteration.
    if (maxRawSize < 1) {
      throw new RangeError(`maxRawSize must be at least 1, received ${maxRawSize}`);
    }
    const searchFloor = Math.floor(maxRawSize * .8);
    let remaining = chunk;
    while (remaining.length > maxRawSize) {
      let splitPos = maxRawSize;
      let foundWhitespace = false;
      // Scan backwards so the piece stays as close to the limit as possible.
      for (let i = maxRawSize - 1; i >= searchFloor; i--) {
        if (whitespace.test(remaining[i])) {
          splitPos = i;
          foundWhitespace = true;
          break;
        }
      }
      const piece = remaining.substring(0, splitPos).trim();
      const rest = remaining.substring(splitPos);
      // Only drop surrounding whitespace when the cut happened at whitespace.
      remaining = foundWhitespace ? rest.trim() : rest;
      if (piece.length > 0) yield piece;
    }
    if (remaining.length > 0) yield remaining;
  }
}
//#endregion
//#region src/splitters/base.ts
/**
 * Base class shared by all node splitters.
 * Stores the sizing options and normalizes the per-node-type split rules into
 * the object form `{ rule, ... }`.
 */
var AbstractNodeSplitter = class {
  options;
  chunkSize;
  maxOverflowRatio;
  maxAllowedSize;
  maxRawSize;
  splitRules;
  /**
   * @param options - Splitter options; reads `chunkSize`, `maxOverflowRatio`
   *   (clamped to at least 1, default 1), `maxRawSize` and `rules`
   */
  constructor(options) {
    this.options = options;
    this.chunkSize = options.chunkSize;
    this.maxOverflowRatio = Math.max(1, options.maxOverflowRatio ?? 1);
    this.maxAllowedSize = this.chunkSize * this.maxOverflowRatio;
    this.maxRawSize = options.maxRawSize;
    this.splitRules = {};
    if (!options.rules) return;
    for (const nodeType in options.rules) {
      const split = options.rules[nodeType]?.split;
      if (!split) continue;
      // The two shorthand strings are expanded; anything else is stored as given.
      const isShorthand = split === "never-split" || split === "allow-split";
      this.splitRules[nodeType] = isShorthand ? { rule: split } : split;
    }
  }
  /**
   * Decide whether a node may be split according to its split rule.
   * `strong`, `emphasis` and `delete` fall back to the `formatting` rule when
   * they have none of their own. Nodes without any rule may be split.
   *
   * @param node - Node whose `type` selects the rule
   * @returns `false` for "never-split"; for "size-split" `true` only when the
   *   node's content size exceeds the rule's `size`; `true` otherwise
   */
  canSplitNode(node) {
    const isFormatting = node.type === "strong" || node.type === "emphasis" || node.type === "delete";
    const splitRule = this.splitRules[node.type] || (isFormatting ? this.splitRules.formatting : void 0);
    if (!splitRule) return true;
    switch (splitRule.rule) {
      case "never-split":
        return false;
      case "size-split":
        // The node is protected up to the configured size.
        return getContentSize(node) > splitRule.size;
      default:
        return true;
    }
  }
};
//#endregion
//#region src/splitters/blockquote.ts
/**
 * Blockquote splitter.
 * Groups the blocks of a blockquote into smaller blockquotes that each stay
 * within `maxAllowedSize`; blocks that are too large on their own are split
 * with a `TreeSplitter` and every piece is wrapped in a blockquote again.
 */
var BlockquoteSplitter = class extends AbstractNodeSplitter {
  splitRule;
  constructor(options) {
    super(options);
    this.splitRule = this.splitRules.blockquote;
  }
  /**
   * Split markdown text whose first block is a blockquote.
   *
   * @param text - Markdown source
   * @returns Trimmed markdown for each resulting blockquote
   * @throws {Error} When the text is empty or does not start with a blockquote
   */
  splitText(text) {
    const blockquote = fromMarkdown$1(text).children[0];
    // `?.` keeps empty input on the intended error path instead of a TypeError.
    if (blockquote?.type !== "blockquote") throw new Error("Text is not a blockquote");
    return this.splitNode(blockquote).map((chunk) => toMarkdown$1(chunk).trim());
  }
  /**
   * @param blockquote - Blockquote node to split
   * @returns Array of the nodes produced by `splitBlockquote`
   */
  splitNode(blockquote) {
    return [...this.splitBlockquote(blockquote)];
  }
  /**
   * Lazily yield sub-blockquotes. A blockquote that may not be split is
   * yielded unchanged.
   */
  *splitBlockquote(blockquote) {
    if (!this.canSplitNode(blockquote)) {
      yield blockquote;
      return;
    }
    const emptyCopy = () => ({
      ...blockquote,
      children: []
    });
    let pending = emptyCopy();
    let pendingSize = 0;
    for (const block of blockquote.children) {
      const blockSize = getContentSize({
        ...blockquote,
        children: [block]
      });
      // Adding this block would overflow: emit what has been collected so far.
      if (pendingSize + blockSize > this.maxAllowedSize) {
        if (pending.children.length > 0) yield pending;
        pending = emptyCopy();
        pendingSize = 0;
      }
      // The block alone is too large; `pending` is already empty at this point.
      if (blockSize > this.maxAllowedSize) {
        yield* this.splitBlock(blockquote, block);
        continue;
      }
      pending.children.push(block);
      pendingSize += blockSize;
    }
    if (pending.children.length > 0) yield pending;
  }
  /**
   * Split a single oversized block with a `TreeSplitter` and wrap each
   * non-empty chunk back into a copy of the blockquote.
   */
  *splitBlock(blockquote, block) {
    const blockTree = {
      type: "root",
      children: [block]
    };
    const blockChunks = new TreeSplitter(this.options).splitNode(blockTree);
    for (const chunk of blockChunks) {
      if (!("children" in chunk) || chunk.children.length === 0) continue;
      yield {
        ...blockquote,
        children: chunk.children
      };
    }
  }
};
//#endregion
//#region src/splitters/list.ts
/**
 * List splitter.
 * Groups list items into sub-lists that stay within `maxAllowedSize`. Ordered
 * sub-lists get a `start` that continues the original numbering. Items that
 * are too large on their own are split with a `TreeSplitter`.
 */
var ListSplitter = class extends AbstractNodeSplitter {
  splitRule;
  constructor(options) {
    super(options);
    this.splitRule = this.splitRules.list;
  }
  /**
   * Split markdown text whose first block is a list.
   *
   * @param text - Markdown source
   * @returns Trimmed markdown for each resulting chunk
   * @throws {Error} When the text is empty or does not start with a list
   */
  splitText(text) {
    const list = fromMarkdown$1(text).children[0];
    // `?.` keeps empty input on the intended error path instead of a TypeError.
    if (list?.type !== "list") throw new Error("Text is not a list");
    return this.splitNode(list).map((chunk) => toMarkdown$1(chunk).trim());
  }
  /**
   * @param node - List node to split
   * @returns Array of the nodes produced by `splitList`
   */
  splitNode(node) {
    return [...this.splitList(node)];
  }
  /**
   * Lazily yield sub-lists. A list that may not be split is yielded unchanged.
   */
  *splitList(list) {
    if (!this.canSplitNode(list)) {
      yield list;
      return;
    }
    const emptyCopy = () => ({
      ...list,
      children: []
    });
    // `??` rather than `||`: an ordered list may legitimately start at 0.
    const listOriginalStart = list.start ?? 1;
    // Number of original items already emitted; drives the `start` of ordered sub-lists.
    let listItemIndex = 0;
    let subList = emptyCopy();
    let subListSize = 0;
    for (const listItem of list.children) {
      const listItemSize = getContentSize(listItem);
      // Adding this item would overflow: emit what has been collected so far.
      if (subListSize + listItemSize > this.maxAllowedSize) {
        if (subList.children.length > 0) {
          if (list.ordered) subList.start = listOriginalStart + listItemIndex;
          yield subList;
          listItemIndex += subList.children.length;
        }
        subList = emptyCopy();
        subListSize = 0;
      }
      // The item alone is too large: split its content.
      if (listItemSize > this.maxAllowedSize) {
        const wrapper = emptyCopy();
        if (list.ordered) wrapper.start = listOriginalStart + listItemIndex;
        yield* this.splitListItem(wrapper, listItem);
        subList = emptyCopy();
        subListSize = 0;
        // All pieces belong to one original item, so the index advances by one.
        listItemIndex += 1;
        continue;
      }
      subList.children.push(listItem);
      subListSize += listItemSize;
    }
    if (subList.children.length > 0) {
      if (list.ordered) subList.start = listOriginalStart + listItemIndex;
      yield subList;
    }
  }
  /**
   * Split one oversized list item with a `TreeSplitter`.
   * The first non-empty chunk at index 0 is wrapped back into the list item
   * and the given list; later chunks are yielded as plain root trees.
   */
  *splitListItem(list, listItem) {
    const listItemTree = {
      type: "root",
      children: listItem.children
    };
    const listItemChunks = new TreeSplitter(this.options).splitNode(listItemTree);
    for (const [position, chunk] of listItemChunks.entries()) {
      if (!("children" in chunk) || chunk.children.length === 0) continue;
      if (position !== 0) {
        yield createTree(chunk);
        continue;
      }
      yield {
        ...list,
        children: [{
          ...listItem,
          children: chunk.children
        }]
      };
    }
  }
};
//#endregion
//#region src/splitters/table.ts
/**
 * Table splitter.
 * Groups data rows into sub-tables that repeat the header row and stay within
 * `maxAllowedSize` (the header itself is not counted). Rows that are too large
 * are broken into one-column mini-tables, and cells that are still too large
 * are split with a `TreeSplitter`.
 */
var TableSplitter = class extends AbstractNodeSplitter {
  splitRule;
  constructor(options) {
    super(options);
    this.splitRule = this.splitRules.table;
  }
  /**
   * Split markdown text whose first block is a table.
   *
   * @param text - Markdown source
   * @returns Trimmed markdown for each resulting table
   * @throws {Error} When the text is empty or does not start with a table
   */
  splitText(text) {
    const table = fromMarkdown$1(text).children[0];
    // `?.` keeps empty input on the intended error path instead of a TypeError.
    if (table?.type !== "table") throw new Error("Text is not a table");
    return this.splitNode(table).map((chunk) => toMarkdown$1(chunk).trim());
  }
  /**
   * @param table - Table node to split
   * @returns Array of the nodes produced by `splitTable`
   */
  splitNode(table) {
    return [...this.splitTable(table)];
  }
  /**
   * Build a copy of `table` holding only the given rows for one column.
   * The alignment list is narrowed to that column so the mini-table is not
   * rendered with the first column's alignment. Without a column index the
   * alignment is copied unchanged.
   */
  singleColumnTable(table, cellIndex, rows) {
    const narrowed = {
      ...table,
      children: rows
    };
    if (typeof cellIndex === "number" && Array.isArray(table.align)) {
      narrowed.align = [table.align[cellIndex] ?? null];
    }
    return narrowed;
  }
  /**
   * Lazily yield sub-tables. A table that may not be split is yielded
   * unchanged, and so is a table that consists of a header row only.
   */
  *splitTable(table) {
    if (!this.canSplitNode(table)) {
      yield table;
      return;
    }
    if (table.children.length === 0) return;
    const [headerRow, ...dataRows] = table.children;
    // Nothing to distribute: keep the header-only table instead of dropping it.
    if (dataRows.length === 0) {
      yield table;
      return;
    }
    const headerOnly = () => ({
      ...table,
      children: [headerRow]
    });
    let subTable = headerOnly();
    let subTableSize = 0;
    for (const row of dataRows) {
      const rowSize = getContentSize(row);
      // Adding this row would overflow: emit what has been collected so far.
      if (subTableSize + rowSize > this.maxAllowedSize) {
        if (subTable.children.length > 1) yield subTable;
        subTable = headerOnly();
        subTableSize = 0;
      }
      // The row alone is too large; `subTable` is header-only at this point.
      if (rowSize > this.maxAllowedSize) {
        yield* this.splitTableRow(table, headerRow, row);
        continue;
      }
      subTable.children.push(row);
      subTableSize += rowSize;
    }
    if (subTable.children.length > 1) yield subTable;
  }
  /**
   * Break one oversized row into one-column mini-tables, pairing every cell
   * with the header cell at the same index.
   */
  *splitTableRow(table, headerRow, row) {
    for (const [cellIndex, cell] of row.children.entries()) {
      const headerCell = headerRow.children[cellIndex];
      const miniDataRow = {
        type: "tableRow",
        children: [cell]
      };
      // Only the data cell is measured; the header cell is not counted.
      if (getContentSize(miniDataRow) > this.maxAllowedSize) {
        yield* this.splitTableCell(table, headerCell, cell, cellIndex);
        continue;
      }
      const miniHeaderRow = {
        type: "tableRow",
        children: [headerCell]
      };
      yield this.singleColumnTable(table, cellIndex, [miniHeaderRow, miniDataRow]);
    }
  }
  /**
   * Split the content of one oversized cell with a `TreeSplitter` and yield a
   * one-column mini-table (header cell + chunk) for every non-empty chunk.
   *
   * @param table - Table the cell belongs to
   * @param headerCell - Header cell of the cell's column
   * @param cell - The oversized cell
   * @param cellIndex - Optional column index used to pick the column alignment
   */
  *splitTableCell(table, headerCell, cell, cellIndex) {
    const cellTree = {
      type: "root",
      children: cell.children
    };
    const cellChunks = new TreeSplitter(this.options).splitNode(cellTree);
    for (const chunk of cellChunks) {
      if (!("children" in chunk) || chunk.children.length === 0) continue;
      const miniHeaderRow = {
        type: "tableRow",
        children: [headerCell]
      };
      const miniDataRow = {
        type: "tableRow",
        children: [{
          ...cell,
          children: chunk.children
        }]
      };
      yield this.singleColumnTable(table, cellIndex, [miniHeaderRow, miniDataRow]);
    }
  }
};
//#endregion
//#region src/splitters/text.ts
/**
 * Text splitter.
 * Serializes a node to markdown and splits that text at semantic boundaries
 * (sentence ends, punctuation, line breaks, whitespace, ...) while skipping
 * boundaries that fall inside inline constructs that must not be split.
 */
var TextSplitter = class extends AbstractNodeSplitter {
  patterns;
  constructor(options) {
    super(options);
    /**
     * Boundary patterns in order of preference; the array index becomes the
     * pattern's `priority` (0 is tried first).
     * NOTE(review): the last two alternatives of the `quote_pairs` pattern
     * duplicate the ASCII single-quote alternative - presumably they were
     * typographic quotes originally; confirm against the upstream source.
     */
    this.patterns = [
      [/\.(?=\n)/g, "period_before_newline"],
      [/(?<!^\s*(?:\d+|[a-zA-Z]+|[ivxlcdmIVXLCDM]+))\.\s+(?=[A-Z])/g, "period_before_uppercase"],
      [/[?!]+(?=\s|$)/g, "question_exclamation"],
      [/(?<!^\s*(?:\d+|[a-zA-Z]+|[ivxlcdmIVXLCDM]+))\.(?!\s*[a-z])(?!\s*\.)(?!\s*\d)/g, "period_safe"],
      [/[:;](?=\s)/g, "colon_semicolon"],
      [/\([^)]*\)|\[[^\]]*\]|\{[^}]*\}/g, "bracket_pairs"],
      [/"[^"]*"|'[^']*'|`[^`]*`|´[^´]*´|'[^']*'|'[^']*'/g, "quote_pairs"],
      [/\n/g, "line_break"],
      [/,(?=\s)/g, "comma"],
      [/\s[–—-]\s/g, "dashes"],
      [/\.{3,}/g, "ellipsis"],
      [/\./g, "period_fallback"],
      [/\s+/g, "whitespace"]
    ].map(([regex, type], priority) => ({
      regex,
      type,
      priority
    }));
  }
  /**
   * Split markdown text into chunks.
   *
   * @param text - Markdown source
   * @returns Trimmed, non-empty markdown chunks
   */
  splitText(text) {
    const ast = fromMarkdown$1(text);
    return this.splitNode(ast).map((chunk) => toMarkdown$1(chunk).trim()).filter((chunk) => chunk.length > 0);
  }
  /**
   * Split a node by working on its serialized markdown.
   * The markdown is re-parsed so that position offsets refer to the serialized
   * text. Every chunk is returned as a root holding one `html` node whose
   * value is the raw chunk text.
   *
   * @param node - Node to split
   * @returns Array of root nodes, one per chunk
   */
  splitNode(node) {
    const text = toMarkdown$1(node);
    const ast = fromMarkdown$1(text);
    const protectedRanges = this.extractProtectedRangesFromAST(ast);
    const boundaries = this.extractSemanticBoundaries(text, protectedRanges);
    const nodes = [];
    for (const textChunk of this.splitRecursive(text, boundaries, protectedRanges)) {
      nodes.push({
        type: "root",
        children: [{
          type: "html",
          value: textChunk
        }]
      });
    }
    return nodes;
  }
  /**
   * Extract protected ranges from markdown AST nodes.
   * Links, images, their reference forms, inline code, emphasis, strong,
   * delete and headings are protected when `canSplitNode` forbids splitting
   * them. Overlapping ranges are merged; adjacent ranges stay separate.
   *
   * @param ast - Parsed mdast AST with position information
   * @returns Protected ranges sorted by start offset
   */
  extractProtectedRangesFromAST(ast) {
    const protectableTypes = /* @__PURE__ */ new Set([
      "link",
      "linkReference",
      "image",
      "imageReference",
      "inlineCode",
      "emphasis",
      "strong",
      "delete",
      "heading"
    ]);
    const ranges = [];
    const traverse = (node) => {
      const start = node.position?.start?.offset;
      const end = node.position?.end?.offset;
      // Compare against null/undefined explicitly: offset 0 is a valid
      // position and must not be mistaken for missing position info.
      const hasPosition = start != null && end != null;
      if (hasPosition && protectableTypes.has(node.type) && !this.canSplitNode(node)) {
        ranges.push({
          start,
          end,
          type: node.type
        });
      }
      if ("children" in node && Array.isArray(node.children)) node.children.forEach(traverse);
    };
    traverse(ast);
    const sortedRanges = ranges.sort((a, b) => a.start - b.start);
    const mergedRanges = [];
    for (const range of sortedRanges) {
      const lastMerged = mergedRanges[mergedRanges.length - 1];
      if (lastMerged && range.start < lastMerged.end) {
        // Truly overlapping: widen the previous range and record both types.
        lastMerged.end = Math.max(lastMerged.end, range.end);
        lastMerged.type = `${lastMerged.type}+${range.type}`;
      } else {
        mergedRanges.push(range);
      }
    }
    return mergedRanges;
  }
  /**
   * Adjust protected ranges for a substring operation.
   * Ranges that intersect the substring are clipped to it and shifted so that
   * offsets are relative to the substring start.
   *
   * @param protectedRanges - Original protected ranges
   * @param substringStart - Start position of the substring in the original text
   * @param substringEnd - End position of the substring in the original text
   * @returns Adjusted protected ranges for the substring
   */
  adjustProtectedRangesForSubstring(protectedRanges, substringStart, substringEnd) {
    const adjustedRanges = [];
    for (const range of protectedRanges) {
      const intersects = range.end > substringStart && range.start < substringEnd;
      if (!intersects) continue;
      const adjustedRange = {
        start: Math.max(0, range.start - substringStart),
        end: Math.min(substringEnd - substringStart, range.end - substringStart),
        type: range.type
      };
      if (adjustedRange.start < adjustedRange.end) adjustedRanges.push(adjustedRange);
    }
    return adjustedRanges;
  }
  /**
   * Find all semantic boundaries with text-based pattern matching.
   * A boundary sits at the END of each pattern match; boundaries that fall
   * strictly inside a protected range are discarded.
   *
   * @param text - The text to analyze
   * @param protectedRanges - Ranges that should not be split
   * @returns Boundaries sorted by priority (ascending, i.e. most preferred
   *   first), then by position (ascending)
   */
  extractSemanticBoundaries(text, protectedRanges) {
    const boundaries = [];
    for (const pattern of this.patterns) {
      // The global regex objects are reused across calls, so rewind them.
      pattern.regex.lastIndex = 0;
      let match;
      while ((match = pattern.regex.exec(text)) !== null) {
        const position = match.index + match[0].length;
        if (this.isPositionProtected(position, protectedRanges)) continue;
        boundaries.push({
          position,
          type: pattern.type,
          priority: pattern.priority
        });
      }
    }
    return boundaries.sort((a, b) => a.priority !== b.priority ? a.priority - b.priority : a.position - b.position);
  }
  /**
   * Check if a position falls strictly inside any protected range.
   * Uses a linear scan for fewer than 10 ranges and a binary search over the
   * start-sorted ranges otherwise.
   *
   * @param position - Position to check
   * @param protectedRanges - Sorted array of protected ranges
   * @returns True if position is within any protected range
   */
  isPositionProtected(position, protectedRanges) {
    const isInside = (range) => position > range.start && position < range.end;
    if (protectedRanges.length < 10) return protectedRanges.some(isInside);
    let left = 0;
    let right = protectedRanges.length - 1;
    while (left <= right) {
      const mid = Math.floor((left + right) / 2);
      const range = protectedRanges[mid];
      if (isInside(range)) return true;
      if (position <= range.start) right = mid - 1;
      else left = mid + 1;
    }
    return false;
  }
  /**
   * Adjust boundary positions for a substring operation.
   * Keeps boundaries in `(substringStart, substringEnd]` and shifts them so
   * that positions are relative to the substring start.
   *
   * @param boundaries - Original boundaries
   * @param substringStart - Start position of substring in original text
   * @param substringEnd - End position of substring in original text
   * @returns Boundaries adjusted for the substring
   */
  adjustBoundariesForSubstring(boundaries, substringStart, substringEnd) {
    return boundaries.filter((b) => b.position > substringStart && b.position <= substringEnd).map((b) => ({
      ...b,
      position: b.position - substringStart
    }));
  }
  /**
   * Recursively split text using the boundary priority hierarchy.
   * The most preferred priority level that has a boundary strictly inside the
   * text is used; the text is cut at (one of) the middle boundaries of that
   * level, and each half that is still too large is split again using only
   * boundaries of the same or a less preferred level.
   *
   * @param text - The text to split
   * @param boundaries - Available boundaries, positions relative to `text`,
   *   sorted by priority ascending then position ascending
   * @param protectedRanges - Pre-computed protected ranges from AST
   * @param originalOffset - Offset of this text in the original document
   * @returns Generator yielding text chunks
   */
  *splitRecursive(text, boundaries, protectedRanges, originalOffset = 0) {
    // Either the text fits, or there is nowhere to cut it.
    if (getContentSize(text) <= this.maxAllowedSize || boundaries.length === 0) {
      yield text;
      return;
    }
    // Distinct priority levels in the order they occur in `boundaries`.
    const priorityLevels = new Set(boundaries.map((b) => b.priority));
    for (const priority of priorityLevels) {
      // Cut positions of this level, excluding the very start and end of the text.
      const validPositions = boundaries.filter((b) => b.priority === priority).map((b) => b.position).filter((pos) => pos > 0 && pos < text.length).sort((a, b) => a - b);
      if (validPositions.length === 0) continue;
      // Odd count: the single middle position. Even count: both middle positions.
      const mid = Math.floor(validPositions.length / 2);
      const candidateIndexes = validPositions.length % 2 === 1 ? [mid] : [mid - 1, mid];
      const [best] = candidateIndexes.map((index) => {
        const position = validPositions[index];
        const firstPart = text.substring(0, position);
        const secondPart = text.substring(position);
        const firstPartSize = getContentSize(firstPart);
        const secondPartSize = getContentSize(secondPart);
        return {
          position,
          firstPart,
          secondPart,
          firstPartSize,
          secondPartSize,
          bothWithinLimits: firstPartSize <= this.maxAllowedSize && secondPartSize <= this.maxAllowedSize,
          distance: Math.abs(firstPartSize - secondPartSize)
        };
      }).sort((a, b) => {
        // Prefer a cut after which both halves fit, then the more balanced cut.
        if (a.bothWithinLimits && !b.bothWithinLimits) return -1;
        if (!a.bothWithinLimits && b.bothWithinLimits) return 1;
        return a.distance - b.distance;
      });
      const { position, firstPart, secondPart, firstPartSize, secondPartSize } = best;
      // Deeper levels may only use this level or less preferred ones.
      const remainingBoundaries = boundaries.filter((b) => b.priority >= priority);
      if (firstPartSize <= this.maxAllowedSize) yield firstPart;
      else {
        const firstPartRanges = this.adjustProtectedRangesForSubstring(protectedRanges, originalOffset, originalOffset + position);
        const firstPartBoundaries = this.adjustBoundariesForSubstring(remainingBoundaries, 0, position);
        yield* this.splitRecursive(firstPart, firstPartBoundaries, firstPartRanges, originalOffset);
      }
      if (secondPartSize <= this.maxAllowedSize) yield secondPart;
      else {
        const secondPartRanges = this.adjustProtectedRangesForSubstring(protectedRanges, originalOffset + position, originalOffset + text.length);
        const secondPartBoundaries = this.adjustBoundariesForSubstring(remainingBoundaries, position, text.length);
        yield* this.splitRecursive(secondPart, secondPartBoundaries, secondPartRanges, originalOffset + position);
      }
      return;
    }
    // No level offers a usable cut position.
    yield text;
  }
};
//#endregion
//#region src/splitters/tree.ts
var TreeSplitter = class extends AbstractNodeSplitter {
	nodeSplitters;
	textSplitter;
	/**
	 * Forwards `options` to the base class and to every delegate splitter created here.
	 */
	constructor(options) {
		super(options);
		/**
		 * Dedicated splitters, looked up by `node.type` in splitSubNode
		 */
		const byNodeType = new Map();
		byNodeType.set("list", new ListSplitter(options));
		byNodeType.set("table", new TableSplitter(options));
		byNodeType.set("blockquote", new BlockquoteSplitter(options));
		this.nodeSplitters = byNodeType;
		/**
		 * Fallback splitter for node types without a dedicated entry
		 */
		this.textSplitter = new TextSplitter(options);
	}
	/**
	 * Parses markdown text, splits the resulting tree and serializes every chunk
	 * back to markdown. Chunks that are empty after trimming are dropped.
	 */
	splitText(text) {
		const markdownChunks = [];
		for (const chunkNode of this.splitNode(fromMarkdown$1(text))) {
			const markdown = toMarkdown$1(chunkNode).trim();
			if (markdown.length > 0) markdownChunks.push(markdown);
		}
		return markdownChunks;
	}
	/**
	 * Splits a node into an array of chunk nodes.
	 * The node is wrapped with createTree, converted to a hierarchical AST and
	 * split; section chunks are flattened back into a root node, every other
	 * chunk is returned unchanged.
	 */
	splitNode(node) {
		const hierarchy = createHierarchicalAST(createTree(node));
		return Array.from(this.splitTree(hierarchy), (piece) => {
			if (!isSection(piece)) return piece;
			return flattenHierarchicalAST({
				type: "root",
				children: [piece]
			});
		});
	}
	/**
	 * Top-level generator: walks the children of the hierarchical AST.
	 * A child within `this.maxAllowedSize` is yielded whole, anything larger is
	 * handed to splitHierarchicalSection.
	 */
	*splitTree(hierarchicalAST) {
		for (const topLevel of hierarchicalAST.children) {
			if (getSectionSize(topLevel) <= this.maxAllowedSize) yield topLevel;
			else yield* this.splitHierarchicalSection(topLevel);
		}
	}
	/**
	 * Breaks a section down along its hierarchy.
	 * The section's own content (heading plus non-section children) is first
	 * combined with as many leading nested sections as fit; whatever is left
	 * goes through splitSection / mergeSiblingSections.
	 */
	*splitHierarchicalSection(section) {
		/**
		 * Partition children into own content and nested sections, keeping order
		 */
		const ownContent = [];
		const subsections = [];
		for (const child of section.children) (isSection(child) ? subsections : ownContent).push(child);
		/**
		 * A parent section only exists when there is a heading or own content
		 */
		let parent = null;
		if (ownContent.length > 0 || section.heading) parent = createSection({
			depth: section.depth,
			heading: section.heading,
			children: ownContent
		});
		/**
		 * Leaf section: nothing nested, so only the parent needs splitting
		 */
		if (subsections.length === 0) {
			if (parent) yield* this.splitSection(parent);
			return;
		}
		if (parent) {
			const parentSize = getSectionSize(parent);
			if (parentSize <= this.maxAllowedSize) {
				/**
				 * Count how many leading nested sections still fit next to the parent
				 */
				let mergeCount = 0;
				let mergedSize = parentSize;
				while (mergeCount < subsections.length) {
					const grownSize = mergedSize + getSectionSize(subsections[mergeCount]);
					if (grownSize <= this.maxAllowedSize) {
						mergedSize = grownSize;
						mergeCount += 1;
					} else break;
				}
				if (mergeCount > 0) {
					const absorbed = subsections.slice(0, mergeCount);
					const leftover = subsections.slice(mergeCount);
					yield createSection({
						...parent,
						children: [...parent.children, ...absorbed]
					});
					/**
					 * Sections that did not fit are grouped among themselves
					 */
					if (leftover.length > 0) yield* this.mergeSiblingSections(leftover);
					return;
				}
			}
			/**
			 * Parent is too large, or not even the first nested section fits with it
			 */
			yield* this.splitSection(parent);
		}
		yield* this.mergeSiblingSections(subsections);
	}
	/**
	 * Splits the non-section content of a section into groups that stay within
	 * `this.maxAllowedSize`. The heading, when present, opens the first group.
	 * Items that are too large on their own are delegated to splitSubNode.
	 */
	*splitSection(section) {
		const { heading } = section;
		const bodyNodes = section.children.filter((child) => !isSection(child));
		/**
		 * No content: at most the heading is emitted
		 */
		if (bodyNodes.length === 0) {
			if (heading) yield* this.splitSubNode(heading);
			return;
		}
		let group = heading ? [heading] : [];
		let groupSize = heading ? getContentSize(heading) : 0;
		for (const bodyNode of bodyNodes) {
			const nodeSize = getContentSize(bodyNode);
			const grownSize = groupSize + nodeSize;
			if (grownSize <= this.maxAllowedSize) {
				group.push(bodyNode);
				groupSize = grownSize;
				continue;
			}
			/**
			 * Node does not fit: emit the open group first.
			 * A fresh array is assigned afterwards because the emitted array was
			 * passed to createTree and must not be modified.
			 */
			if (group.length > 0) {
				yield createTree(group);
				group = [];
				groupSize = 0;
			}
			if (nodeSize <= this.maxAllowedSize) {
				/**
				 * Fits alone, so it opens the next group
				 */
				group = [bodyNode];
				groupSize = nodeSize;
			} else yield* this.splitSubNode(bodyNode);
		}
		/**
		 * Emit whatever is still open
		 */
		if (group.length > 0) yield createTree(group);
	}
	/**
	 * Yields the node unchanged when it fits into `this.maxAllowedSize`.
	 * Otherwise delegates to the splitter registered for `node.type`, falling
	 * back to the text splitter.
	 */
	*splitSubNode(node) {
		if (getContentSize(node) <= this.maxAllowedSize) {
			yield node;
			return;
		}
		const delegate = this.nodeSplitters.get(node.type) || this.textSplitter;
		yield* delegate.splitNode(node);
	}
	/**
	 * Groups consecutive sibling sections into wrapper sections that stay within
	 * `this.maxAllowedSize`. A sibling that is too large by itself closes the
	 * open group and is split via splitHierarchicalSection.
	 * Reads `sections[0]` unconditionally, so `sections` must not be empty.
	 */
	*mergeSiblingSections(sections) {
		/**
		 * Wrapper depth is one level above the first sibling, but never below 0
		 */
		const wrapperDepth = Math.max(1, sections[0].depth) - 1;
		let group = [];
		let groupSize = 0;
		for (const sibling of sections) {
			const siblingSize = getSectionSize(sibling);
			const tooLargeAlone = siblingSize > this.maxAllowedSize;
			const overflowsGroup = groupSize + siblingSize > this.maxAllowedSize;
			/**
			 * Close the open group when the sibling cannot join it
			 */
			if (group.length > 0 && (tooLargeAlone || overflowsGroup)) {
				yield createSection({
					depth: wrapperDepth,
					children: group
				});
				group = [];
				groupSize = 0;
			}
			if (tooLargeAlone) {
				/**
				 * The group is empty at this point, so nothing needs resetting
				 */
				yield* this.splitHierarchicalSection(sibling);
				continue;
			}
			group.push(sibling);
			groupSize += siblingSize;
		}
		/**
		 * Emit the trailing group
		 */
		if (group.length > 0) yield createSection({
			depth: wrapperDepth,
			children: group
		});
	}
};
//#endregion
//#region src/chunkdown.ts
/**
 * Rules used when the caller does not supply any.
 * Links and images get the same treatment:
 * - never split
 * - normalized to inline style
 * Each node type receives its own rule object; nothing is shared between them.
 */
const defaultNodeRules = Object.fromEntries(["link", "image"].map((nodeType) => [nodeType, {
	split: "never-split",
	style: "inline"
}]));
var Chunkdown = class {
	options;
	splitter;
	/**
	 * Stores a copy of `options` with `maxOverflowRatio` defaulted to 1 and
	 * clamped to at least 1, then builds the tree splitter from that copy.
	 */
	constructor(options) {
		const normalized = { ...options };
		normalized.maxOverflowRatio = Math.max(1, options.maxOverflowRatio ?? 1);
		this.options = normalized;
		this.splitter = new TreeSplitter(normalized);
	}
	/**
	 * Configured `chunkSize` option.
	 */
	get chunkSize() {
		const { chunkSize } = this.options;
		return chunkSize;
	}
	/**
	 * Stored `maxOverflowRatio` option, or 1 when it is null/undefined.
	 */
	get maxOverflowRatio() {
		const ratio = this.options.maxOverflowRatio;
		return ratio ?? 1;
	}
	/**
	 * Configured `maxRawSize` option (may be undefined).
	 */
	get maxRawSize() {
		const { maxRawSize } = this.options;
		return maxRawSize;
	}
	/**
	 * Splits markdown text into markdown chunks.
	 * Chunks that are empty after trimming are dropped. When `maxRawSize` is
	 * set, the chunks are additionally passed through splitByMaxRawSize.
	 */
	splitText(text) {
		const markdownChunks = this.splitNode(fromMarkdown$1(text)).flatMap((node) => {
			const markdown = toMarkdown$1(node).trim();
			return markdown.length > 0 ? [markdown] : [];
		});
		const { maxRawSize } = this.options;
		if (maxRawSize === void 0) return markdownChunks;
		return Array.from(splitByMaxRawSize(markdownChunks, maxRawSize));
	}
	/**
	 * Preprocesses the tree with the stored options and splits it into chunk nodes.
	 */
	splitNode(root) {
		return this.splitter.splitNode(preprocessMarkdown(root, this.options));
	}
};
/**
 * Factory for Chunkdown instances.
 * When `options.rules` is null or undefined the default node rules are used;
 * otherwise the supplied rules are passed through as they are.
 */
const chunkdown = (options) => {
	const { rules: customRules } = options;
	const resolvedOptions = {
		...options,
		rules: customRules ?? defaultNodeRules
	};
	return new Chunkdown(resolvedOptions);
};
//#endregion
export { chunkdown };