chunkdown
Version:
A tree-based Markdown text splitter that understands document structure, creating semantically meaningful chunks for RAG applications.
206 lines (205 loc) • 6 kB
TypeScript
import { Blockquote, Delete, Emphasis, Image, Link, List, Nodes, Parent, Root, Strong, Table } from "mdast";
//#region src/splitters/interface.d.ts
/**
 * Contract for a splitter that can operate on raw text or on an mdast node.
 *
 * @typeParam T - The mdast node type accepted by `splitNode`; defaults to any node.
 */
interface NodeSplitter<T extends Nodes = Nodes> {
  /** Splits a text string into an array of strings. */
  splitText(text: string): string[];
  /** Splits a node of type `T` into an array of mdast nodes. */
  splitNode(node: T): Nodes[];
}
//#endregion
//#region src/types.d.ts
/**
 * Union of the inline formatting node types: strong, emphasis and delete.
 */
type Formatting =
  | Strong
  | Emphasis
  | Delete;
/**
 * Lookup table from a node type name to the mdast node type it stands for.
 * The `formatting` key groups strong, emphasis and delete under one name.
 */
type NodeTypes = {
  link: Link;
  image: Image;
  strong: Strong;
  emphasis: Emphasis;
  delete: Delete;
  formatting: Formatting;
  list: List;
  table: Table;
  blockquote: Blockquote;
};
/**
 * How links are normalized.
 * - 'inline': reference-style links are converted to inline links
 * - 'preserve': the original link style is kept
 */
type LinkStyle =
  | 'inline'
  | 'preserve';
/**
 * How images are normalized.
 * - 'inline': reference-style images are converted to inline images
 * - 'preserve': the original image style is kept
 */
type ImageStyle =
  | 'inline'
  | 'preserve';
/**
 * Options accepted when creating a splitter.
 */
type SplitterOptions = {
  /**
   * Preferred chunk size.
   * Size is measured on the actual text content, without markdown formatting characters,
   * so the raw text length of a chunk will usually be greater than `chunkSize`.
   */
  chunkSize: number;
  /**
   * Maximum overflow ratio for preserving semantic units.
   * - 1.0 = strict size limits, no overflow allowed
   * - >1.0 = overflow is allowed to preserve semantic coherence
   *
   * For example, 1.5 allows chunks up to 50% larger than `chunkSize`
   * so that semantic units (sections, lists, code blocks) stay together.
   * If undefined or less than 1.0, defaults to 1.0.
   */
  maxOverflowRatio?: number;
  /**
   * Optional maximum raw markdown length (in characters), for embedding model compatibility.
   * When set, chunks are split further if their raw markdown exceeds this limit.
   * Useful for embedding models with character limits (e.g., 7000 for OpenAI text-embedding-3-large).
   * If undefined, defaults to chunkSize * maxOverflowRatio * 4 for reasonable safety.
   */
  maxRawSize?: number;
  /**
   * Optional per-node-type rules.
   * Any subset of the supported node types may be configured.
   */
  rules?: Partial<NodeRules>;
};
/**
 * Information handed to a transform function alongside the node itself.
 */
type TransformContext = {
  /** Parent node that contains the node being transformed, if any. */
  parent?: Parent;
  /** Position of the node within its parent's children, if any. */
  index?: number;
  /** Root node of the entire tree. */
  root: Root;
};
/**
 * Transform callback for one node type.
 *
 * The return value decides what happens to the node:
 * - a node: replaces the original
 * - `null`: removes the node
 * - `undefined`: keeps the node unchanged
 *
 * @typeParam NODE - The mdast node type this transform handles.
 */
type NodeTransform<NODE extends Nodes> = (
  node: NODE,
  context: TransformContext,
) => NODE | null | undefined;
/**
 * Rules keyed by node type name.
 * Every key of `NodeTypes` may optionally carry the `NodeRule` for its node type.
 */
type NodeRules = {
  [K in keyof NodeTypes]?: NodeRule<
    NodeTypes[K] extends Nodes ? NodeTypes[K] : never
  >;
};
/**
* Rule for a single node type.
* The available fields depend on NODE: link and image rules add a `style`
* option, every other node type only has `split` and `transform`.
*/
type NodeRule<NODE extends Nodes> = NODE extends Link ? {
/**
* Split rule
* - 'never-split': Never split the node
* - 'allow-split': Allow splitting the node
* - { rule: 'size-split', size }: Split the node if its content size exceeds a certain limit
* Only 'never-split' and 'allow-split' have a string shorthand;
* 'size-split' must be given in object form because it carries a numeric `size`.
*/
split?: SplitRule<NODE>;
/**
* Normalize links
* - 'inline': Convert reference-style links to inline links
* - 'preserve': Keep original style
* - undefined: Keep original style
*/
style?: LinkStyle;
/**
* Transform function to modify or filter link nodes
*/
transform?: NodeTransform<NODE>;
} : NODE extends Image ? {
/**
* Split rule
* - 'never-split': Never split the node
* - 'allow-split': Allow splitting the node
* - { rule: 'size-split', size }: Split the node if its content size exceeds a certain limit
* Only 'never-split' and 'allow-split' have a string shorthand;
* 'size-split' must be given in object form because it carries a numeric `size`.
*/
split?: SplitRule<NODE>;
/**
* Normalize images
* - 'inline': Convert reference-style images to inline images
* - 'preserve': Keep original style
* - undefined: Keep original style
*/
style?: ImageStyle;
/**
* Transform function to modify or filter image nodes
*/
transform?: NodeTransform<NODE>;
} : {
/**
* Split rule
* - 'never-split': Never split the node
* - 'allow-split': Allow splitting the node
* - { rule: 'size-split', size }: Split the node if its content size exceeds a certain limit
* Only 'never-split' and 'allow-split' have a string shorthand;
* 'size-split' must be given in object form because it carries a numeric `size`.
*/
split?: SplitRule<NODE>;
/**
* Transform function to modify or filter nodes
*/
transform?: NodeTransform<NODE>;
};
/**
 * A splitting rule, written either as a plain string shorthand
 * or as a rule object.
 */
type SplitRule<NODE extends Nodes> =
  | SimpleSplitRule
  | ComplexSplitRule<NODE>;
/**
 * String shorthand for a splitting rule.
 */
type SimpleSplitRule =
  | 'never-split'
  | 'allow-split';
/**
 * Object form of a splitting rule: one of the never-split, allow-split
 * or size-split rule objects.
 */
type ComplexSplitRule<NODE extends Nodes> =
  | NeverSplitRule<NODE>
  | AllowSplitRule<NODE>
  | SizeSplitRule<NODE>;
/**
* Object form of the 'never-split' rule: never split a node.
* NOTE(review): both branches of the conditional declare the same shape, so the
* Table | List distinction adds nothing in this declaration — presumably a
* remnant of list/table-specific options; confirm before simplifying.
*/
type NeverSplitRule<NODE extends Nodes> = NODE extends Table | List ? {
rule: 'never-split';
} : {
rule: 'never-split';
};
/**
* Object form of the 'allow-split' rule: allow splitting a node.
* NOTE(review): both branches of the conditional declare the same shape, so the
* Table | List distinction adds nothing in this declaration — presumably a
* remnant of list/table-specific options; confirm before simplifying.
*/
type AllowSplitRule<NODE extends Nodes> = NODE extends Table | List ? {
rule: 'allow-split';
} : {
rule: 'allow-split';
};
/**
* Object form of the 'size-split' rule: split a node if its content size
* exceeds a certain limit. The rule object carries a required numeric `size`.
* NOTE(review): both branches of the conditional declare the same shape, so the
* Table | List distinction adds nothing in this declaration — presumably a
* remnant of list/table-specific options; confirm before simplifying.
*/
type SizeSplitRule<NODE extends Nodes> = NODE extends Table | List ? {
rule: 'size-split';
size: number;
} : {
rule: 'size-split';
size: number;
};
//#endregion
//#region src/chunkdown.d.ts
/**
 * Splitter class that implements `NodeSplitter` for mdast `Root` nodes.
 */
declare class Chunkdown implements NodeSplitter<Root> {
  private options;
  private splitter;
  /**
   * @param options - Splitter configuration.
   */
  constructor(options: SplitterOptions);
  /** Chunk size of this instance (presumably the resolved `chunkSize` option). */
  get chunkSize(): number;
  /** Overflow ratio of this instance (presumably the resolved `maxOverflowRatio` option). */
  get maxOverflowRatio(): number;
  /** Raw size limit of this instance; may be `undefined`. */
  get maxRawSize(): number | undefined;
  /** Splits a text string into an array of strings. */
  splitText(text: string): Array<string>;
  /** Splits a root node into an array of mdast nodes. */
  splitNode(root: Root): Array<Nodes>;
}
/**
 * Factory for `Chunkdown` instances.
 * Default node rules are applied when no custom rules are provided.
 *
 * @param options - Splitter configuration.
 * @returns A new `Chunkdown` instance.
 */
declare const chunkdown: (
  options: SplitterOptions,
) => Chunkdown;
//#endregion
// Public API: five type-only exports plus the `chunkdown` factory value.
// The `Chunkdown` class is not exported by name in this statement; it is only
// reachable as the return type of `chunkdown`.
export { type NodeRule, type NodeRules, type NodeTransform, type SplitterOptions, type TransformContext, chunkdown };