UNPKG

chunkdown

Version:

A tree-based markdown text splitter that understands document structure to create semantically meaningful chunks for RAG applications

206 lines (205 loc) 6 kB
import { Blockquote, Delete, Emphasis, Image, Link, List, Nodes, Parent, Root, Strong, Table } from "mdast";

//#region src/splitters/interface.d.ts
/**
 * Common shape of a splitter: it accepts either raw text or a node of
 * type `T` and returns the resulting pieces.
 */
interface NodeSplitter<T extends Nodes = Nodes> {
  /** Takes text and returns an array of strings. */
  splitText(text: string): string[];
  /** Takes a node of type `T` and returns an array of nodes. */
  splitNode(node: T): Nodes[];
}
//#endregion

//#region src/types.d.ts
/** Inline formatting nodes: strong, emphasis and delete. */
type Formatting = Strong | Emphasis | Delete;

/**
 * Lookup table from node type name to the matching mdast node type.
 */
type NodeTypes = {
  link: Link;
  image: Image;
  strong: Strong;
  emphasis: Emphasis;
  delete: Delete;
  formatting: Formatting;
  list: List;
  table: Table;
  blockquote: Blockquote;
};

/**
 * How links are emitted.
 * - 'inline': reference-style links are converted to inline links
 * - 'preserve': the original style is kept
 */
type LinkStyle = 'inline' | 'preserve';

/**
 * How images are emitted.
 * - 'inline': reference-style images are converted to inline images
 * - 'preserve': the original style is kept
 */
type ImageStyle = 'inline' | 'preserve';

/** Options accepted when creating a splitter. */
type SplitterOptions = {
  /**
   * Preferred chunk size.
   * Size is measured on the actual text content, without markdown
   * formatting characters, so the raw text length of a chunk will usually
   * be greater than `chunkSize`.
   */
  chunkSize: number;
  /**
   * Maximum overflow ratio for preserving semantic units.
   * - 1.0 = strict size limits, no overflow allowed
   * - >1.0 = overflow allowed to preserve semantic coherence
   *
   * For example, 1.5 allows chunks up to 50% larger than `chunkSize` so
   * that semantic units (sections, lists, code blocks) stay together.
   * If undefined or less than 1.0, defaults to 1.0.
   */
  maxOverflowRatio?: number;
  /**
   * Optional maximum raw markdown length (characters), for embedding model
   * compatibility. When set, chunks are split further whenever their raw
   * markdown exceeds this limit. Useful for embedding models with character
   * limits (e.g., 7000 for OpenAI text-embedding-3-large).
   * If undefined, defaults to chunkSize * maxOverflowRatio * 4 for
   * reasonable safety.
   */
  maxRawSize?: number;
  /**
   * Optional per-node-type rules.
   */
  rules?: Partial<NodeRules>;
};

/**
 * Context handed to transform functions.
 */
type TransformContext = {
  /** Parent node containing the node being transformed. */
  parent?: Parent;
  /** Position of the node within the parent's children. */
  index?: number;
  /** Root node of the entire tree. */
  root: Root;
};

/**
 * Transform function for one node type.
 *
 * Return value:
 * - a modified node, which replaces the original
 * - `null`, which removes the node
 * - `undefined`, which keeps the node unchanged
 */
type NodeTransform<NODE extends Nodes> = (
  node: NODE,
  context: TransformContext,
) => NODE | null | undefined;

/**
 * Rules keyed by node type name; every entry is optional.
 */
type NodeRules = {
  [K in keyof NodeTypes]?: NodeRule<NodeTypes[K] extends Nodes ? NodeTypes[K] : never>;
};

/**
 * Rule for a single node type. Link and image rules additionally accept a
 * `style`; every other node type only has `split` and `transform`.
 */
type NodeRule<NODE extends Nodes> = NODE extends Link
  ? {
      /**
       * Split rule: a shorthand string ('never-split' or 'allow-split') or a
       * rule object. 'size-split' is only available in object form, together
       * with a `size`.
       * - 'never-split': never split the node
       * - 'allow-split': allow splitting the node
       * - 'size-split': split the node if its content size exceeds a limit
       */
      split?: SplitRule<NODE>;
      /**
       * Link normalization.
       * - 'inline': convert reference-style links to inline links
       * - 'preserve': keep the original style
       * - undefined: keep the original style
       */
      style?: LinkStyle;
      /** Transform function to modify or filter link nodes. */
      transform?: NodeTransform<NODE>;
    }
  : NODE extends Image
    ? {
        /**
         * Split rule: a shorthand string ('never-split' or 'allow-split') or
         * a rule object. 'size-split' is only available in object form,
         * together with a `size`.
         * - 'never-split': never split the node
         * - 'allow-split': allow splitting the node
         * - 'size-split': split the node if its content size exceeds a limit
         */
        split?: SplitRule<NODE>;
        /**
         * Image normalization.
         * - 'inline': convert reference-style images to inline images
         * - 'preserve': keep the original style
         * - undefined: keep the original style
         */
        style?: ImageStyle;
        /** Transform function to modify or filter image nodes. */
        transform?: NodeTransform<NODE>;
      }
    : {
        /**
         * Split rule: a shorthand string ('never-split' or 'allow-split') or
         * a rule object. 'size-split' is only available in object form,
         * together with a `size`.
         * - 'never-split': never split the node
         * - 'allow-split': allow splitting the node
         * - 'size-split': split the node if its content size exceeds a limit
         */
        split?: SplitRule<NODE>;
        /** Transform function to modify or filter nodes. */
        transform?: NodeTransform<NODE>;
      };

/**
 * A split rule: either the string shorthand or the object form.
 */
type SplitRule<NODE extends Nodes> = SimpleSplitRule | ComplexSplitRule<NODE>;

/**
 * String shorthand for a split rule.
 */
type SimpleSplitRule = 'never-split' | 'allow-split';

/**
 * Object form of a split rule.
 */
type ComplexSplitRule<NODE extends Nodes> =
  | NeverSplitRule<NODE>
  | AllowSplitRule<NODE>
  | SizeSplitRule<NODE>;

// The three rule types below branch on `Table | List`, but both branches of
// each conditional are currently identical.

/**
 * Never split a node.
 */
type NeverSplitRule<NODE extends Nodes> = NODE extends Table | List
  ? { rule: 'never-split' }
  : { rule: 'never-split' };

/**
 * Allow splitting a node.
 */
type AllowSplitRule<NODE extends Nodes> = NODE extends Table | List
  ? { rule: 'allow-split' }
  : { rule: 'allow-split' };

/**
 * Split a node once its content size exceeds `size`.
 */
type SizeSplitRule<NODE extends Nodes> = NODE extends Table | List
  ? { rule: 'size-split'; size: number }
  : { rule: 'size-split'; size: number };
//#endregion

//#region src/chunkdown.d.ts
/**
 * Splitter over a markdown `Root`, configured through `SplitterOptions`.
 * Exposes the chunk size, overflow ratio and raw size limit as read-only
 * accessors.
 */
declare class Chunkdown implements NodeSplitter<Root> {
  private options;
  private splitter;
  constructor(options: SplitterOptions);
  get chunkSize(): number;
  get maxOverflowRatio(): number;
  get maxRawSize(): number | undefined;
  splitText(text: string): string[];
  splitNode(root: Root): Nodes[];
}

/**
 * Create a new Chunkdown instance.
 * Applies default node rules if no custom rules are provided.
 */
declare const chunkdown: (options: SplitterOptions) => Chunkdown;
//#endregion

export {
  type NodeRule,
  type NodeRules,
  type NodeTransform,
  type SplitterOptions,
  type TransformContext,
  chunkdown,
};