llm-splitter
Version:
Efficient, configurable text chunking utility for LLM vectorization. Returns rich chunk metadata.
56 lines (55 loc) • 2.67 kB
TypeScript
/**
 * Strategies for grouping tokens into chunks (see the `chunkStrategy` option of `split`).
 */
export declare enum ChunkStrategy {
/** No grouping preference: fit as many whole tokens as possible into a chunk. */
character = "character",
/** Group tokens by paragraphs; a paragraph that exceeds the chunk size is split across multiple chunks. */
paragraph = "paragraph"
}
/**
 * A part or chunk of the input together with its position in the input.
 *
 * Returned by both `splitToParts` and `split`.
 */
export interface Chunk {
/**
 * The text covered by this entry: a single string, an array of strings, or `null`.
 * Per the `splitToParts` documentation, `null` marks a part that is ignored.
 */
text: string | string[] | null;
/** Start index, measured as if the whole input array were joined into one long string. */
start: number;
/**
 * End index, measured in the same way as `start`.
 * NOTE(review): whether this index is inclusive or exclusive is not visible in this
 * declaration file; confirm against the implementation.
 */
end: number;
/**
 * NOTE(review): optional flag whose meaning is not documented in this declaration file;
 * presumably marks a grouping (e.g. paragraph) boundary; confirm against the implementation.
 */
isBoundary?: boolean;
}
/**
 * Options accepted by `split`. Every field is optional; the default values are not
 * visible in this declaration file.
 */
export interface SplitOptions {
/** The max number of tokens (as produced by `splitter`) in each chunk. */
chunkSize?: number;
/** The number of tokens (as produced by `splitter`) to repeat from the previous chunk. */
chunkOverlap?: number;
/** The function used to split a text into tokens. */
splitter?: (input: string) => string[];
/** The strategy used to group tokens into chunks; one of the `ChunkStrategy` member names. */
chunkStrategy?: keyof typeof ChunkStrategy;
}
/**
 * Split the inputs into parts, each carrying its text (or `null` if the part is ignored)
 * and its start and end indices.
 *
 * Although this function takes an array of strings, the `start` and `end` indices are
 * expressed from the perspective of the entire input array joined into one long string.
 *
 * @param {string[]} inputs - The inputs to split.
 * @param {(input: string) => string[]} splitter - The function used to split each text.
 * @param {number} [baseOffset] - Optional base offset to add to the start and end positions.
 * @returns {Chunk[]} The parts, in the `Chunk` shape.
 */
export declare function splitToParts(inputs: string[], splitter: (input: string) => string[], baseOffset?: number): Chunk[];
/**
 * Split text into chunks.
 *
 * ## Chunk Structure
 * Note that, when splitting into tokens, if an array is passed as `input`, each array item
 * boundary is *always* a token boundary.
 *
 * In the returned structure, `start` is the start of the first token in the chunk and `end` is
 * the end of the last token. There may be unmatched / discarded parts between tokens
 * (e.g. if you split on whitespace, there may be spaces between tokens). The `text` field of
 * the returned chunk includes all the text (or array of texts) from the start to the end,
 * inclusive of the unmatched parts.
 *
 * ## Chunk Strategy
 * The `chunkStrategy` option specifies how the chunks are grouped.
 * - `character`: There is no grouping preference here. Fit as many whole tokens as possible into a chunk.
 * - `paragraph`: Group tokens by paragraphs. If a paragraph exceeds the chunk size, it will be split across multiple chunks.
 *
 * @param {string|string[]} input - The input (string or array of strings) to split.
 * @param {SplitOptions} [options] - Optional settings; every field is optional.
 * @param {number} [options.chunkSize] - The max number of tokens (from splitter) of each chunk.
 * @param {number} [options.chunkOverlap] - The overlapping number of tokens (from splitter) to include from previous chunk.
 * @param {(input: string) => string[]} [options.splitter] - The function to split the text.
 * @param {string} [options.chunkStrategy] - The strategy used to group tokens into chunks (`character` or `paragraph`).
 * @returns {Chunk[]} The chunks; `text` is typed as `string | string[] | null`.
 */
export declare function split(input: string | string[], { chunkSize, chunkOverlap, splitter, chunkStrategy }?: SplitOptions): Chunk[];