/*
 * llm-splitter
 * Version:
 * Efficient, configurable text chunking utility for LLM vectorization. Returns rich chunk metadata.
 * 234 lines • 10 kB
 * JavaScript
 */
import { getChunk } from './get-chunk.js';
/**
 * Names of the supported chunk grouping strategies. Every key maps to its own name.
 */
export const ChunkStrategy = {
  character: 'character',
  paragraph: 'paragraph'
};

// One boundary function per chunk strategy. A boundary function receives the array of input
// strings and returns an array of groups, where each group is itself an array of strings.
//
// `split()` treats a group as text that should stay together: when the chunk being built
// already ends on a group boundary and the next group does not fit into it as a whole, that
// chunk is emitted first. A group that is larger than a chunk simply fills chunks one after
// another.
const BOUNDARIES = {
  // No grouping preference: all inputs form a single group.
  [ChunkStrategy.character]: (inputs) => [inputs],
  // Every piece of an input between blank-line separators ("\n\n") is a group of its own.
  [ChunkStrategy.paragraph]: (inputs) =>
    inputs.flatMap((input) => input.split(/\n\n/).map((paragraph) => [paragraph]))
};

// The strategy names accepted by validation, derived from the table above.
const CHUNK_STRATEGIES = new Set(Object.keys(BOUNDARIES));
/**
 * Ensure that a value names one of the known chunk strategies.
 *
 * @param {unknown} chunkStrategy The value to check against `CHUNK_STRATEGIES`.
 * @throws {Error} If the value is not a member of `CHUNK_STRATEGIES`. The message lists
 *   the accepted names.
 */
function assertChunkStrategy(chunkStrategy) {
  if (CHUNK_STRATEGIES.has(chunkStrategy)) {
    return;
  }
  const allowed = [...CHUNK_STRATEGIES].join(', ');
  throw new Error(`Invalid chunk strategy. Must be one of: ${allowed}`);
}
/**
 * Run `splitter` over every input and locate each returned part inside its input.
 *
 * Parts are matched in order: every part is searched for at or after the end of the previous
 * match within the same input. Text of the input that lies between two matched parts (i.e.
 * text the splitter did not return) is skipped and has no entry in the result. Empty-string
 * parts are ignored.
 *
 * While this function takes an array of strings, the `start` and `end` positions are from the
 * perspective of all inputs joined into a single long string, shifted by `baseOffset`.
 *
 * @param {string[]} inputs - The inputs to split.
 * @param {Function} splitter - Called with one input string; must return an iterable of strings.
 * @param {number} baseOffset - The base offset to add to the start and end positions.
 * @returns {Array<{text: string, start: number, end: number}>} One entry per non-empty part,
 *   with `end` exclusive (`end - start === text.length`).
 * @throws {Error} If the splitter yields a non-string part, or yields a part that cannot be
 *   found in the input at or after the end of the previous match.
 */
export function splitToParts(inputs, splitter, baseOffset = 0) {
  const parts = [];
  // Combined length of the inputs that have been fully processed so far.
  let offset = 0;
  for (const input of inputs) {
    // Position in `input` just past the most recently matched part.
    let cursor = 0;
    for (const part of splitter(input)) {
      // Validation
      if (typeof part !== 'string') {
        throw new Error(`Splitter returned a non-string part: ${part} for input: ${input}`);
      }
      // Ignore empty string.
      if (part.length === 0) {
        continue;
      }
      // Jump to the next occurrence of the part. Whatever lies between the cursor and the
      // match was not returned by the splitter and is discarded.
      const partStart = input.indexOf(part, cursor);
      if (partStart === -1) {
        // The part does not occur in the rest of the input (wrong order, or text that is not
        // in the input at all). The message wording is kept as-is for backward compatibility.
        throw new Error(`Splitter did not return any parts for input (${input.length}): "${input.slice(0, 20)}"... with part (${part.length}): "${part.slice(0, 20)}"...`);
      }
      parts.push({
        text: part,
        start: partStart + offset + baseOffset,
        end: partStart + part.length + offset + baseOffset
      });
      cursor = partStart + part.length;
    }
    // The whole input counts towards the offset, including any unmatched text.
    offset += input.length;
  }
  return parts;
}
// Little helpers

/**
 * Validate the options passed to `split()`. Checks run in a fixed order and the first failing
 * check throws.
 *
 * @param {Object} options
 * @param {number} options.chunkSize - Must be an integer of at least 1.
 * @param {number} options.chunkOverlap - Must be an integer of at least 0 and below `chunkSize`.
 * @param {Function} options.splitter - Must be a function.
 * @param {string} options.chunkStrategy - Checked with `assertChunkStrategy`.
 * @throws {Error} If any option is invalid.
 */
const splitValidate = ({ chunkSize, chunkOverlap, splitter, chunkStrategy }) => {
  assertChunkStrategy(chunkStrategy);

  // `Number.isInteger` is false for every non-number, so no separate type check is needed.
  if (!Number.isInteger(chunkSize)) {
    throw new Error('Chunk size must be a positive integer');
  }
  if (chunkSize < 1) {
    throw new Error('Chunk size must be at least 1');
  }

  if (!Number.isInteger(chunkOverlap)) {
    throw new Error(`Chunk overlap must be a non-negative integer. Found: ${chunkOverlap}`);
  }
  if (chunkOverlap < 0) {
    throw new Error('Chunk overlap must be at least 0');
  }
  if (chunkOverlap >= chunkSize) {
    throw new Error('Chunk overlap must be less than chunk size');
  }

  if (typeof splitter !== 'function') {
    throw new Error('Splitter must be a function');
  }
};
/**
 * Collects the parts of the chunk that is currently being built and converts them into a
 * chunk on request.
 */
class ChunkParts {
  // The original input, handed to `getChunk` when a chunk is emitted.
  input;
  // Number of trailing parts carried over into the next chunk after an emit.
  chunkOverlap;
  // Parts of the chunk under construction.
  parts = [];
  // Final part of the most recently emitted chunk, or null before the first emit.
  lastEmittedPart = null;
  // Most recently pushed part flagged `isBoundary`; reset to null on every emit.
  lastBoundaryPart = null;

  /**
   * @param {string|string[]} input - The input the parts refer to.
   * @param {number} chunkOverlap - How many trailing parts seed the next chunk.
   */
  constructor(input, chunkOverlap) {
    this.input = input;
    this.chunkOverlap = chunkOverlap;
  }

  /** Number of parts currently held. */
  get length() {
    return this.parts.length;
  }

  /**
   * Append a part and remember it if it is flagged as a boundary.
   * @param {{text: string, start: number, end: number, isBoundary?: boolean}} part
   */
  push(part) {
    this.parts.push(part);
    if (part.isBoundary) {
      this.lastBoundaryPart = part;
    }
  }

  /**
   * Whether the held parts contain anything that has not been part of an emitted chunk yet.
   * @returns {boolean}
   */
  hasUnEmittedParts() {
    const count = this.parts.length;
    if (count === 0) {
      return false;
    }
    // Nothing emitted so far: every held part is new.
    if (this.lastEmittedPart === null) {
      return true;
    }
    // After an emit the held parts may be overlap only; they count as new only when the
    // newest one ends beyond the end of the last emitted part.
    return this.parts[count - 1].end > this.lastEmittedPart.end;
  }

  /**
   * Build a chunk spanning from the first to the last held part, then reset the state for the
   * next chunk (keeping the overlap parts, if any).
   * @returns {{text: *, start: number, end: number}} `text` is whatever `getChunk` returns
   *   for the input and the span.
   * @throws {Error} If no parts are held.
   */
  emit() {
    const held = this.parts;
    if (held.length === 0) {
      throw new Error('Chunk parts is empty');
    }

    const start = held[0].start;
    const tail = held[held.length - 1];
    this.lastEmittedPart = tail;
    const end = tail.end;
    const chunk = { text: getChunk(this.input, start, end), start, end };

    // Seed the next chunk with the trailing overlap parts, or start from scratch.
    this.parts = this.chunkOverlap > 0 ? held.slice(-this.chunkOverlap) : [];
    // A fresh chunk is considered to have no previous boundary.
    this.lastBoundaryPart = null;
    return chunk;
  }
}
/**
 * Split text into chunks.
 *
 * ## Chunk Structure
 * Note that when an array is passed as input, an array item boundary is *always* a token
 * boundary, because every item is passed to the splitter separately.
 *
 * In the returned structure, `start` is the start of the first token in the chunk and `end` is
 * the end of the last token, both measured against all inputs joined into one string. Between
 * tokens there may be unmatched / discarded text (e.g. if you split on whitespace, there may
 * be spaces between tokens). The `text` field is produced by `getChunk(input, start, end)`.
 *
 * ## Chunk Strategy
 * The `chunkStrategy` option allows you to specify how the chunks are grouped.
 * - `character`: There is no grouping preference here. Fit as many whole tokens as possible into a chunk.
 * - `paragraph`: Group tokens by paragraphs. If a paragraph exceeds the chunk size, it will be split across multiple chunks.
 *
 * @param {string|string[]} input - The input (string or array of strings) to split.
 * @param {Object} options
 * @param {number} options.chunkSize - The max number of tokens (from splitter) of each chunk.
 * @param {number} options.chunkOverlap - The overlapping number of tokens (from splitter) to include from previous chunk.
 * @param {Function} options.splitter - The function to split the text.
 * @param {string} options.chunkStrategy - The strategy used to group tokens into chunks.
 * @returns {Array<{text: string | null, start: number, end: number}>}
 * @throws {Error} If an option is invalid, if a group cannot be located in the input, or if
 *   the splitter returns parts that cannot be matched against the input.
 */
export function split(input, { chunkSize = 512, chunkOverlap = 0, splitter = (text) => [...text], chunkStrategy = ChunkStrategy.character } = {}) {
  // Validation
  splitValidate({ chunkSize, chunkOverlap, splitter, chunkStrategy });

  // Chunk handling.
  const chunks = [];
  const chunkParts = new ChunkParts(input, chunkOverlap);

  // Inputs.
  const inputAsArray = Array.isArray(input) ? input : [input];
  const inputAsString = inputAsArray.join('');
  const groups = BOUNDARIES[chunkStrategy](inputAsArray);

  // Iteration.
  // Position in the joined input where the search for the next group starts. It always points
  // at the end of the previously located group, so a group whose text also occurs inside an
  // earlier group is never matched there, and an empty group does not push the search past a
  // following group that starts at the same position.
  let searchFrom = 0;
  for (const group of groups) {
    // Empty pre-processed group.
    if (group.length === 0) {
      continue;
    }

    // Find the start of the first part in the group.
    const firstPart = group[0];
    const baseOffset = inputAsString.indexOf(firstPart, searchFrom);
    if (baseOffset === -1) {
      throw new Error(`Could not find start of group: ${String(firstPart).slice(0, 20)}...`);
    }

    // The texts of a group are contiguous in the joined input, so the group ends at its
    // start plus the combined length of its texts.
    let groupLength = 0;
    for (const text of group) {
      groupLength += text.length;
    }
    searchFrom = baseOffset + groupLength;

    // Split with parts plus our offset.
    const parts = splitToParts(group, splitter, baseOffset);

    // Empty post-processed group.
    if (parts.length === 0) {
      continue;
    }

    // Mark the **last** part as the boundary.
    parts[parts.length - 1].isBoundary = true;

    // If the current chunk has a portion with a boundary, and we can't fit this entire group
    // in the current chunk, emit the existing chunk and then continue adding to a fresh chunk.
    if (chunkParts.hasUnEmittedParts() &&
      chunkParts.lastBoundaryPart !== null &&
      chunkParts.length + parts.length > chunkSize) {
      chunks.push(chunkParts.emit());
    }

    // Add the parts one by one, emitting a chunk every time it becomes full.
    for (const part of parts) {
      chunkParts.push(part);

      // Sanity check.
      if (chunkParts.length > chunkSize) {
        throw new Error(`Chunk size is ${chunkSize}, but chunkParts.length is ${chunkParts.length} -- ${JSON.stringify(chunkParts)}`);
      }
      if (chunkParts.length === chunkSize) {
        chunks.push(chunkParts.emit());
      }
    }
  }

  // Handle last chunk.
  if (chunkParts.hasUnEmittedParts()) {
    chunks.push(chunkParts.emit());
  }
  return chunks;
}
//# sourceMappingURL=split.js.map