vector-chunker
Version:
A flexible text and data chunking library for vector databases and LLMs
148 lines (124 loc) • 4.37 kB
text/typescript
import { ChunkOptions, VectorChunk } from './types';
import { splitText } from './utils/text-splitter';
import { v4 as uuidv4 } from 'uuid';
// Fallback upper bound for a chunk's measured size (string length, or length of
// the JSON serialization for non-strings) when `options.chunkSize` is omitted.
const DEFAULT_CHUNK_SIZE = 4000;
/**
 * Splits `data` into chunks that carry positional metadata.
 *
 * Dispatch rules:
 * - string input with `format: 'text'` is handed to `splitText(data, options)`;
 * - otherwise, with `preserveContext` (the default), the work is delegated to
 *   `chunkStructuredDataWithContext`;
 * - otherwise the data is grouped by `chunkStructuredData` and each group is
 *   wrapped with its own id, its position in the result, the total number of
 *   groups, and its measured size.
 *
 * @param data - A single value or an array of values to chunk.
 * @param options - Optional settings; unspecified fields fall back to
 *   `chunkSize = DEFAULT_CHUNK_SIZE`, `allowOversized = true`,
 *   `format = 'json'`, `preserveContext = true`.
 * @returns The produced chunks, in order.
 * @throws Error with a `Chunking failed: ` prefix wrapping any failure raised
 *   while chunking (for example an oversized element when `allowOversized` is
 *   false).
 */
export function chunk<T>(data: T, options: ChunkOptions = {}): VectorChunk<T[]>[] {
  const {
    chunkSize = DEFAULT_CHUNK_SIZE,
    allowOversized = true,
    format = 'json',
    preserveContext = true
  } = options;
  try {
    if (typeof data === 'string' && format === 'text') {
      return splitText(data, options) as unknown as VectorChunk<T[]>[];
    }
    if (preserveContext) {
      return chunkStructuredDataWithContext(data, options);
    }
    // Compute the groups first so that every chunk can report its real
    // position and the real chunk count instead of a hard-coded 0 / 1.
    const groups = chunkStructuredData(data, chunkSize, allowOversized);
    return groups.map((content, index) => ({
      content,
      metadata: {
        id: uuidv4(),
        index,
        totalChunks: groups.length,
        originalSize: calculateElementSize(content)
      }
    }));
  } catch (error) {
    throw new Error(`Chunking failed: ${error instanceof Error ? error.message : String(error)}`);
  }
}
/**
 * Cuts `text` into consecutive slices of at most `chunkSize` UTF-16 code units.
 *
 * Slicing is purely positional: it does not look for word or sentence
 * boundaries.
 *
 * @param text - The string to split. An empty string yields an empty array.
 * @param chunkSize - Maximum slice length; must be a number of at least 1.
 * @returns The slices in their original order; concatenating them restores `text`.
 * @throws RangeError if `chunkSize` is NaN or smaller than 1 (checked before
 *   `text` is inspected).
 */
function chunkText(text: string, chunkSize: number): string[] {
  // A zero or negative step never advances the loop (it would not terminate
  // and would grow `chunks` without bound), and NaN silently produces a single
  // empty chunk. Reject all of these up front; `!(x >= 1)` also catches NaN.
  if (!(chunkSize >= 1)) {
    throw new RangeError(`chunkSize must be at least 1 (received ${chunkSize})`);
  }
  const chunks: string[] = [];
  for (let i = 0; i < text.length; i += chunkSize) {
    chunks.push(text.slice(i, i + chunkSize));
  }
  return chunks;
}
/**
 * Packs elements into groups whose combined measured size stays within
 * `chunkSize`, preserving element order.
 *
 * A non-array `data` is treated as a single element. An element whose own size
 * exceeds `chunkSize` is either rejected or placed alone in its own group,
 * depending on `allowOversized`.
 *
 * @param data - A single value or an array of values.
 * @param chunkSize - Size budget per group, compared against the sizes
 *   reported by `calculateElementSize`.
 * @param allowOversized - When false, an element larger than `chunkSize`
 *   causes an error instead of getting a group of its own.
 * @returns The groups, in input order.
 * @throws Error if an element exceeds `chunkSize` and `allowOversized` is false.
 */
function chunkStructuredData<T>(data: T, chunkSize: number, allowOversized: boolean): T[][] {
  const items = Array.isArray(data) ? [...data] : [data];
  const groups: T[][] = [];
  let pending: T[] = [];
  let pendingSize = 0;

  // Moves the group under construction (if any) into the result and starts a new one.
  const flush = (): void => {
    if (pending.length === 0) return;
    groups.push(pending);
    pending = [];
    pendingSize = 0;
  };

  for (const item of items) {
    const size = calculateElementSize(item);
    const oversized = size > chunkSize;
    if (oversized && !allowOversized) {
      throw new Error(`Element exceeds chunk size limit (${chunkSize} characters)`);
    }
    if (oversized) {
      // Close whatever was being built, then give the large element its own group.
      flush();
      groups.push([item]);
    } else {
      // Start a new group when this element would push the current one over budget.
      if (pendingSize + size > chunkSize) flush();
      pending.push(item);
      pendingSize += size;
    }
  }

  flush();
  return groups;
}
/**
 * Measures an element for chunk-budget purposes.
 *
 * Strings are measured by their `length`; every other value is measured by the
 * length of its `JSON.stringify` output.
 *
 * @param element - The value to measure.
 * @returns The measured size, or 0 when the value has no JSON representation
 *   (`JSON.stringify` yields `undefined` for `undefined`, functions and symbols).
 */
function calculateElementSize(element: unknown): number {
  if (typeof element === 'string') return element.length;
  // The library typings claim `string`, but the runtime result can be
  // `undefined`; reading `.length` on it would throw a TypeError.
  const serialized: string | undefined = JSON.stringify(element);
  return serialized === undefined ? 0 : serialized.length;
}
/**
 * Groups elements under a size budget and wraps each group in a chunk whose
 * metadata ties it to its siblings.
 *
 * All chunks from one call share a single freshly generated `parentId`. After
 * grouping, every chunk receives the final `totalChunks` count plus the ids of
 * the chunk before and after it (`undefined` at either end).
 *
 * @param data - A single value or an array of values.
 * @param options - Reads `chunkSize` (default `DEFAULT_CHUNK_SIZE`) and
 *   `allowOversized` (default `true`).
 * @returns The linked chunks, in input order.
 * @throws Error if an element exceeds `chunkSize` and `allowOversized` is false.
 */
function chunkStructuredDataWithContext<T>(data: T, options: ChunkOptions): VectorChunk<T[]>[] {
  const { chunkSize = DEFAULT_CHUNK_SIZE, allowOversized = true } = options;
  const items = Array.isArray(data) ? [...data] : [data];
  const linked: VectorChunk<T[]>[] = [];
  const parentId = uuidv4();
  let buffer: T[] = [];
  let bufferSize = 0;

  // Appends a chunk whose index is its position in the result so far.
  const emit = (content: T[]): void => {
    linked.push(createStructuredChunk(content, linked.length, parentId));
  };

  for (const item of items) {
    const size = calculateElementSize(item);

    if (size > chunkSize) {
      if (!allowOversized) {
        throw new Error(`Element exceeds chunk size limit (${chunkSize} characters)`);
      }
      // Close the group in progress, then give the large element a chunk of its own.
      if (buffer.length > 0) {
        emit(buffer);
      }
      emit([item]);
      buffer = [];
      bufferSize = 0;
      continue;
    }

    const wouldOverflow = bufferSize + size > chunkSize;
    if (wouldOverflow && buffer.length > 0) {
      emit(buffer);
      buffer = [];
      bufferSize = 0;
    }
    buffer.push(item);
    bufferSize += size;
  }

  if (buffer.length > 0) {
    emit(buffer);
  }

  // Second pass: the total is only known now, and each chunk needs its neighbours' ids.
  const total = linked.length;
  let previous: VectorChunk<T[]> | undefined;
  for (const current of linked) {
    current.metadata.totalChunks = total;
    current.metadata.previousChunk = previous ? previous.metadata.id : undefined;
    // Stays undefined for the last chunk; otherwise overwritten on the next iteration.
    current.metadata.nextChunk = undefined;
    if (previous) {
      previous.metadata.nextChunk = current.metadata.id;
    }
    previous = current;
  }
  return linked;
}
/**
 * Wraps a group of elements in a chunk record with freshly generated metadata.
 *
 * @param content - The elements that make up this chunk.
 * @param index - Position of the chunk within its sequence.
 * @param parentId - Identifier shared by all chunks of the same sequence.
 * @returns The chunk; `totalChunks` is a placeholder of 0 that the caller is
 *   expected to overwrite once the sequence is complete.
 */
function createStructuredChunk<T>(content: T[], index: number, parentId: string): VectorChunk<T[]> {
  const id = uuidv4();
  const originalSize = calculateElementSize(content);
  return {
    content,
    metadata: { id, index, totalChunks: 0, parentId, originalSize }
  };
}
// `chunk` is also exposed as the module's default export.
export default chunk;