UNPKG

vector-chunker

Version:

A flexible text and data chunking library for vector databases and LLMs

125 lines 4.64 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.chunk = chunk;
const text_splitter_1 = require("./utils/text-splitter");
const uuid_1 = require("uuid");

/** Chunk size used when `options.chunkSize` is not supplied. */
const DEFAULT_CHUNK_SIZE = 4000;

/**
 * Split `data` into chunks no larger than a configurable size.
 *
 * - A string with `format: 'text'` is handed to `splitText` from
 *   `./utils/text-splitter` together with the untouched `options`; the
 *   shape of that result is defined by that module.
 * - Anything else is treated as structured data: an array is chunked
 *   element by element, any other value is treated as a single element.
 *
 * In structured mode each returned item is `{ content, metadata }`, where
 * `content` is an array of elements and `metadata` always carries `id`,
 * `index`, `totalChunks` and `originalSize`. With `preserveContext` the
 * metadata additionally carries `parentId`, `previousChunk` and `nextChunk`.
 *
 * @param {*} data - Text or structured data to split.
 * @param {object} [options]
 * @param {number} [options.chunkSize=4000] - Size limit per chunk (string
 *   length of the text, or of the JSON serialization for non-strings).
 * @param {boolean} [options.allowOversized=true] - When false, an element
 *   larger than `chunkSize` raises an error instead of getting its own chunk.
 * @param {string} [options.format='json'] - `'text'` selects the text splitter
 *   for string input.
 * @param {boolean} [options.preserveContext=true] - Link chunks to each other
 *   and to a shared parent id.
 * @returns {Array} The chunks.
 * @throws {Error} `Chunking failed: <reason>` wrapping any failure; the
 *   original error is attached as `cause` where the runtime supports it.
 */
function chunk(data, options = {}) {
    const {
        chunkSize = DEFAULT_CHUNK_SIZE,
        allowOversized = true,
        format = 'json',
        preserveContext = true,
    } = options;
    try {
        if (typeof data === 'string' && format === 'text') {
            return (0, text_splitter_1.splitText)(data, options);
        }
        if (preserveContext) {
            return chunkStructuredDataWithContext(data, options);
        }
        // Report each chunk's real position and the real chunk count, so the
        // metadata agrees with what the context-preserving path produces.
        return chunkStructuredData(data, chunkSize, allowOversized).map((content, index, groups) => ({
            content,
            metadata: {
                id: (0, uuid_1.v4)(),
                index,
                totalChunks: groups.length,
                originalSize: calculateElementSize(content),
            },
        }));
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        // Keep the original error reachable for debugging; runtimes without
        // support for the options argument simply ignore it.
        throw new Error(`Chunking failed: ${reason}`, { cause: error });
    }
}

/**
 * Cut `text` into consecutive slices of at most `chunkSize` UTF-16 code units.
 *
 * @param {string} text - Text to slice.
 * @param {number} chunkSize - Slice length; must be greater than zero.
 * @returns {string[]} The slices, in order. Empty input yields `[]`.
 * @throws {RangeError} If `chunkSize` is not a positive number (a zero or
 *   negative step would otherwise never reach the end of the text).
 */
function chunkText(text, chunkSize) {
    if (!(chunkSize > 0)) {
        throw new RangeError(`chunkSize must be a positive number (received ${chunkSize})`);
    }
    const chunks = [];
    for (let i = 0; i < text.length; i += chunkSize) {
        chunks.push(text.slice(i, i + chunkSize));
    }
    return chunks;
}

/**
 * Group elements into arrays whose summed element sizes stay within
 * `chunkSize`, preserving element order.
 *
 * An element that is larger than `chunkSize` on its own is emitted as a
 * single-element group (after closing the group in progress), or rejected
 * when `allowOversized` is false.
 *
 * @param {*} data - An array of elements, or a single value treated as one element.
 * @param {number} chunkSize - Size budget per group.
 * @param {boolean} allowOversized - Whether oversized elements are tolerated.
 * @returns {Array<Array>} Groups of elements.
 * @throws {Error} If an element exceeds `chunkSize` and `allowOversized` is false.
 */
function chunkStructuredData(data, chunkSize, allowOversized) {
    const elements = Array.isArray(data) ? data : [data];
    const groups = [];
    let current = [];
    let currentSize = 0;
    // Close the group in progress (if it holds anything) and start a new one.
    const flush = () => {
        if (current.length > 0) {
            groups.push(current);
            current = [];
        }
        currentSize = 0;
    };
    for (const element of elements) {
        const elementSize = calculateElementSize(element);
        if (elementSize > chunkSize) {
            if (!allowOversized) {
                throw new Error(`Element exceeds chunk size limit (${chunkSize} characters)`);
            }
            flush();
            groups.push([element]);
            continue;
        }
        if (currentSize + elementSize > chunkSize) {
            flush();
        }
        current.push(element);
        currentSize += elementSize;
    }
    flush();
    return groups;
}

/**
 * Measure an element: the length of a string, otherwise the length of its
 * JSON serialization.
 *
 * `JSON.stringify` returns `undefined` (not a string) for `undefined`,
 * functions and symbols; those values are counted as size 0 instead of
 * failing on `.length`.
 *
 * @param {*} element - Value to measure.
 * @returns {number} Size in UTF-16 code units.
 */
function calculateElementSize(element) {
    if (typeof element === 'string') {
        return element.length;
    }
    const serialized = JSON.stringify(element);
    return serialized === undefined ? 0 : serialized.length;
}

/**
 * Chunk structured data and attach linking metadata to every chunk.
 *
 * Grouping follows the same rules as `chunkStructuredData`. All chunks share
 * one freshly generated `parentId`; each chunk also records `totalChunks` and
 * the ids of its neighbours (`previousChunk` / `nextChunk`, `undefined` at
 * the ends).
 *
 * @param {*} data - An array of elements, or a single value treated as one element.
 * @param {object} options
 * @param {number} [options.chunkSize=4000] - Size budget per chunk.
 * @param {boolean} [options.allowOversized=true] - Whether oversized elements are tolerated.
 * @returns {Array<{content: Array, metadata: object}>} Linked chunks.
 * @throws {Error} If an element exceeds `chunkSize` and `allowOversized` is false.
 */
function chunkStructuredDataWithContext(data, options) {
    const { chunkSize = DEFAULT_CHUNK_SIZE, allowOversized = true } = options;
    const parentId = (0, uuid_1.v4)();
    const chunks = chunkStructuredData(data, chunkSize, allowOversized)
        .map((content, index) => createStructuredChunk(content, index, parentId));
    // Update totalChunks and link chunks
    chunks.forEach((item, idx) => {
        item.metadata.totalChunks = chunks.length;
        item.metadata.previousChunk = idx > 0 ? chunks[idx - 1].metadata.id : undefined;
        item.metadata.nextChunk = idx < chunks.length - 1 ? chunks[idx + 1].metadata.id : undefined;
    });
    return chunks;
}

/**
 * Wrap a group of elements in a chunk record with a fresh id.
 *
 * @param {Array} content - Elements belonging to the chunk.
 * @param {number} index - Position of the chunk in the output.
 * @param {string} parentId - Id shared by all chunks of one chunking run.
 * @returns {{content: Array, metadata: object}} The chunk record.
 */
function createStructuredChunk(content, index, parentId) {
    return {
        content,
        metadata: {
            id: (0, uuid_1.v4)(),
            index,
            totalChunks: 0, // Will be updated after all chunks are created
            parentId,
            originalSize: calculateElementSize(content),
        },
    };
}

exports.default = chunk;
//# sourceMappingURL=index.js.map