vector-chunker
Version:
A flexible text and data chunking library for vector databases and LLMs
125 lines • 4.64 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.chunk = chunk;
const text_splitter_1 = require("./utils/text-splitter");
const uuid_1 = require("uuid");
// Fallback chunk size applied when the caller's options do not provide `chunkSize`.
// Sizes in this module are string lengths (see calculateElementSize), so this is a
// count of characters, not bytes or tokens.
const DEFAULT_CHUNK_SIZE = 4000;
/**
 * Splits text or structured data into chunks.
 *
 * - A string input with `format: 'text'` is delegated to `splitText`.
 * - Otherwise, with `preserveContext` enabled (the default), the data is
 *   chunked by `chunkStructuredDataWithContext`, which produces linked chunks.
 * - Otherwise the data is grouped by `chunkStructuredData` and each group is
 *   wrapped with its own metadata.
 *
 * @param {*} data - A string, a single value, or an array of values to chunk.
 * @param {object} [options] - Chunking options.
 * @param {number} [options.chunkSize=DEFAULT_CHUNK_SIZE] - Maximum chunk size in characters.
 * @param {boolean} [options.allowOversized=true] - Whether an element larger than
 *   `chunkSize` is emitted as its own chunk instead of raising an error.
 * @param {string} [options.format='json'] - `'text'` routes string input to the text splitter.
 * @param {boolean} [options.preserveContext=true] - Whether to produce linked chunks.
 * @returns {*} The result of `splitText` for text input; otherwise an array of
 *   `{ content, metadata }` chunk objects.
 * @throws {Error} "Chunking failed: ..." wrapping any failure raised while chunking;
 *   the original error is attached as `cause`.
 */
function chunk(data, options = {}) {
    const { chunkSize = DEFAULT_CHUNK_SIZE, allowOversized = true, format = 'json', preserveContext = true } = options;
    try {
        if (typeof data === 'string' && format === 'text') {
            return (0, text_splitter_1.splitText)(data, options);
        }
        if (preserveContext) {
            return chunkStructuredDataWithContext(data, options);
        }
        // Each chunk records its real position and the real total; these were
        // previously hard-coded to 0 and 1 regardless of how many chunks existed.
        return chunkStructuredData(data, chunkSize, allowOversized).map((content, index, groups) => ({
            content,
            metadata: {
                id: (0, uuid_1.v4)(),
                index,
                totalChunks: groups.length,
                originalSize: calculateElementSize(content)
            }
        }));
    }
    catch (error) {
        // Keep the original error reachable via `cause` so its stack is not lost.
        throw new Error(`Chunking failed: ${error instanceof Error ? error.message : String(error)}`, { cause: error });
    }
}
/**
 * Cuts a string into consecutive slices of at most `chunkSize` UTF-16 code units.
 *
 * @param {string} text - The text to split.
 * @param {number} chunkSize - Maximum length of each slice; must be greater than zero.
 * @returns {string[]} The slices in order; empty when `text` is empty.
 * @throws {RangeError} If `chunkSize` is not a number greater than zero.
 */
function chunkText(text, chunkSize) {
    // A zero, negative or NaN step would never advance the loop (or would end
    // it with a bogus empty slice), so reject it up front.
    if (!(chunkSize > 0)) {
        throw new RangeError(`chunkSize must be a positive number, received ${chunkSize}`);
    }
    const chunks = [];
    for (let i = 0; i < text.length; i += chunkSize) {
        chunks.push(text.slice(i, i + chunkSize));
    }
    return chunks;
}
/**
 * Groups elements into arrays whose combined size stays within `chunkSize`.
 *
 * A non-array input is treated as a single element. Element order is kept.
 * An element larger than `chunkSize` is emitted as a group of its own (after
 * any group already being built) when `allowOversized` is truthy.
 *
 * @param {*} data - A single value or an array of values.
 * @param {number} chunkSize - Maximum combined size of a group, in characters.
 * @param {boolean} allowOversized - Whether oversized elements are tolerated.
 * @returns {Array<Array<*>>} The groups of elements.
 * @throws {Error} If an element exceeds `chunkSize` and `allowOversized` is falsy.
 */
function chunkStructuredData(data, chunkSize, allowOversized) {
    const items = Array.isArray(data) ? [...data] : [data];
    const groups = [];
    let pending = [];
    let pendingSize = 0;
    // Moves the group under construction into the result, if it holds anything.
    const flushPending = () => {
        if (pending.length === 0) {
            return;
        }
        groups.push(pending);
        pending = [];
        pendingSize = 0;
    };
    items.forEach((item) => {
        const itemSize = calculateElementSize(item);
        if (itemSize > chunkSize) {
            if (!allowOversized) {
                throw new Error(`Element exceeds chunk size limit (${chunkSize} characters)`);
            }
            flushPending();
            groups.push([item]);
            return;
        }
        // Start a new group when this item would overflow the current one.
        if (pendingSize + itemSize > chunkSize) {
            flushPending();
        }
        pending.push(item);
        pendingSize += itemSize;
    });
    flushPending();
    return groups;
}
/**
 * Measures an element's size in characters.
 *
 * Strings are measured by their own length; every other value is measured by
 * the length of its JSON serialization.
 *
 * @param {*} element - The value to measure.
 * @returns {number} The size in characters. Values that JSON cannot represent
 *   on their own (`undefined`, functions, symbols) count as 0.
 */
function calculateElementSize(element) {
    if (typeof element === 'string') {
        return element.length;
    }
    // JSON.stringify returns undefined (not a string) for undefined, functions
    // and symbols; reading `.length` from that would throw a TypeError.
    const serialized = JSON.stringify(element);
    return serialized === undefined ? 0 : serialized.length;
}
/**
 * Groups elements into size-bounded chunks that carry linking metadata.
 *
 * Grouping follows the same rules as `chunkStructuredData`. Every chunk is
 * built by `createStructuredChunk` with a shared `parentId`; once all chunks
 * exist, each one is given the final `totalChunks` count and the ids of its
 * neighbours (`previousChunk` / `nextChunk`, `undefined` at the ends).
 *
 * @param {*} data - A single value or an array of values.
 * @param {object} options - Chunking options.
 * @param {number} [options.chunkSize=DEFAULT_CHUNK_SIZE] - Maximum chunk size in characters.
 * @param {boolean} [options.allowOversized=true] - Whether oversized elements are tolerated.
 * @returns {Array<object>} The linked `{ content, metadata }` chunks, in order.
 * @throws {Error} If an element exceeds `chunkSize` and `allowOversized` is falsy.
 */
function chunkStructuredDataWithContext(data, options) {
    const { chunkSize = DEFAULT_CHUNK_SIZE, allowOversized = true } = options;
    const items = Array.isArray(data) ? [...data] : [data];
    const pieces = [];
    const parentId = (0, uuid_1.v4)();
    let pending = [];
    let pendingSize = 0;
    // Turns the group under construction into a chunk, if it holds anything.
    const flushPending = () => {
        if (pending.length === 0) {
            return;
        }
        pieces.push(createStructuredChunk(pending, pieces.length, parentId));
        pending = [];
        pendingSize = 0;
    };
    for (const item of items) {
        const itemSize = calculateElementSize(item);
        const isOversized = itemSize > chunkSize;
        if (isOversized && !allowOversized) {
            throw new Error(`Element exceeds chunk size limit (${chunkSize} characters)`);
        }
        if (isOversized) {
            // An oversized element always travels alone, after whatever was pending.
            flushPending();
            pieces.push(createStructuredChunk([item], pieces.length, parentId));
            continue;
        }
        if (pendingSize + itemSize > chunkSize) {
            flushPending();
        }
        pending.push(item);
        pendingSize += itemSize;
    }
    flushPending();
    // Second pass: the total and the neighbour ids are only known now.
    const total = pieces.length;
    for (let position = 0; position < total; position += 1) {
        const meta = pieces[position].metadata;
        meta.totalChunks = total;
        meta.previousChunk = position === 0 ? undefined : pieces[position - 1].metadata.id;
        meta.nextChunk = position === total - 1 ? undefined : pieces[position + 1].metadata.id;
    }
    return pieces;
}
/**
 * Builds a chunk object from a group of elements.
 *
 * The chunk gets a freshly generated id and the size of `content` as measured
 * by `calculateElementSize`. `totalChunks` starts at 0 as a placeholder; the
 * caller is expected to overwrite it once every chunk has been created.
 *
 * @param {*} content - The elements carried by this chunk.
 * @param {number} index - Position of the chunk within its sequence.
 * @param {string} parentId - Identifier shared by all chunks of one input.
 * @returns {{content: *, metadata: object}} The new chunk.
 */
function createStructuredChunk(content, index, parentId) {
    const id = (0, uuid_1.v4)();
    const originalSize = calculateElementSize(content);
    const metadata = { id, index, totalChunks: 0, parentId, originalSize };
    return { content, metadata };
}
exports.default = chunk;
//# sourceMappingURL=index.js.map