// vector-chunker
// Version:
// A flexible text and data chunking library for vector databases and LLMs
// 70 lines • 2.71 kB
// JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.splitText = splitText;
const uuid_1 = require("uuid");
/**
 * Split `text` into an ordered list of linked chunks.
 *
 * Two strategies are used:
 *  - Fixed-width slicing, when `splitOn` is 'character' OR `preserveContext`
 *    is falsy. The text is cut every `chunkSize` characters; `overlap` and
 *    `splitOn` are not consulted in this mode.
 *  - Segment packing, otherwise. The text is cut into sentences, paragraphs
 *    or words, and consecutive segments are joined with a single space until
 *    adding the next one would exceed `chunkSize`. Each follow-up chunk is
 *    seeded with the last `overlap` characters of the preceding segment, so a
 *    seeded chunk (or a single oversized segment) may be longer than
 *    `chunkSize`.
 *
 * Every chunk shares one `parentId`; after chunking, each chunk's metadata
 * receives the real `totalChunks` plus `previousChunk` / `nextChunk` ids
 * (`undefined` at the ends).
 *
 * @param {string} text - Text to split.
 * @param {object} [options] - Chunking options; may be omitted.
 * @param {number} [options.chunkSize=4000] - Target maximum chunk length in characters; must be > 0.
 * @param {number} [options.overlap=200] - Characters of the previous segment carried into the next chunk (segment packing only); values <= 0 disable the carry-over.
 * @param {string} [options.splitOn='character'] - 'character', 'sentence', 'paragraph' or 'word'; any other value treats the whole text as one segment.
 * @param {boolean} [options.preserveContext=true] - When falsy, fixed-width slicing is used regardless of `splitOn`.
 * @returns {Array<{content: string, metadata: object}>} Chunks in document order (empty for empty text).
 * @throws {RangeError} If `chunkSize` is not a positive number.
 */
function splitText(text, options) {
    const { chunkSize = 4000, overlap = 200, splitOn = 'character', preserveContext = true } = options || {};
    // A non-positive (or NaN) step would never advance the slicing loop below.
    if (typeof chunkSize !== 'number' || !(chunkSize > 0)) {
        throw new RangeError(`chunkSize must be a positive number, received: ${String(chunkSize)}`);
    }
    const chunks = [];
    const parentId = (0, uuid_1.v4)();
    // The total passed to createChunk is a placeholder; the real count is only
    // known once chunking is done and is assigned in the linking pass below.
    const addChunk = (content) => {
        chunks.push(createChunk(content, chunks.length, 0, parentId));
    };
    if (!preserveContext || splitOn === 'character') {
        // Fixed-width slicing; String.prototype.slice clamps the end index.
        for (let start = 0; start < text.length; start += chunkSize) {
            addChunk(text.slice(start, start + chunkSize));
        }
    }
    else {
        const segments = segmentText(text, splitOn);
        // slice(-0) is slice(0) and would copy the WHOLE previous segment, so a
        // non-positive overlap must bypass slice() entirely.
        const tailLength = overlap > 0 ? overlap : 0;
        let currentChunk = '';
        segments.forEach((segment, i) => {
            const candidate = currentChunk ? `${currentChunk} ${segment}` : segment;
            if (currentChunk && candidate.length > chunkSize) {
                addChunk(currentChunk);
                // currentChunk is non-empty here, so a previous segment exists.
                const tail = tailLength > 0 ? segments[i - 1].slice(-tailLength) : '';
                // Use the same single-space separator as regular joins so the
                // carried-over tail is not fused to the next segment.
                currentChunk = tail ? `${tail} ${segment}` : segment;
            }
            else {
                currentChunk = candidate;
            }
        });
        if (currentChunk) {
            addChunk(currentChunk);
        }
    }
    // Finalize the count and link neighbours together.
    chunks.forEach((chunk, idx) => {
        chunk.metadata.totalChunks = chunks.length;
        chunk.metadata.previousChunk = idx > 0 ? chunks[idx - 1].metadata.id : undefined;
        chunk.metadata.nextChunk = idx < chunks.length - 1 ? chunks[idx + 1].metadata.id : undefined;
    });
    return chunks;
}
/**
 * Cut `text` into the units selected by `splitOn`, dropping empty units.
 * Helper for splitText; unknown `splitOn` values yield the text as one unit.
 */
function segmentText(text, splitOn) {
    switch (splitOn) {
        case 'sentence':
            // First alternative: a run ending in terminal punctuation. Second
            // alternative: trailing text with no terminator, which would
            // otherwise be silently dropped.
            return (text.match(/[^.!?]*[.!?]+|[^.!?]+$/g) || [text]).filter((s) => s.trim() !== '');
        case 'paragraph':
            return text.split(/\n\s*\n/).filter(Boolean);
        case 'word':
            // Leading/trailing whitespace makes split() emit empty strings.
            return text.split(/\s+/).filter(Boolean);
        default:
            return [text];
    }
}
/**
 * Build a single chunk record around a piece of text.
 *
 * @param {string} content - The chunk's text.
 * @param {number} index - Position of the chunk, stored as `metadata.index`.
 * @param {number} total - Chunk count, stored as `metadata.totalChunks`.
 * @param {string} parentId - Id shared by sibling chunks, stored as `metadata.parentId`.
 * @returns {{content: string, metadata: object}} The text plus metadata holding
 *   a freshly generated v4 UUID (`id`) and the text length (`originalSize`).
 */
function createChunk(content, index, total, parentId) {
    // Assemble the metadata first; the id is generated per call.
    const metadata = {
        id: (0, uuid_1.v4)(),
        index: index,
        totalChunks: total,
        parentId: parentId,
        originalSize: content.length,
    };
    return { content: content, metadata: metadata };
}
//# sourceMappingURL=text-splitter.js.map