UNPKG

vector-chunker

Version:

A flexible text and data chunking library for vector databases and LLMs

70 lines 2.71 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.splitText = splitText;
const uuid_1 = require("uuid");

/**
 * Splits `text` into an ordered list of chunks that are linked to each other
 * through their metadata.
 *
 * Two strategies are used:
 *  - When `preserveContext` is false or `splitOn` is 'character', the text is
 *    cut into consecutive slices of `chunkSize` characters (`overlap` is not
 *    applied on this path).
 *  - Otherwise the text is broken into segments (sentences, paragraphs or
 *    words) that are packed greedily into chunks of at most `chunkSize`
 *    characters; each new chunk starts with the last `overlap` characters of
 *    the segment that preceded it. A single segment longer than `chunkSize`
 *    is kept whole.
 *
 * @param {string} text - The text to split.
 * @param {object} [options] - Chunking options.
 * @param {number} [options.chunkSize=4000] - Target chunk length in characters.
 * @param {number} [options.overlap=200] - Characters of the previous segment
 *   carried into the next chunk (segment-based strategy only).
 * @param {'character'|'sentence'|'paragraph'|'word'} [options.splitOn='character']
 *   Unit used to break the text. Any other value treats the text as one segment.
 * @param {boolean} [options.preserveContext=true] - When false, always use
 *   plain character slicing.
 * @returns {Array<{content: string, metadata: object}>} Chunks whose metadata
 *   holds `id`, `index`, `totalChunks`, `parentId`, `originalSize`,
 *   `previousChunk` and `nextChunk`.
 * @throws {RangeError} If `chunkSize` is not a positive number.
 */
function splitText(text, options) {
    const { chunkSize = 4000, overlap = 200, splitOn = 'character', preserveContext = true } = options == null ? {} : options;
    // A zero, negative, NaN or non-numeric size would make the character loop
    // never advance, so reject it up front instead of hanging.
    if (typeof chunkSize !== 'number' || !(chunkSize > 0)) {
        throw new RangeError(`chunkSize must be a positive number, received ${String(chunkSize)}`);
    }
    const parentId = (0, uuid_1.v4)();
    const contents = !preserveContext || splitOn === 'character'
        ? sliceByCharacters(text, chunkSize)
        : packSegments(segmentText(text, splitOn), chunkSize, overlap);
    // totalChunks is taken from the finished list so it always matches the
    // number of chunks actually returned.
    const chunks = contents.map((content, index) => createChunk(content, index, contents.length, parentId));
    // Link each chunk to its neighbours; the ends get an explicit undefined.
    chunks.forEach((chunk, idx) => {
        chunk.metadata.previousChunk = idx > 0 ? chunks[idx - 1].metadata.id : undefined;
        chunk.metadata.nextChunk = idx < chunks.length - 1 ? chunks[idx + 1].metadata.id : undefined;
    });
    return chunks;
}

/** Cuts `text` into consecutive slices of `chunkSize` characters. */
function sliceByCharacters(text, chunkSize) {
    const pieces = [];
    for (let start = 0; start < text.length; start += chunkSize) {
        // slice() clamps the end index to the string length.
        pieces.push(text.slice(start, start + chunkSize));
    }
    return pieces;
}

/** Breaks `text` into sentence, paragraph or word segments according to `splitOn`. */
function segmentText(text, splitOn) {
    if (splitOn === 'sentence') {
        // The second alternative keeps trailing text that has no closing
        // punctuation, which would otherwise be silently dropped.
        const sentences = (text.match(/[^.!?]*[.!?]+|[^.!?]+$/g) || []).filter((part) => part.trim() !== '');
        return sentences.length > 0 ? sentences : [text];
    }
    if (splitOn === 'paragraph') {
        return text.split(/\n\s*\n/).filter(Boolean);
    }
    if (splitOn === 'word') {
        // Leading/trailing whitespace yields empty strings; drop them.
        return text.split(/\s+/).filter(Boolean);
    }
    return [text];
}

/**
 * Greedily packs segments (joined by single spaces) into strings of at most
 * `chunkSize` characters, seeding every new string with the tail of the
 * segment that came just before it.
 */
function packSegments(segments, chunkSize, overlap) {
    const packed = [];
    let current = '';
    let previous = '';
    for (const segment of segments) {
        const candidate = current ? `${current} ${segment}` : segment;
        if (current && candidate.length > chunkSize) {
            packed.push(current);
            // slice(-0) is slice(0) and would copy the whole previous segment,
            // so only take a tail when overlap is strictly positive.
            const tail = overlap > 0 ? previous.slice(-overlap) : '';
            current = tail ? `${tail} ${segment}` : segment;
        }
        else {
            current = candidate;
        }
        previous = segment;
    }
    if (current) {
        packed.push(current);
    }
    return packed;
}

/**
 * Builds one chunk record with a fresh v4 UUID.
 *
 * @param {string} content - Chunk text.
 * @param {number} index - Zero-based position of the chunk.
 * @param {number} total - Value stored as `metadata.totalChunks`.
 * @param {string} parentId - Id shared by all chunks from one splitText call.
 * @returns {{content: string, metadata: object}} The chunk record.
 */
function createChunk(content, index, total, parentId) {
    return {
        content,
        metadata: {
            id: (0, uuid_1.v4)(),
            index,
            totalChunks: total,
            parentId,
            originalSize: content.length
        }
    };
}
//# sourceMappingURL=text-splitter.js.map