UNPKG

@gravityai-dev/pinecone

Version:

Pinecone vector database nodes for GravityWorkflow - knowledge management and vector operations

184 lines 6.39 kB
"use strict"; /** * Text chunking strategies for vector embeddings */ Object.defineProperty(exports, "__esModule", { value: true }); exports.fixedSizeChunking = fixedSizeChunking; exports.sentenceBasedChunking = sentenceBasedChunking; exports.paragraphBasedChunking = paragraphBasedChunking; exports.chunkText = chunkText; /** * Fixed-size chunking with overlap */ function fixedSizeChunking(text, options) { const { maxChunkSize, overlapSize, minChunkSize = 50 } = options; const chunks = []; let startOffset = 0; let chunkIndex = 0; while (startOffset < text.length) { // Calculate the ideal end position based on maxChunkSize let endOffset = Math.min(startOffset + maxChunkSize, text.length); // Try to break at a word boundary if we're not at the end if (endOffset < text.length) { const lastSpace = text.lastIndexOf(' ', endOffset); // Only adjust if we find a space that's reasonably close to our target if (lastSpace > startOffset + (maxChunkSize * 0.8)) { endOffset = lastSpace; } } const chunkText = text.slice(startOffset, endOffset).trim(); if (chunkText.length >= minChunkSize) { chunks.push({ text: chunkText, metadata: { chunkIndex, startOffset, endOffset, }, }); chunkIndex++; } // Move to next chunk with overlap // Calculate next start based on the actual chunk size, not the adjusted end if (endOffset >= text.length) { break; } // For overlap, go back from the end position startOffset = Math.max(endOffset - overlapSize, startOffset + minChunkSize); } // Add total chunks to metadata chunks.forEach(chunk => { chunk.metadata.totalChunks = chunks.length; }); return chunks; } /** * Sentence-based chunking */ function sentenceBasedChunking(text, options) { const { maxChunkSize, overlapSize, minChunkSize = 50 } = options; // Simple sentence splitting (can be improved with better NLP) const sentences = text.match(/[^.!?]+[.!?]+/g) || [text]; const chunks = []; let currentChunk = ''; let currentStartOffset = 0; let chunkIndex = 0; let sentenceStartOffset = 0; for (let i = 0; i < sentences.length; i++) { const sentence = sentences[i].trim(); const sentenceLength = sentence.length; if (currentChunk.length + sentenceLength > maxChunkSize && currentChunk.length > 0) { // Save current chunk chunks.push({ text: currentChunk.trim(), metadata: { chunkIndex, startOffset: currentStartOffset, endOffset: sentenceStartOffset, }, }); chunkIndex++; // Start new chunk with overlap const overlapSentences = []; let overlapLength = 0; for (let j = i - 1; j >= 0 && overlapLength < overlapSize; j--) { overlapSentences.unshift(sentences[j]); overlapLength += sentences[j].length; } currentChunk = overlapSentences.join(' ') + ' ' + sentence; currentStartOffset = sentenceStartOffset - overlapLength; } else { currentChunk += (currentChunk ? ' ' : '') + sentence; } sentenceStartOffset += sentenceLength + 1; // +1 for space } // Add last chunk if (currentChunk.trim().length >= minChunkSize) { chunks.push({ text: currentChunk.trim(), metadata: { chunkIndex, startOffset: currentStartOffset, endOffset: text.length, }, }); } // Add total chunks to metadata chunks.forEach(chunk => { chunk.metadata.totalChunks = chunks.length; }); return chunks; } /** * Paragraph-based chunking */ function paragraphBasedChunking(text, options) { const { maxChunkSize, overlapSize, minChunkSize = 50 } = options; // Split by double newlines or multiple spaces const paragraphs = text.split(/\n\n+|\r\n\r\n+/).filter(p => p.trim()); const chunks = []; let currentChunk = ''; let currentStartOffset = 0; let chunkIndex = 0; let paragraphStartOffset = 0; for (let i = 0; i < paragraphs.length; i++) { const paragraph = paragraphs[i].trim(); const paragraphLength = paragraph.length; if (currentChunk.length + paragraphLength > maxChunkSize && currentChunk.length > 0) { // Save current chunk chunks.push({ text: currentChunk.trim(), metadata: { chunkIndex, startOffset: currentStartOffset, endOffset: paragraphStartOffset, }, }); chunkIndex++; // Start new chunk currentChunk = paragraph; currentStartOffset = paragraphStartOffset; } else { currentChunk += (currentChunk ? '\n\n' : '') + paragraph; } paragraphStartOffset = text.indexOf(paragraph, paragraphStartOffset) + paragraphLength; } // Add last chunk if (currentChunk.trim().length >= minChunkSize) { chunks.push({ text: currentChunk.trim(), metadata: { chunkIndex, startOffset: currentStartOffset, endOffset: text.length, }, }); } // Add total chunks to metadata chunks.forEach(chunk => { chunk.metadata.totalChunks = chunks.length; }); return chunks; } /** * Main chunking function that delegates to specific strategies */ function chunkText(text, options = { strategy: 'fixed', maxChunkSize: 1000, overlapSize: 200, minChunkSize: 50, }) { switch (options.strategy) { case 'sentence': return sentenceBasedChunking(text, options); case 'paragraph': return paragraphBasedChunking(text, options); case 'fixed': default: return fixedSizeChunking(text, options); } } //# sourceMappingURL=strategies.js.map