@gravityai-dev/pinecone
Version:
Pinecone vector database nodes for GravityWorkflow - knowledge management and vector operations
184 lines • 6.39 kB
JavaScript
"use strict";
/**
* Text chunking strategies for vector embeddings
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.fixedSizeChunking = fixedSizeChunking;
exports.sentenceBasedChunking = sentenceBasedChunking;
exports.paragraphBasedChunking = paragraphBasedChunking;
exports.chunkText = chunkText;
/**
 * Fixed-size chunking with overlap.
 *
 * Slides a window of at most `maxChunkSize` characters over `text`. When a
 * window does not reach the end of the text and a space lies in its last 20%,
 * the window is shortened to end at that space so words are not cut in half.
 *
 * @param {string} text - Text to split.
 * @param {Object} options - Chunking options.
 * @param {number} options.maxChunkSize - Maximum window size in characters.
 *   When this is not a positive number, no chunks are produced.
 * @param {number} [options.overlapSize=0] - Characters shared by consecutive
 *   windows. Missing or negative values are treated as 0.
 * @param {number} [options.minChunkSize=50] - Windows whose trimmed text is
 *   shorter than this are skipped; it is also the minimum distance the window
 *   advances on each step.
 * @returns {Array<{text: string, metadata: Object}>} Chunks whose metadata
 *   holds `chunkIndex`, `startOffset`, `endOffset` (window bounds in `text`,
 *   before trimming) and `totalChunks`.
 */
function fixedSizeChunking(text, options) {
    const { maxChunkSize, overlapSize = 0, minChunkSize = 50 } = options;
    const chunks = [];
    // Also false for undefined/NaN: there is no usable window size.
    if (!(maxChunkSize > 0)) {
        return chunks;
    }
    // A missing overlap used to turn the next start offset into NaN, which
    // silently ended the loop after the first chunk.
    const overlap = overlapSize > 0 ? overlapSize : 0;
    // Always move forward by at least one character so that a large overlap
    // combined with minChunkSize = 0 cannot loop forever.
    const minAdvance = minChunkSize > 0 ? minChunkSize : 1;
    let startOffset = 0;
    while (startOffset < text.length) {
        // Ideal end of the window, clamped to the end of the text.
        let endOffset = Math.min(startOffset + maxChunkSize, text.length);
        if (endOffset < text.length) {
            // Prefer a word boundary, but only if it keeps >80% of the window.
            const lastSpace = text.lastIndexOf(' ', endOffset);
            if (lastSpace > startOffset + (maxChunkSize * 0.8)) {
                endOffset = lastSpace;
            }
        }
        const piece = text.slice(startOffset, endOffset).trim();
        if (piece.length > 0 && piece.length >= minChunkSize) {
            chunks.push({
                text: piece,
                metadata: {
                    chunkIndex: chunks.length,
                    startOffset,
                    endOffset,
                },
            });
        }
        if (endOffset >= text.length) {
            break;
        }
        // Step back by the overlap, but never less than minAdvance forward.
        startOffset = Math.max(endOffset - overlap, startOffset + minAdvance);
    }
    // The total is only known once every chunk has been collected.
    chunks.forEach(chunk => {
        chunk.metadata.totalChunks = chunks.length;
    });
    return chunks;
}
/**
 * Sentence-based chunking.
 *
 * Splits `text` into sentences (runs ending in `.`, `!` or `?`, plus any
 * trailing text without terminal punctuation) and packs whole sentences,
 * joined by single spaces, into chunks of at most `maxChunkSize` characters.
 * A single sentence longer than `maxChunkSize` is kept as one oversized chunk.
 *
 * @param {string} text - Text to split.
 * @param {Object} options - Chunking options.
 * @param {number} options.maxChunkSize - Target maximum chunk length. When it
 *   is missing, every sentence ends up in a single chunk.
 * @param {number} [options.overlapSize=0] - Minimum number of source
 *   characters to repeat from the preceding sentences at the start of the
 *   next chunk. Whole sentences are repeated, so the actual overlap may be
 *   larger.
 * @param {number} [options.minChunkSize=50] - The final chunk is dropped when
 *   it is shorter than this.
 * @returns {Array<{text: string, metadata: Object}>} Chunks whose metadata
 *   holds `chunkIndex`, `startOffset` (index in `text` of the chunk's first
 *   sentence), `endOffset` (index where the following sentence starts, or
 *   `text.length` for the last chunk) and `totalChunks`.
 */
function sentenceBasedChunking(text, options) {
    const { maxChunkSize, overlapSize = 0, minChunkSize = 50 } = options;
    const sentences = locateSentences(text);
    const chunks = [];
    let currentChunk = '';
    let currentStartOffset = 0;
    for (let i = 0; i < sentences.length; i++) {
        const { value, start } = sentences[i];
        // +1 accounts for the space that joins the sentence to the chunk.
        const wouldOverflow = currentChunk.length + 1 + value.length > maxChunkSize;
        if (currentChunk.length > 0 && wouldOverflow) {
            chunks.push({
                text: currentChunk,
                metadata: {
                    chunkIndex: chunks.length,
                    startOffset: currentStartOffset,
                    endOffset: start,
                },
            });
            // Seed the next chunk with enough preceding sentences to cover
            // the requested overlap, then the sentence that did not fit.
            const parts = [value];
            let overlapLength = 0;
            currentStartOffset = start;
            for (let j = i - 1; j >= 0 && overlapLength < overlapSize; j--) {
                parts.unshift(sentences[j].value);
                overlapLength += sentences[j].rawLength;
                currentStartOffset = sentences[j].start;
            }
            currentChunk = parts.join(' ');
        }
        else if (currentChunk.length === 0) {
            currentChunk = value;
            currentStartOffset = start;
        }
        else {
            currentChunk += ' ' + value;
        }
    }
    // Add the last chunk unless it is empty or too short to be useful.
    if (currentChunk.length > 0 && currentChunk.length >= minChunkSize) {
        chunks.push({
            text: currentChunk,
            metadata: {
                chunkIndex: chunks.length,
                startOffset: currentStartOffset,
                endOffset: text.length,
            },
        });
    }
    // The total is only known once every chunk has been collected.
    chunks.forEach(chunk => {
        chunk.metadata.totalChunks = chunks.length;
    });
    return chunks;
}
/**
 * Finds every sentence in `text` together with its position.
 *
 * @param {string} text - Text to scan.
 * @returns {Array<{value: string, start: number, rawLength: number}>} For each
 *   sentence: the trimmed text, the index of its first non-whitespace
 *   character in `text`, and its length before trimming.
 */
function locateSentences(text) {
    // The `$` alternative keeps a trailing fragment that has no terminal
    // punctuation instead of discarding it.
    const pattern = /[^.!?]+(?:[.!?]+|$)/g;
    const found = [];
    let match = pattern.exec(text);
    while (match !== null) {
        const raw = match[0];
        const value = raw.trim();
        if (value.length > 0) {
            found.push({
                value,
                start: match.index + raw.search(/\S/),
                rawLength: raw.length,
            });
        }
        match = pattern.exec(text);
    }
    return found;
}
/**
 * Paragraph-based chunking.
 *
 * Splits `text` on blank lines (LF or CRLF), trims each paragraph, and packs
 * whole paragraphs, joined by a blank line, into chunks of at most
 * `maxChunkSize` characters. A single paragraph longer than `maxChunkSize`
 * is kept as one oversized chunk. This strategy does not apply any overlap
 * between chunks; `options.overlapSize` is ignored.
 *
 * @param {string} text - Text to split.
 * @param {Object} options - Chunking options.
 * @param {number} options.maxChunkSize - Target maximum chunk length. When it
 *   is missing, every paragraph ends up in a single chunk.
 * @param {number} [options.minChunkSize=50] - The final chunk is dropped when
 *   it is shorter than this.
 * @returns {Array<{text: string, metadata: Object}>} Chunks whose metadata
 *   holds `chunkIndex`, `startOffset` (index in `text` of the chunk's first
 *   paragraph), `endOffset` (index just past the chunk's last paragraph, or
 *   `text.length` for the last chunk) and `totalChunks`.
 */
function paragraphBasedChunking(text, options) {
    const { maxChunkSize, minChunkSize = 50 } = options;
    const joiner = '\n\n';
    // Split on blank lines and discard whitespace-only pieces.
    const paragraphs = text
        .split(/\n\n+|\r\n\r\n+/)
        .map(p => p.trim())
        .filter(p => p.length > 0);
    const chunks = [];
    let currentChunk = '';
    let currentStartOffset = 0;
    let currentEndOffset = 0;
    let searchFrom = 0;
    for (const paragraph of paragraphs) {
        // Only whitespace separates searchFrom from the paragraph, so the
        // first match at or after searchFrom is its real position.
        const paragraphStart = text.indexOf(paragraph, searchFrom);
        const paragraphEnd = paragraphStart + paragraph.length;
        searchFrom = paragraphEnd;
        if (currentChunk.length === 0) {
            currentChunk = paragraph;
            currentStartOffset = paragraphStart;
        }
        else if (currentChunk.length + joiner.length + paragraph.length > maxChunkSize) {
            // The joiner is counted so the emitted text respects the limit.
            chunks.push({
                text: currentChunk,
                metadata: {
                    chunkIndex: chunks.length,
                    startOffset: currentStartOffset,
                    endOffset: currentEndOffset,
                },
            });
            currentChunk = paragraph;
            currentStartOffset = paragraphStart;
        }
        else {
            currentChunk += joiner + paragraph;
        }
        currentEndOffset = paragraphEnd;
    }
    // Add the last chunk unless it is empty or too short to be useful.
    if (currentChunk.length > 0 && currentChunk.length >= minChunkSize) {
        chunks.push({
            text: currentChunk,
            metadata: {
                chunkIndex: chunks.length,
                startOffset: currentStartOffset,
                endOffset: text.length,
            },
        });
    }
    // The total is only known once every chunk has been collected.
    chunks.forEach(chunk => {
        chunk.metadata.totalChunks = chunks.length;
    });
    return chunks;
}
/**
 * Main chunking function that delegates to specific strategies.
 *
 * Every option has its own default, so a partial options object (for example
 * `{ strategy: 'sentence' }`) still gets sensible sizes instead of passing
 * `undefined` values on to the strategy.
 *
 * @param {string} text - Text to split.
 * @param {Object} [options] - Chunking options.
 * @param {string} [options.strategy='fixed'] - `'fixed'`, `'sentence'` or
 *   `'paragraph'`; any other value falls back to fixed-size chunking.
 * @param {number} [options.maxChunkSize=1000] - Maximum chunk size.
 * @param {number} [options.overlapSize=200] - Overlap between chunks.
 * @param {number} [options.minChunkSize=50] - Minimum chunk size.
 * @returns {Array} The chunks produced by the selected strategy.
 */
function chunkText(text, options = {}) {
    const { strategy = 'fixed', maxChunkSize = 1000, overlapSize = 200, minChunkSize = 50, } = options || {};
    // Keep any extra caller-supplied keys, but make sure the four known
    // options are always defined.
    const resolved = Object.assign({}, options, {
        strategy,
        maxChunkSize,
        overlapSize,
        minChunkSize,
    });
    switch (strategy) {
        case 'sentence':
            return sentenceBasedChunking(text, resolved);
        case 'paragraph':
            return paragraphBasedChunking(text, resolved);
        case 'fixed':
        default:
            return fixedSizeChunking(text, resolved);
    }
}
//# sourceMappingURL=strategies.js.map