UNPKG

n8n-nodes-rag

Version:

Advanced RAG (Retrieval-Augmented Generation) knowledge base nodes for n8n

129 lines 5.02 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.TextProcessor = void 0; const uuid_1 = require("uuid"); class TextProcessor { static cleanText(text, options) { let cleanedText = text; if (options.normalizeLineBreaks) { cleanedText = cleanedText.replace(/\r\n/g, '\n').replace(/\r/g, '\n'); } if (options.removeExtraWhitespace) { cleanedText = cleanedText.replace(/[ \t]+/g, ' '); } if (options.removeEmptyLines) { cleanedText = cleanedText.replace(/\n\s*\n/g, '\n'); } if (options.trimText) { cleanedText = cleanedText.trim(); } return cleanedText; } static chunkText(text, options, metadata = {}) { switch (options.strategy) { case 'fixed': return this.chunkByFixedSize(text, options, metadata); case 'sentence': return this.chunkBySentence(text, options, metadata); case 'paragraph': return this.chunkByParagraph(text, options, metadata); case 'semantic': return this.chunkBySemantic(text, options, metadata); default: throw new Error(`Unknown chunking strategy: ${options.strategy}`); } } static chunkByFixedSize(text, options, metadata) { const chunks = []; const { chunkSize, overlap } = options; for (let i = 0; i < text.length; i += chunkSize - overlap) { const chunk = text.slice(i, i + chunkSize); if (chunk.trim().length > 0) { chunks.push({ id: (0, uuid_1.v4)(), text: chunk.trim(), metadata: { ...metadata, index: chunks.length }, }); } } return chunks; } static chunkBySentence(text, options, metadata) { const chunks = []; const sentences = text.split(/[.!?]+/).filter(s => s.trim().length > 0); let currentChunk = ''; let currentSize = 0; for (const sentence of sentences) { const trimmedSentence = sentence.trim(); if (currentSize + trimmedSentence.length > options.chunkSize && currentChunk.length > 0) { chunks.push({ id: (0, uuid_1.v4)(), text: currentChunk.trim(), metadata: { ...metadata, index: chunks.length }, }); const overlap = Math.min(options.overlap, currentChunk.length); currentChunk = currentChunk.slice(-overlap) + ' ' + trimmedSentence; currentSize = currentChunk.length; } else { currentChunk += (currentChunk ? ' ' : '') + trimmedSentence; currentSize = currentChunk.length; } } if (currentChunk.trim().length > 0) { chunks.push({ id: (0, uuid_1.v4)(), text: currentChunk.trim(), metadata: { ...metadata, index: chunks.length }, }); } return chunks; } static chunkByParagraph(text, options, metadata) { const chunks = []; const paragraphs = text.split(/\n\s*\n/).filter(p => p.trim().length > 0); let currentChunk = ''; let currentSize = 0; for (const paragraph of paragraphs) { const trimmedParagraph = paragraph.trim(); if (currentSize + trimmedParagraph.length > options.chunkSize && currentChunk.length > 0) { chunks.push({ id: (0, uuid_1.v4)(), text: currentChunk.trim(), metadata: { ...metadata, index: chunks.length }, }); const overlap = Math.min(options.overlap, currentChunk.length); currentChunk = currentChunk.slice(-overlap) + '\n\n' + trimmedParagraph; currentSize = currentChunk.length; } else { currentChunk += (currentChunk ? '\n\n' : '') + trimmedParagraph; currentSize = currentChunk.length; } } if (currentChunk.trim().length > 0) { chunks.push({ id: (0, uuid_1.v4)(), text: currentChunk.trim(), metadata: { ...metadata, index: chunks.length }, }); } return chunks; } static chunkBySemantic(text, options, metadata) { return this.chunkBySentence(text, options, metadata); } static extractMetadata(text, source) { const metadata = { length: text.length, wordCount: text.split(/\s+/).length, createdAt: new Date().toISOString(), }; if (source) { metadata.source = source; } return metadata; } } exports.TextProcessor = TextProcessor; //# sourceMappingURL=TextProcessor.js.map