n8n-nodes-rag
Version:
Advanced RAG (Retrieval-Augmented Generation) knowledge base nodes for n8n
129 lines • 5.02 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.TextProcessor = void 0;
const uuid_1 = require("uuid");
/**
 * Stateless text utilities for a RAG ingestion pipeline: whitespace cleanup,
 * splitting text into chunks, and basic metadata extraction.
 *
 * Every chunk produced by this class has the shape
 * `{ id, text, metadata }`, where `id` is a fresh UUID v4, `text` is the
 * trimmed chunk content and `metadata` is a shallow copy of the caller's
 * metadata plus a zero-based `index` giving the chunk's position in the output.
 */
class TextProcessor {
    /**
     * Applies the enabled cleanup steps to `text`, in a fixed order:
     * line-break normalization, space/tab collapsing, empty-line removal, trim.
     *
     * @param {string} text - Text to clean.
     * @param {object} [options] - Flags selecting which steps run.
     * @param {boolean} [options.normalizeLineBreaks] - Convert `\r\n` and lone `\r` to `\n`.
     * @param {boolean} [options.removeExtraWhitespace] - Collapse runs of spaces/tabs to one space.
     * @param {boolean} [options.removeEmptyLines] - Collapse blank (whitespace-only) lines.
     * @param {boolean} [options.trimText] - Trim leading and trailing whitespace.
     * @returns {string} The cleaned text; unchanged when no flag is set.
     */
    static cleanText(text, options = {}) {
        let cleanedText = text;
        if (options.normalizeLineBreaks) {
            cleanedText = cleanedText.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
        }
        if (options.removeExtraWhitespace) {
            cleanedText = cleanedText.replace(/[ \t]+/g, ' ');
        }
        if (options.removeEmptyLines) {
            cleanedText = cleanedText.replace(/\n\s*\n/g, '\n');
        }
        if (options.trimText) {
            cleanedText = cleanedText.trim();
        }
        return cleanedText;
    }

    /**
     * Splits `text` into chunks using the strategy named in `options.strategy`.
     *
     * @param {string} text - Text to split.
     * @param {object} options - Chunking options.
     * @param {'fixed'|'sentence'|'paragraph'|'semantic'} options.strategy - Strategy to use.
     * @param {number} options.chunkSize - Target chunk length in characters.
     * @param {number} [options.overlap] - Characters shared between consecutive chunks.
     * @param {object} [metadata={}] - Metadata copied onto every chunk.
     * @returns {Array<{id: string, text: string, metadata: object}>} The chunks, in order.
     * @throws {Error} If `options.strategy` is not one of the known strategies.
     */
    static chunkText(text, options, metadata = {}) {
        switch (options.strategy) {
            case 'fixed':
                return this.chunkByFixedSize(text, options, metadata);
            case 'sentence':
                return this.chunkBySentence(text, options, metadata);
            case 'paragraph':
                return this.chunkByParagraph(text, options, metadata);
            case 'semantic':
                return this.chunkBySemantic(text, options, metadata);
            default:
                throw new Error(`Unknown chunking strategy: ${options.strategy}`);
        }
    }

    /**
     * Slides a window of `chunkSize` characters over `text`, advancing by
     * `chunkSize - overlap` characters each step. Whitespace-only windows are
     * skipped.
     *
     * @param {string} text - Text to split.
     * @param {object} options - Chunking options.
     * @param {number} options.chunkSize - Window length in characters; must be >= 1.
     * @param {number} [options.overlap] - Characters shared by consecutive windows.
     *   Missing or non-positive values mean no overlap; values >= `chunkSize`
     *   are clamped to `chunkSize - 1`.
     * @param {object} metadata - Metadata copied onto every chunk.
     * @returns {Array<{id: string, text: string, metadata: object}>} The chunks, in order.
     * @throws {RangeError} If `options.chunkSize` is not a number >= 1.
     */
    static chunkByFixedSize(text, options, metadata) {
        const chunkSize = Math.floor(Number(options.chunkSize));
        if (Number.isNaN(chunkSize) || chunkSize < 1) {
            throw new RangeError(`chunkSize must be a number >= 1, got: ${options.chunkSize}`);
        }
        // Clamping to chunkSize - 1 keeps the step >= 1, so the loop always
        // advances; an unclamped overlap >= chunkSize would never terminate.
        const overlap = TextProcessor.resolveOverlap(options.overlap, chunkSize - 1);
        const step = chunkSize - overlap;
        const chunks = [];
        for (let start = 0; start < text.length; start += step) {
            const piece = text.slice(start, start + chunkSize).trim();
            if (piece.length > 0) {
                chunks.push(TextProcessor.buildChunk(piece, metadata, chunks.length));
            }
            // This window already reached the end of the text; any further
            // window would lie entirely inside it and only add a duplicate.
            if (start + chunkSize >= text.length) {
                break;
            }
        }
        return chunks;
    }

    /**
     * Groups consecutive sentences into chunks of at most roughly
     * `chunkSize` characters. A sentence is a run of text up to and including
     * its trailing `.`, `!` or `?` characters, so the punctuation is kept in
     * the chunk text. A single sentence longer than `chunkSize` is not split.
     *
     * @param {string} text - Text to split.
     * @param {object} options - Chunking options.
     * @param {number} options.chunkSize - Target chunk length in characters.
     * @param {number} [options.overlap] - Number of trailing characters of a
     *   finished chunk that are repeated at the start of the next one.
     * @param {object} metadata - Metadata copied onto every chunk.
     * @returns {Array<{id: string, text: string, metadata: object}>} The chunks, in order.
     */
    static chunkBySentence(text, options, metadata) {
        // match() keeps the terminators that split() would throw away.
        const sentences = (text.match(/[^.!?]+[.!?]*/g) || [])
            .map((sentence) => sentence.trim())
            // Drop fragments that hold nothing except punctuation/whitespace.
            .filter((sentence) => /[^.!?\s]/.test(sentence));
        return TextProcessor.mergeUnits(sentences, ' ', options, metadata);
    }

    /**
     * Groups consecutive paragraphs (separated by one or more blank lines)
     * into chunks of at most roughly `chunkSize` characters. A single
     * paragraph longer than `chunkSize` is not split.
     *
     * @param {string} text - Text to split.
     * @param {object} options - Chunking options.
     * @param {number} options.chunkSize - Target chunk length in characters.
     * @param {number} [options.overlap] - Number of trailing characters of a
     *   finished chunk that are repeated at the start of the next one.
     * @param {object} metadata - Metadata copied onto every chunk.
     * @returns {Array<{id: string, text: string, metadata: object}>} The chunks, in order.
     */
    static chunkByParagraph(text, options, metadata) {
        const paragraphs = text
            .split(/\n\s*\n/)
            .map((paragraph) => paragraph.trim())
            .filter((paragraph) => paragraph.length > 0);
        return TextProcessor.mergeUnits(paragraphs, '\n\n', options, metadata);
    }

    /**
     * Chunks `text` for the 'semantic' strategy. No semantic analysis is
     * performed here: this simply delegates to sentence-based chunking.
     *
     * @param {string} text - Text to split.
     * @param {object} options - Chunking options (see `chunkBySentence`).
     * @param {object} metadata - Metadata copied onto every chunk.
     * @returns {Array<{id: string, text: string, metadata: object}>} The chunks, in order.
     */
    static chunkBySemantic(text, options, metadata) {
        return this.chunkBySentence(text, options, metadata);
    }

    /**
     * Computes basic descriptive metadata for `text`.
     *
     * @param {string} text - Text to describe.
     * @param {string} [source] - Origin of the text; recorded only when truthy.
     * @returns {{length: number, wordCount: number, createdAt: string, source?: string}}
     *   `length` in UTF-16 code units, `wordCount` as the number of
     *   whitespace-separated tokens, and `createdAt` as an ISO-8601 timestamp
     *   taken at call time.
     */
    static extractMetadata(text, source) {
        // Counting non-whitespace runs yields 0 for empty/blank text and is
        // not inflated by leading or trailing whitespace.
        const words = text.match(/\S+/g);
        const metadata = {
            length: text.length,
            wordCount: words ? words.length : 0,
            createdAt: new Date().toISOString(),
        };
        if (source) {
            metadata.source = source;
        }
        return metadata;
    }

    /**
     * Builds one chunk record with a fresh UUID.
     *
     * @private
     * @param {string} text - Final (already trimmed) chunk text.
     * @param {object} metadata - Caller metadata; shallow-copied, never mutated.
     * @param {number} index - Zero-based position of the chunk in the output.
     * @returns {{id: string, text: string, metadata: object}} The chunk record.
     */
    static buildChunk(text, metadata, index) {
        return {
            id: (0, uuid_1.v4)(),
            text,
            metadata: { ...metadata, index },
        };
    }

    /**
     * Normalizes a requested overlap to a usable character count.
     *
     * @private
     * @param {*} overlap - Requested overlap; may be missing or invalid.
     * @param {number} limit - Largest overlap allowed in the current context.
     * @returns {number} 0 for missing, NaN or non-positive input; otherwise
     *   the requested value floored and capped at `limit`.
     */
    static resolveOverlap(overlap, limit) {
        const requested = Math.floor(Number(overlap));
        if (Number.isNaN(requested) || requested <= 0) {
            return 0;
        }
        return Math.min(requested, limit);
    }

    /**
     * Greedily packs pre-trimmed text units into chunks. A chunk is closed as
     * soon as adding the next unit would push it past `options.chunkSize`;
     * the next chunk then starts with the last `options.overlap` characters
     * of the closed one, followed by `separator` and the unit.
     *
     * @private
     * @param {string[]} units - Non-empty, trimmed sentences or paragraphs.
     * @param {string} separator - Text inserted between units inside a chunk.
     * @param {object} options - Chunking options (`chunkSize`, `overlap`).
     * @param {object} metadata - Metadata copied onto every chunk.
     * @returns {Array<{id: string, text: string, metadata: object}>} The chunks, in order.
     */
    static mergeUnits(units, separator, options, metadata) {
        const chunks = [];
        let current = '';
        for (const unit of units) {
            const wouldOverflow = current.length + unit.length > options.chunkSize;
            if (current.length > 0 && wouldOverflow) {
                chunks.push(TextProcessor.buildChunk(current.trim(), metadata, chunks.length));
                // slice(-0) is slice(0) and would carry the WHOLE previous
                // chunk forward, so a zero overlap must carry nothing at all.
                const carried = TextProcessor.resolveOverlap(options.overlap, current.length);
                const tail = carried > 0 ? current.slice(-carried) : '';
                current = tail ? tail + separator + unit : unit;
            }
            else {
                current += (current ? separator : '') + unit;
            }
        }
        if (current.trim().length > 0) {
            chunks.push(TextProcessor.buildChunk(current.trim(), metadata, chunks.length));
        }
        return chunks;
    }
}
exports.TextProcessor = TextProcessor;
//# sourceMappingURL=TextProcessor.js.map