dtamind-components
Version:
DTAmindai Components
153 lines • 5.65 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
const utils_1 = require("../../../src/utils");
const text_splitter_1 = require("langchain/text_splitter");
/**
 * Node definition for a Markdown text splitter.
 *
 * By default `init` returns a LangChain `MarkdownTextSplitter` configured with
 * the requested chunk size and overlap. When the "Split by Headers" input is
 * set to a header level, `init` instead returns a lightweight splitter object
 * that cuts the text at every ATX header (`#`, `##`, ...) whose depth is at or
 * above that level.
 *
 * NOTE: in header mode the sections are returned as-is; they are not re-split
 * to honour `chunkSize` / `chunkOverlap`.
 */
class MarkdownTextSplitter_TextSplitters {
    /** Declares the node's display metadata and its configurable inputs. */
    constructor() {
        this.label = 'Markdown Text Splitter';
        this.name = 'markdownTextSplitter';
        this.version = 1.1;
        this.type = 'MarkdownTextSplitter';
        this.icon = 'markdownTextSplitter.svg';
        this.category = 'Text Splitters';
        this.description = `Split your content into documents based on the Markdown headers`;
        this.baseClasses = [this.type, ...(0, utils_1.getBaseClasses)(text_splitter_1.MarkdownTextSplitter)];
        this.inputs = [
            {
                label: 'Chunk Size',
                name: 'chunkSize',
                type: 'number',
                description: 'Number of characters in each chunk. Default is 1000.',
                default: 1000,
                optional: true
            },
            {
                label: 'Chunk Overlap',
                name: 'chunkOverlap',
                type: 'number',
                description: 'Number of characters to overlap between chunks. Default is 200.',
                default: 200,
                optional: true
            },
            {
                label: 'Split by Headers',
                name: 'splitByHeaders',
                type: 'options',
                description: 'Split documents at specified header levels. Headers will be included with their content.',
                default: 'disabled',
                options: [
                    {
                        label: 'Disabled',
                        name: 'disabled'
                    },
                    {
                        label: '# Headers (H1)',
                        name: 'h1'
                    },
                    {
                        label: '## Headers (H2)',
                        name: 'h2'
                    },
                    {
                        label: '### Headers (H3)',
                        name: 'h3'
                    },
                    {
                        label: '#### Headers (H4)',
                        name: 'h4'
                    },
                    {
                        label: '##### Headers (H5)',
                        name: 'h5'
                    },
                    {
                        label: '###### Headers (H6)',
                        name: 'h6'
                    }
                ],
                optional: true
            }
        ];
    }

    /**
     * Converts a raw numeric input (number or numeric string) to an integer.
     *
     * @param {*} value - Raw input value as received from the node inputs.
     * @returns {number|undefined} The parsed integer, or `undefined` when the
     *   value is missing, blank, or not a number.
     */
    static parseIntegerInput(value) {
        if (value === undefined || value === null || value === '') {
            return undefined;
        }
        const parsed = Number.parseInt(value, 10);
        return Number.isNaN(parsed) ? undefined : parsed;
    }

    /**
     * Builds the splitter for this node.
     *
     * @param {object} nodeData - Node data; `nodeData.inputs` may carry
     *   `chunkSize`, `chunkOverlap` and `splitByHeaders`.
     * @returns {Promise<object>} A `MarkdownTextSplitter` instance, or - when
     *   header splitting is enabled - an object exposing `splitDocuments` and
     *   `splitText` that delegate to {@link splitByHeaders}.
     */
    async init(nodeData) {
        const { chunkSize, chunkOverlap, splitByHeaders } = nodeData.inputs ?? {};
        const parse = MarkdownTextSplitter_TextSplitters.parseIntegerInput;
        const options = {};

        // A size of 0 (or an unparsable value) means "not configured": leave the
        // key unset so the underlying splitter applies its own default.
        const parsedSize = parse(chunkSize);
        if (parsedSize) {
            options.chunkSize = parsedSize;
        }

        // An overlap of 0 is a legitimate setting, so test for "was a number
        // supplied" rather than truthiness; otherwise 0 would be discarded.
        const parsedOverlap = parse(chunkOverlap);
        if (parsedOverlap !== undefined) {
            options.chunkOverlap = parsedOverlap;
        }

        const splitter = new text_splitter_1.MarkdownTextSplitter(options);
        if (!splitByHeaders || splitByHeaders === 'disabled') {
            return splitter;
        }

        return {
            splitDocuments: async (documents) => {
                const results = [];
                for (const doc of documents) {
                    const chunks = await this.splitByHeaders(doc.pageContent, splitByHeaders, splitter);
                    for (const chunk of chunks) {
                        // Each chunk gets its own shallow copy of the source metadata.
                        results.push({
                            pageContent: chunk,
                            metadata: { ...doc.metadata }
                        });
                    }
                }
                return results;
            },
            splitText: async (text) => {
                return await this.splitByHeaders(text, splitByHeaders, splitter);
            }
        };
    }

    /**
     * Splits Markdown into sections that each start at an ATX header whose depth
     * is less than or equal to the requested level. Deeper headers stay inside
     * the section of their parent. Text before the first matching header forms
     * its own section.
     *
     * Lines inside fenced code blocks are never treated as headers, so
     * `# comment` lines in shell/Python snippets do not cut a section. Sections
     * that are empty after trimming are dropped.
     *
     * @param {string} text - Markdown source.
     * @param {string} headerLevel - One of `'h1'` ... `'h6'`.
     * @param {{splitText: function(string): Promise<string[]>}} fallbackSplitter -
     *   Used instead when `headerLevel` is not a recognised level.
     * @returns {Promise<string[]>} Trimmed, non-empty sections in document order.
     */
    async splitByHeaders(text, headerLevel, fallbackSplitter) {
        const maxLevel = this.getHeaderLevel(headerLevel);
        if (maxLevel === 0) {
            return await fallbackSplitter.splitText(text);
        }

        const headerPattern = /^(#{1,6})\s/;
        // Up to three spaces of indentation, then a run of 3+ backticks or tildes.
        const fencePattern = /^ {0,3}(`{3,}|~{3,})/;

        const sections = [];
        let currentSection = [];
        // Marker string of the currently open code fence, or null outside one.
        let openFence = null;

        const flushSection = () => {
            const section = currentSection.join('\n').trim();
            if (section !== '') {
                sections.push(section);
            }
        };

        for (const line of text.split('\n')) {
            const fenceMatch = fencePattern.exec(line);
            const marker = fenceMatch ? fenceMatch[1] : null;
            const afterMarker = fenceMatch ? line.slice(fenceMatch[0].length) : '';

            if (openFence !== null) {
                // A fence closes only on the same character, at least as long as
                // the opener, with nothing but whitespace after it.
                const closes = marker !== null &&
                    marker[0] === openFence[0] &&
                    marker.length >= openFence.length &&
                    afterMarker.trim() === '';
                if (closes) {
                    openFence = null;
                }
                currentSection.push(line);
                continue;
            }

            // A backtick fence cannot have backticks in its info string; such a
            // line is an inline code span, not a fence opener.
            const opens = marker !== null && !(marker[0] === '`' && afterMarker.includes('`'));
            if (opens) {
                openFence = marker;
                currentSection.push(line);
                continue;
            }

            const headerMatch = headerPattern.exec(line);
            if (headerMatch !== null && headerMatch[1].length <= maxLevel) {
                flushSection();
                currentSection = [line];
            }
            else {
                currentSection.push(line);
            }
        }
        flushSection();
        return sections;
    }

    /**
     * Maps a "Split by Headers" option name to its numeric header depth.
     *
     * @param {string} headerLevel - Option name such as `'h2'`.
     * @returns {number} Depth 1-6, or 0 for any unrecognised value.
     */
    getHeaderLevel(headerLevel) {
        switch (headerLevel) {
            case 'h1':
                return 1;
            case 'h2':
                return 2;
            case 'h3':
                return 3;
            case 'h4':
                return 4;
            case 'h5':
                return 5;
            case 'h6':
                return 6;
            default:
                return 0;
        }
    }
}
// CommonJS export: the node class is exposed under the `nodeClass` key.
module.exports = { nodeClass: MarkdownTextSplitter_TextSplitters };
//# sourceMappingURL=MarkdownTextSplitter.js.map