@just-every/mcp-read-website-fast
Version:
Markdown Content Preprocessor - Fetch web pages, extract content, convert to clean Markdown
149 lines (148 loc) • 5.17 kB
JavaScript
/**
 * Splits Markdown text into ordered chunks bounded by a character budget.
 *
 * Three strategies are available via `options.splitOn`:
 *  - `'heading'`   – split before each heading line and whenever the running
 *                    chunk grows past `maxChars`; consecutive chunks share
 *                    trailing lines as overlap.
 *  - `'paragraph'` – pack blank-line-separated paragraphs up to `maxChars`.
 *  - `'sentence'`  – pack sentences up to `maxChars`.
 *
 * Every chunk is `{ content, index }`; heading chunks additionally carry
 * `metadata: { headings, startLine, endLine }` (0-based, inclusive line
 * numbers into the input). Chunks whose trimmed content is empty are never
 * emitted, so empty input produces an empty array.
 */
export class MarkdownChunker {
  /** Matches a line that starts with one or more `#` followed by whitespace. */
  static #HEADING_LINE = /^#+\s/;

  /** Matches the delimiter of a fenced code block (``` or ~~~, 3+ characters). */
  static #FENCE_LINE = /^ {0,3}(`{3,}|~{3,})/;

  /** Resolved options: `maxTokens`, `maxChars`, `splitOn`, `overlap`. */
  options;

  /**
   * @param {object} [options]
   * @param {number} [options.maxTokens=0] Stored on `this.options`, but not
   *   consulted by any splitting method in this class.
   * @param {number} [options.maxChars=4000] Character budget per chunk.
   * @param {'heading'|'paragraph'|'sentence'} [options.splitOn='heading']
   * @param {number} [options.overlap=200] Approximate number of trailing
   *   characters repeated at the start of the next heading-mode chunk.
   */
  constructor(options = {}) {
    this.options = {
      maxTokens: options.maxTokens ?? 0,
      maxChars: options.maxChars ?? 4000,
      splitOn: options.splitOn ?? 'heading',
      overlap: options.overlap ?? 200,
    };
  }

  /**
   * Chunks `markdown` with the configured strategy. Unrecognised `splitOn`
   * values fall back to heading mode.
   *
   * @param {string} markdown
   * @returns {Array<object>} Chunks in document order.
   */
  chunk(markdown) {
    switch (this.options.splitOn) {
      case 'paragraph':
        return this.chunkByParagraph(markdown);
      case 'sentence':
        return this.chunkBySentence(markdown);
      case 'heading':
      default:
        return this.chunkByHeading(markdown);
    }
  }

  /**
   * Splits before every heading line and whenever the running chunk exceeds
   * `maxChars`. Lines inside fenced code blocks are never treated as
   * headings, so a `# comment` in a code sample does not split the block.
   *
   * @param {string} markdown
   * @returns {Array<{content: string, index: number, metadata: {headings: string[], startLine: number, endLine: number}}>}
   */
  chunkByHeading(markdown) {
    const { maxChars } = this.options;
    const chunks = [];
    const lines = markdown.split('\n');
    let currentChunk = [];
    let currentHeadings = [];
    let startLine = 0;
    // Length of currentChunk.join('\n'), maintained incrementally so the
    // buffer is not re-joined on every line.
    let currentSize = 0;
    // Lines in currentChunk that were NOT carried over as overlap. A buffer
    // holding only overlap has already been emitted in full.
    let freshLines = 0;
    // Marker of the currently open code fence, or null when outside a fence.
    let openFence = null;

    const emit = (endLine) => {
      if (freshLines === 0) {
        return;
      }
      const content = currentChunk.join('\n').trim();
      if (content.length === 0) {
        return;
      }
      chunks.push({
        content,
        index: chunks.length,
        metadata: {
          headings: [...currentHeadings],
          startLine,
          endLine,
        },
      });
    };

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i];
      openFence = MarkdownChunker.#advanceFence(openFence, line);
      const isHeading =
        openFence === null && MarkdownChunker.#HEADING_LINE.test(line);

      if (isHeading && currentChunk.length > 0) {
        // A new section begins: close the previous chunk just before it.
        emit(i - 1);
        const carried = this.getOverlapLines(currentChunk);
        currentChunk = [...carried, line];
        currentHeadings = [line];
        startLine = i - carried.length;
        freshLines = 1;
        currentSize = MarkdownChunker.#joinedLength(currentChunk);
      } else {
        currentSize += line.length + (currentChunk.length > 0 ? 1 : 0);
        currentChunk.push(line);
        freshLines += 1;
        if (isHeading) {
          currentHeadings.push(line);
        }
      }

      if (currentSize > maxChars) {
        // Size split: the current line is part of the chunk being closed.
        emit(i);
        currentChunk = this.getOverlapLines(currentChunk);
        currentHeadings = [];
        startLine = i - currentChunk.length + 1;
        freshLines = 0;
        currentSize = MarkdownChunker.#joinedLength(currentChunk);
      }
    }

    emit(lines.length - 1);
    return chunks;
  }

  /**
   * Packs paragraphs (separated by one or more blank lines) into chunks whose
   * joined length stays within `maxChars` whenever a single paragraph fits.
   *
   * @param {string} markdown
   * @returns {Array<{content: string, index: number}>}
   */
  chunkByParagraph(markdown) {
    return this.#packPieces(markdown.split(/\n\n+/), '\n\n');
  }

  /**
   * Packs sentences into chunks within `maxChars`. A sentence is a run of
   * text ending in `.`, `!` or `?`; trailing text without terminal
   * punctuation is kept as a final sentence rather than discarded.
   *
   * @param {string} markdown
   * @returns {Array<{content: string, index: number}>}
   */
  chunkBySentence(markdown) {
    // `|$` lets the last alternative capture an unterminated tail.
    const matches = markdown.match(/[^.!?]+(?:[.!?]+|$)/g) || [markdown];
    const sentences = matches
      .map((sentence) => sentence.trim())
      .filter((sentence) => sentence.length > 0);
    return this.#packPieces(sentences, ' ');
  }

  /**
   * Returns the trailing lines of `lines` to repeat at the start of the next
   * chunk: the shortest suffix whose length (counting one newline per line)
   * reaches `options.overlap`. A line is not carried if doing so would make
   * the overlap alone longer than `options.maxChars`, so one oversized line is
   * not duplicated into the following chunks.
   *
   * @param {string[]} lines
   * @returns {string[]} A new array; empty when `overlap <= 0`.
   */
  getOverlapLines(lines) {
    const { overlap, maxChars } = this.options;
    if (overlap <= 0) {
      return [];
    }
    let start = lines.length;
    let carriedChars = 0; // includes one '\n' per carried line
    while (start > 0 && carriedChars < overlap) {
      const candidate = lines[start - 1];
      if (carriedChars + candidate.length > maxChars) {
        break;
      }
      start -= 1;
      carriedChars += candidate.length + 1;
    }
    return lines.slice(start);
  }

  /**
   * Rough token estimate: one token per four characters, rounded up.
   *
   * @param {string} text
   * @returns {number}
   */
  estimateTokens(text) {
    return Math.ceil(text.length / 4);
  }

  /**
   * Greedily groups `pieces` into chunks joined by `separator`, starting a
   * new chunk when adding the next piece (plus its separator) would exceed
   * `maxChars`. A single piece longer than the budget becomes its own chunk.
   */
  #packPieces(pieces, separator) {
    const { maxChars } = this.options;
    const chunks = [];
    let current = [];
    let size = 0; // length of current.join(separator)

    const emit = () => {
      const content = current.join(separator).trim();
      if (content.length > 0) {
        chunks.push({ content, index: chunks.length });
      }
    };

    for (const piece of pieces) {
      let projected =
        current.length > 0
          ? size + separator.length + piece.length
          : piece.length;
      if (current.length > 0 && projected > maxChars) {
        emit();
        current = [];
        projected = piece.length;
      }
      current.push(piece);
      size = projected;
    }
    emit();
    return chunks;
  }

  /** Length of `lines.join('\n')` without building the string. */
  static #joinedLength(lines) {
    let total = Math.max(lines.length - 1, 0);
    for (const line of lines) {
      total += line.length;
    }
    return total;
  }

  /**
   * Updates fenced-code-block state for one line. Returns the open fence
   * marker (e.g. "```") while inside a block, or null when outside.
   */
  static #advanceFence(openFence, line) {
    const match = MarkdownChunker.#FENCE_LINE.exec(line);
    if (match === null) {
      return openFence;
    }
    const marker = match[1];
    const rest = line.slice(match[0].length);
    if (openFence === null) {
      // A backtick fence whose info string contains a backtick is inline
      // code, not a fence opener.
      return marker[0] === '`' && rest.includes('`') ? null : marker;
    }
    const closes =
      marker[0] === openFence[0] &&
      marker.length >= openFence.length &&
      rest.trim() === '';
    return closes ? null : openFence;
  }
}