UNPKG

@just-every/mcp-read-website-fast

Version:

Markdown Content Preprocessor - Fetch web pages, extract content, convert to clean Markdown

149 lines (148 loc) 5.17 kB
/**
 * Splits Markdown text into chunks, by heading, paragraph, or sentence.
 *
 * Options (all optional):
 *  - maxChars  {number} Size limit in characters (default 4000).
 *  - splitOn   {'heading'|'paragraph'|'sentence'} Strategy (default 'heading');
 *              any other value falls back to 'heading'.
 *  - overlap   {number} Approximate number of trailing characters (whole
 *              lines) repeated at the start of the next chunk. Only the
 *              heading strategy applies it (default 200).
 *  - maxTokens {number} Stored on `options` (default 0) but not consulted by
 *              any splitting strategy in this class.
 */
export class MarkdownChunker {
    options;

    /**
     * @param {object} [options] See the class description for the fields.
     */
    constructor(options = {}) {
        this.options = {
            maxTokens: options.maxTokens ?? 0,
            maxChars: options.maxChars ?? 4000,
            splitOn: options.splitOn ?? 'heading',
            overlap: options.overlap ?? 200,
        };
    }

    /**
     * Chunk `markdown` with the strategy selected by `options.splitOn`.
     *
     * @param {string} markdown
     * @returns {Array<object>} Chunks as `{ content, index }`; the heading
     *   strategy additionally attaches `metadata`.
     */
    chunk(markdown) {
        switch (this.options.splitOn) {
            case 'paragraph':
                return this.chunkByParagraph(markdown);
            case 'sentence':
                return this.chunkBySentence(markdown);
            case 'heading':
            default:
                return this.chunkByHeading(markdown);
        }
    }

    /**
     * Split at ATX headings (`#`-prefixed lines) and additionally whenever
     * the accumulated lines grow past `maxChars`.
     *
     * The size check runs after a line has been appended, so an emitted
     * chunk can be longer than `maxChars` by up to the line that crossed the
     * limit. Chunks that would contain nothing but whitespace or nothing but
     * carried-over overlap lines are not emitted.
     *
     * @param {string} markdown
     * @returns {Array<{content: string, index: number, metadata: {headings: string[], startLine: number, endLine: number}}>}
     *   `startLine`/`endLine` are 0-based, inclusive line indices into
     *   `markdown.split('\n')`.
     */
    chunkByHeading(markdown) {
        const headingPattern = /^#+\s/;
        const chunks = [];
        const lines = markdown.split('\n');
        let currentChunk = [];
        let currentHeadings = [];
        let startLine = 0;
        // True once a non-blank line that is NOT carried-over overlap has been
        // added since the last emitted chunk. Guards against empty chunks and
        // against chunks that merely repeat the previous chunk's tail.
        let hasFreshContent = false;

        const flush = (endLine) => {
            if (!hasFreshContent) {
                return;
            }
            chunks.push({
                content: currentChunk.join('\n').trim(),
                index: chunks.length,
                metadata: {
                    headings: [...currentHeadings],
                    startLine,
                    endLine,
                },
            });
            hasFreshContent = false;
        };

        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];
            const isHeading = headingPattern.test(line);

            if (isHeading && currentChunk.length > 0) {
                // A heading closes the running chunk and opens a new one that
                // begins with the overlap tail followed by the heading itself.
                flush(i - 1);
                const overlapLines = this.getOverlapLines(currentChunk);
                currentChunk = [...overlapLines, line];
                currentHeadings = [line];
                startLine = i - overlapLines.length;
            } else {
                currentChunk.push(line);
                if (isHeading) {
                    currentHeadings.push(line);
                }
            }

            if (line.trim() !== '') {
                hasFreshContent = true;
            }

            if (currentChunk.join('\n').length > this.options.maxChars) {
                flush(i);
                const overlapLines = this.getOverlapLines(currentChunk);
                currentChunk = [...overlapLines];
                currentHeadings = [];
                startLine = i - overlapLines.length + 1;
            }
        }

        flush(lines.length - 1);
        return chunks;
    }

    /**
     * Pack blank-line-separated paragraphs into chunks of at most `maxChars`
     * characters, counting the `\n\n` separators. A single paragraph longer
     * than the limit is emitted as its own oversized chunk.
     *
     * @param {string} markdown
     * @returns {Array<{content: string, index: number}>}
     */
    chunkByParagraph(markdown) {
        return this.#packSegments(markdown.split(/\n\n+/), '\n\n');
    }

    /**
     * Pack sentences into chunks of at most `maxChars` characters, counting
     * the single-space separators. A single sentence longer than the limit is
     * emitted as its own oversized chunk.
     *
     * A sentence ends at a run of `.`, `!` or `?` that is followed by
     * whitespace, so tokens such as `3.14` or `example.com` stay intact, and
     * trailing text without terminal punctuation is kept as a final sentence.
     *
     * @param {string} markdown
     * @returns {Array<{content: string, index: number}>}
     */
    chunkBySentence(markdown) {
        const sentences = markdown
            .split(/(?<=[.!?])\s+/)
            .map((sentence) => sentence.trim())
            .filter((sentence) => sentence.length > 0);
        return this.#packSegments(sentences, ' ');
    }

    /**
     * Greedily group `segments`, joined by `separator`, so that each joined
     * group stays within `maxChars` where possible. Groups whose trimmed
     * content is empty are skipped.
     *
     * @param {string[]} segments
     * @param {string} separator
     * @returns {Array<{content: string, index: number}>}
     */
    #packSegments(segments, separator) {
        const chunks = [];
        let pending = [];

        const emit = () => {
            const content = pending.join(separator).trim();
            if (content.length > 0) {
                chunks.push({ content, index: chunks.length });
            }
            pending = [];
        };

        for (const segment of segments) {
            // Measure the group as it would actually be joined, so the
            // separators count toward the limit.
            const projectedLength = [...pending, segment].join(separator).length;
            if (pending.length > 0 && projectedLength > this.options.maxChars) {
                emit();
            }
            pending.push(segment);
        }

        if (pending.length > 0) {
            emit();
        }
        return chunks;
    }

    /**
     * Return the trailing lines of `lines` to repeat at the start of the next
     * chunk: lines are taken from the end until their combined length (plus
     * one per line for the newline) reaches `options.overlap`. At least one
     * line is returned when `overlap > 0` and `lines` is non-empty.
     *
     * @param {string[]} lines
     * @returns {string[]} Overlap lines in their original order.
     */
    getOverlapLines(lines) {
        if (this.options.overlap <= 0) {
            return [];
        }
        const overlapLines = [];
        let overlapChars = 0;
        for (let i = lines.length - 1; i >= 0; i--) {
            overlapLines.unshift(lines[i]);
            overlapChars += lines[i].length + 1;
            if (overlapChars >= this.options.overlap) {
                break;
            }
        }
        return overlapLines;
    }

    /**
     * Rough token estimate: one token per four characters, rounded up.
     *
     * @param {string} text
     * @returns {number}
     */
    estimateTokens(text) {
        return Math.ceil(text.length / 4);
    }
}