UNPKG

brain-mcp

Version:

Brain MCP Server - Semantic knowledge base access for Claude Code via Model Context Protocol. Provides intelligent search and navigation of files from multiple locations through native MCP tools.

320 lines 12.8 kB
"use strict";
/**
 * Intelligent chunking service for markdown content
 * Creates semantic chunks optimized for embedding and retrieval
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.ChunkingService = void 0;
const types_1 = require("../models/types");

/**
 * Static helpers that turn note content into chunk records of the shape
 * `{ id, content, startLine, endLine, headingContext, chunkType }`.
 */
class ChunkingService {
    static DEFAULT_OPTIONS = {
        // Upper bound on chunk length, in characters. Per the original note this
        // is sized to stay under an 8192 limit (presumably the embedding model's
        // token limit - confirm).
        maxChunkSize: 6000,
        // Characters shared between consecutive chunks (10% of maxChunkSize).
        overlapSize: 600,
        // Not read by any method visible in this file.
        preserveHeadings: true,
        // Chunks whose trimmed content is shorter than this are not emitted.
        minChunkSize: 100
    };

    /**
     * Generate chunks from markdown content using a sliding character window
     * with overlap, breaking at a space when possible.
     *
     * @param {string} content - Full note text.
     * @param {string} title - Currently unused; kept for API compatibility.
     * @param {Array} headings - Currently unused; kept for API compatibility.
     * @param {string} notePath - Used as the prefix of every chunk id.
     * @param {object} [options] - Overrides merged over DEFAULT_OPTIONS.
     * @returns {Array<object>} Chunks with ids `${notePath}#chunk${n}`.
     */
    static createChunks(content, title, headings, notePath, options = {}) {
        const opts = { ...this.DEFAULT_OPTIONS, ...options };
        const chunks = [];
        let chunkIndex = 0;
        let position = 0;
        // The chunkIndex bound is a safety limit on the number of emitted chunks.
        while (position < content.length && chunkIndex < 10000) {
            let chunkEnd = Math.min(position + opts.maxChunkSize, content.length);
            // If not at the end, try to break at a word boundary, but only if
            // that still leaves more than minChunkSize characters in the chunk.
            if (chunkEnd < content.length) {
                const lastSpace = content.lastIndexOf(' ', chunkEnd);
                if (lastSpace > position + opts.minChunkSize) {
                    chunkEnd = lastSpace;
                }
            }
            const chunkContent = content.substring(position, chunkEnd).trim();
            if (chunkContent.length >= opts.minChunkSize) {
                chunks.push({
                    id: `${notePath}#chunk${chunkIndex}`,
                    content: chunkContent,
                    startLine: 0, // Line numbers not meaningful with this approach
                    endLine: 0,
                    headingContext: [],
                    chunkType: types_1.ChunkType.PARAGRAPH
                });
                chunkIndex++;
            }
            if (chunkEnd >= content.length) {
                break;
            }
            // Default stride: a full window minus the overlap.
            let nextPosition = position + opts.maxChunkSize - opts.overlapSize;
            if (nextPosition > chunkEnd) {
                // The word-boundary break ended this chunk before the default
                // stride. Jumping the full stride would skip the text between
                // chunkEnd and nextPosition, so restart from the break instead
                // (with overlap when that still moves forward).
                nextPosition = chunkEnd - opts.overlapSize;
                if (nextPosition <= position) {
                    nextPosition = chunkEnd;
                }
            }
            // Safety check - ensure we're moving forward even with degenerate options.
            position = nextPosition > position ? nextPosition : position + 100;
        }
        return chunks;
    }

    /**
     * Create a title chunk made of the title plus the first meaningful paragraph.
     *
     * @param {string} content - Full note text.
     * @param {string} title - Note title; becomes the first line of the chunk.
     * @param {string} notePath - Used as the prefix of the chunk id.
     * @returns {object|null} The title chunk, or null when the first paragraph
     *   is shorter than 50 characters.
     */
    static createTitleChunk(content, title, notePath) {
        const lines = content.split('\n');
        let firstParagraph = '';
        let endLine = 0;
        let startLine = 0;
        // Skip a complete frontmatter block: an opening '---' on the very first
        // line up to and including the next '---'. Without this, the key/value
        // lines inside the frontmatter would be taken as the first paragraph.
        if (lines[0].trim() === '---') {
            const closingIndex = lines.findIndex((text, index) => index > 0 && text.trim() === '---');
            if (closingIndex !== -1) {
                startLine = closingIndex + 1;
            }
        }
        // Skip remaining delimiter and empty lines
        while (startLine < lines.length &&
            (lines[startLine].trim() === '' || lines[startLine].trim() === '---')) {
            startLine++;
        }
        // Skip title line if it exists
        if (startLine < lines.length && lines[startLine].startsWith('#')) {
            startLine++;
        }
        // Collect first meaningful paragraph
        for (let i = startLine; i < lines.length; i++) {
            const line = lines[i].trim();
            if (line === '') {
                if (firstParagraph.trim().length > 0) {
                    endLine = i;
                    break;
                }
                continue;
            }
            firstParagraph += line + ' ';
            // Stop if we have enough content
            if (firstParagraph.length > 300) {
                endLine = i + 1;
                break;
            }
        }
        if (firstParagraph.trim().length < 50) {
            return null; // Not enough meaningful content
        }
        const titleContent = `${title}\n\n${firstParagraph.trim()}`;
        return {
            id: `${notePath}#title`,
            content: titleContent,
            startLine: 0,
            endLine: endLine,
            headingContext: [title],
            chunkType: types_1.ChunkType.TITLE
        };
    }

    /**
     * Create chunks based on heading structure: one chunk per section, or
     * several when the section exceeds options.maxChunkSize.
     *
     * @param {string[]} lines - Note content split into lines.
     * @param {Array<object>} headings - Headings in document order; each is read
     *   for `lineNumber`, `slug`, `level` and `text`.
     * @param {string} notePath - Used as the prefix of every chunk id.
     * @param {object} options - Read for `minChunkSize`, `maxChunkSize`, `overlapSize`.
     * @returns {Array<object>} Heading-section chunks.
     */
    static createHeadingChunks(lines, headings, notePath, options) {
        const chunks = [];
        for (let i = 0; i < headings.length; i++) {
            const heading = headings[i];
            const nextHeading = i + 1 < headings.length ? headings[i + 1] : null;
            // Determine section boundaries (endLine is inclusive).
            // NOTE(review): lineNumber is used directly as an index into `lines`,
            // so it is presumably 0-based - confirm against the heading parser.
            const startLine = heading.lineNumber;
            const endLine = nextHeading ? nextHeading.lineNumber - 1 : lines.length - 1;
            // Extract section content
            const sectionLines = lines.slice(startLine, endLine + 1);
            const sectionContent = sectionLines.join('\n').trim();
            if (sectionContent.length < options.minChunkSize) {
                continue; // Skip very small sections
            }
            const headingContext = this.buildHeadingContext(heading, headings);
            if (sectionContent.length <= options.maxChunkSize) {
                // Section is small enough for a single chunk
                chunks.push({
                    id: `${notePath}#${heading.slug}`,
                    content: sectionContent,
                    startLine,
                    endLine,
                    headingContext,
                    chunkType: types_1.ChunkType.HEADING_SECTION
                });
            }
            else {
                // Split large sections into sub-chunks
                const subChunks = this.splitLargeSection(sectionContent, startLine, headingContext, notePath, heading.slug, options);
                chunks.push(...subChunks);
            }
        }
        return chunks;
    }

    /**
     * Create chunks based on paragraph boundaries (blank lines).
     *
     * A chunk is only flushed at a blank line once it has already reached
     * options.maxChunkSize, so emitted chunks can be longer than that value.
     *
     * @param {string[]} lines - Note content split into lines.
     * @param {string} notePath - Used as the prefix of every chunk id.
     * @param {object} options - Read for `minChunkSize`, `maxChunkSize`, `overlapSize`.
     * @returns {Array<object>} Paragraph chunks with ids `${notePath}#para${n}`.
     */
    static createParagraphChunks(lines, notePath, options) {
        const chunks = [];
        let currentChunk = '';
        let chunkStartLine = 0;
        let chunkIndex = 0;
        for (let i = 0; i < lines.length; i++) {
            const line = lines[i];
            // A natural break is an empty line reached with enough content collected.
            const atNaturalBreak = line.trim() === '' &&
                currentChunk.trim().length >= options.minChunkSize;
            if (atNaturalBreak && currentChunk.length >= options.maxChunkSize) {
                chunks.push({
                    id: `${notePath}#para${chunkIndex}`,
                    content: currentChunk.trim(),
                    startLine: chunkStartLine,
                    endLine: i - 1,
                    headingContext: [],
                    chunkType: types_1.ChunkType.PARAGRAPH
                });
                // Start new chunk with overlap, keeping the blank separator so
                // the overlap does not run into the next paragraph.
                const overlapContent = this.extractOverlap(currentChunk, options.overlapSize);
                currentChunk = overlapContent ? overlapContent + '\n' : '';
                chunkStartLine = overlapContent
                    ? Math.max(0, i - this.estimateOverlapLines(overlapContent))
                    : i + 1;
                chunkIndex++;
            }
            else {
                // Blank lines are appended too, so paragraph breaks inside a
                // chunk are preserved rather than merged away.
                currentChunk += line + '\n';
            }
        }
        // Add final chunk if there's content
        if (currentChunk.trim().length >= options.minChunkSize) {
            chunks.push({
                id: `${notePath}#para${chunkIndex}`,
                content: currentChunk.trim(),
                startLine: chunkStartLine,
                endLine: lines.length - 1,
                headingContext: [],
                chunkType: types_1.ChunkType.PARAGRAPH
            });
        }
        return chunks;
    }

    /**
     * Split a large section into chunks along blank-line paragraph boundaries.
     *
     * A single paragraph longer than options.maxChunkSize is not split further.
     * Line numbers are estimates derived from newline counts; endLine is inclusive.
     *
     * @param {string} content - Trimmed section text.
     * @param {number} startLine - Line at which the section starts.
     * @param {string[]} headingContext - Heading path attached to every chunk.
     * @param {string} notePath - Used as the prefix of every chunk id.
     * @param {string} headingSlug - Section slug used in chunk ids.
     * @param {object} options - Read for `minChunkSize`, `maxChunkSize`, `overlapSize`.
     * @returns {Array<object>} Chunks with ids `${notePath}#${headingSlug}-${n}`.
     */
    static splitLargeSection(content, startLine, headingContext, notePath, headingSlug, options) {
        const chunks = [];
        const paragraphs = content.split('\n\n');
        let currentChunk = '';
        let currentStartLine = startLine;
        let chunkIndex = 0;
        for (const paragraph of paragraphs) {
            // +2 accounts for the '\n\n' separator that would join the paragraph.
            if (currentChunk.length + paragraph.length + 2 > options.maxChunkSize) {
                if (currentChunk.trim().length >= options.minChunkSize) {
                    chunks.push({
                        id: `${notePath}#${headingSlug}-${chunkIndex}`,
                        content: currentChunk.trim(),
                        startLine: currentStartLine,
                        endLine: currentStartLine + this.estimateLines(currentChunk) - 1,
                        headingContext,
                        chunkType: types_1.ChunkType.HEADING_SECTION
                    });
                    chunkIndex++;
                }
                // Start new chunk with overlap. The start line must be advanced
                // using the chunk being closed, before currentChunk is replaced.
                const previousChunk = currentChunk;
                const overlapContent = this.extractOverlap(previousChunk, options.overlapSize);
                if (previousChunk) {
                    currentStartLine += overlapContent
                        // New chunk begins where the overlap begins inside the old chunk.
                        ? this.estimateLines(previousChunk) - this.estimateLines(overlapContent)
                        // No overlap: skip the old chunk and the blank separator line.
                        : this.estimateLines(previousChunk) + 1;
                }
                currentChunk = overlapContent + (overlapContent ? '\n\n' : '') + paragraph;
            }
            else {
                currentChunk += (currentChunk ? '\n\n' : '') + paragraph;
            }
        }
        // Add final chunk
        if (currentChunk.trim().length >= options.minChunkSize) {
            chunks.push({
                id: `${notePath}#${headingSlug}-${chunkIndex}`,
                content: currentChunk.trim(),
                startLine: currentStartLine,
                endLine: startLine + this.estimateLines(content) - 1,
                headingContext,
                chunkType: types_1.ChunkType.HEADING_SECTION
            });
        }
        return chunks;
    }

    /**
     * Build heading context: ancestor heading texts followed by the current
     * heading's text.
     *
     * Walks backwards from the current heading and collects headings that are
     * shallower than everything collected so far, stopping once the immediate
     * parent level (current level - 1) is reached.
     *
     * @param {object} currentHeading - Heading to build the context for; must be
     *   an element of allHeadings (looked up by identity).
     * @param {Array<object>} allHeadings - Headings in document order.
     * @returns {string[]} Ancestor texts (outermost first) then the current text.
     */
    static buildHeadingContext(currentHeading, allHeadings) {
        const context = [];
        // Shallowest level collected so far. A heading only counts as an
        // ancestor if it is shallower than this, which keeps earlier siblings
        // of an already-collected ancestor out of the path.
        let shallowestLevel = currentHeading.level;
        for (let i = allHeadings.indexOf(currentHeading) - 1; i >= 0; i--) {
            const heading = allHeadings[i];
            if (heading.level < shallowestLevel) {
                context.unshift(heading.text);
                shallowestLevel = heading.level;
                // Stop at immediate parent for this level
                if (heading.level === currentHeading.level - 1) {
                    break;
                }
            }
        }
        // Add current heading
        context.push(currentHeading.text);
        return context;
    }

    /**
     * Extract overlap content from the end of a chunk.
     *
     * @param {string} content - Text of the chunk being closed.
     * @param {number} overlapSize - Maximum overlap length in characters.
     * @returns {string} '' when overlapSize is not positive; the whole content
     *   when it fits within overlapSize; otherwise the last overlapSize
     *   characters, shortened to start after the last '. ' when that sentence
     *   end lies in the second half of the window.
     */
    static extractOverlap(content, overlapSize) {
        // slice(-0) is slice(0) and would return the whole string, so a
        // zero or negative overlap has to be handled explicitly.
        if (overlapSize <= 0) {
            return '';
        }
        if (content.length <= overlapSize) {
            return content;
        }
        // Try to break at sentence boundary within overlap
        const overlapText = content.slice(-overlapSize);
        const sentenceEnd = overlapText.lastIndexOf('. ');
        if (sentenceEnd > overlapSize * 0.5) {
            return overlapText.slice(sentenceEnd + 2);
        }
        return overlapText;
    }

    /**
     * Remove duplicate or very similar chunks, keeping the first occurrence.
     * Title chunks are never treated as duplicates of anything.
     *
     * @param {Array<object>} chunks - Chunks in priority order.
     * @param {object} options - Currently unused.
     * @returns {Array<object>} Chunks whose similarity to every kept chunk is at most 0.85.
     */
    static deduplicateChunks(chunks, options) {
        const uniqueChunks = [];
        for (const chunk of chunks) {
            const isDuplicate = uniqueChunks.some(existing => {
                // Skip title chunks in deduplication
                if (chunk.chunkType === types_1.ChunkType.TITLE ||
                    existing.chunkType === types_1.ChunkType.TITLE) {
                    return false;
                }
                // Check for high similarity (simple string comparison)
                const similarity = this.calculateStringSimilarity(chunk.content, existing.content);
                return similarity > 0.85;
            });
            if (!isDuplicate) {
                uniqueChunks.push(chunk);
            }
        }
        return uniqueChunks;
    }

    /**
     * Calculate simple string similarity: the Jaccard index of the two
     * strings' lower-cased, whitespace-separated word sets.
     *
     * @param {string} str1
     * @param {string} str2
     * @returns {number} Value in [0, 1]; 1 when neither string contains a word.
     */
    static calculateStringSimilarity(str1, str2) {
        // split(/\s+/) yields an empty token for leading/trailing whitespace
        // (and for an empty string); drop it so it is not counted as a word.
        const toWordSet = (text) => new Set(text.toLowerCase().split(/\s+/).filter(word => word !== ''));
        const words1 = toWordSet(str1);
        const words2 = toWordSet(str2);
        const union = new Set([...words1, ...words2]);
        if (union.size === 0) {
            return 1; // Two word-less strings are identical
        }
        let sharedCount = 0;
        for (const word of words1) {
            if (words2.has(word)) {
                sharedCount++;
            }
        }
        return sharedCount / union.size;
    }

    /**
     * Estimate number of lines in text (newline count + 1).
     *
     * @param {string} text
     * @returns {number}
     */
    static estimateLines(text) {
        return text.split('\n').length;
    }

    /**
     * Estimate lines needed for overlap content (never less than 1).
     *
     * @param {string} text
     * @returns {number}
     */
    static estimateOverlapLines(text) {
        return Math.max(1, text.split('\n').length);
    }
}
exports.ChunkingService = ChunkingService;
//# sourceMappingURL=ChunkingService.js.map