// brain-mcp
// Version: (not specified)
// Brain MCP Server - Semantic knowledge base access for Claude Code via Model Context Protocol. Provides intelligent search and navigation of files from multiple locations through native MCP tools.
// 320 lines • 12.8 kB
// JavaScript
;
/**
* Intelligent chunking service for markdown content
* Creates semantic chunks optimized for embedding and retrieval
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.ChunkingService = void 0;
const types_1 = require("../models/types");
/**
 * Splits markdown note content into chunks intended for embedding/retrieval.
 *
 * Every chunk produced by this class is a plain object of the shape
 * `{ id, content, startLine, endLine, headingContext, chunkType }`.
 * All sizes are measured in characters (`String.prototype.length`), not tokens.
 */
class ChunkingService {
    /**
     * Defaults that createChunks() merges beneath caller-supplied options.
     */
    static DEFAULT_OPTIONS = {
        maxChunkSize: 6000, // Upper bound per chunk; picked by the original author to stay under an 8192-token limit
        overlapSize: 600, // 10% of maxChunkSize
        preserveHeadings: true, // Not read by any method of this class
        minChunkSize: 100 // Pieces shorter than this are not emitted
    };

    /**
     * Cut `content` into consecutive, overlapping windows of at most
     * `maxChunkSize` characters, preferring to end a window on a space.
     *
     * Each window starts `overlapSize` characters before the end of the
     * previous one, so no part of `content` falls between two chunks.
     *
     * @param {string} content - Full note text.
     * @param {string} title - Unused; kept for interface compatibility.
     * @param {Array} headings - Unused; kept for interface compatibility.
     * @param {string} notePath - Prefix for chunk ids (`${notePath}#chunkN`).
     * @param {object} [options] - Overrides for DEFAULT_OPTIONS.
     * @returns {Array<object>} Chunks of type PARAGRAPH; startLine/endLine are always 0.
     */
    static createChunks(content, title, headings, notePath, options = {}) {
        const opts = { ...this.DEFAULT_OPTIONS, ...options };
        const chunks = [];
        if (typeof content !== 'string' || content.length === 0) {
            return chunks;
        }
        const maxChunks = 10000; // Safety limit on the number of emitted chunks
        let chunkIndex = 0;
        let position = 0;
        let previousEnd = 0;
        while (position < content.length && chunkIndex < maxChunks) {
            let chunkEnd = Math.min(position + opts.maxChunkSize, content.length);
            // If not at the end, try to break at a word boundary. The boundary must
            // lie beyond the previous chunk's end so every chunk covers new text.
            if (chunkEnd < content.length) {
                const lastSpace = content.lastIndexOf(' ', chunkEnd);
                if (lastSpace > position + opts.minChunkSize && lastSpace > previousEnd) {
                    chunkEnd = lastSpace;
                }
            }
            const chunkContent = content.substring(position, chunkEnd).trim();
            if (chunkContent.length >= opts.minChunkSize) {
                chunks.push({
                    id: `${notePath}#chunk${chunkIndex}`,
                    content: chunkContent,
                    startLine: 0, // Line numbers not meaningful with this approach
                    endLine: 0,
                    headingContext: [],
                    chunkType: types_1.ChunkType.PARAGRAPH
                });
                chunkIndex++;
            }
            if (!(chunkEnd < content.length)) {
                break;
            }
            previousEnd = chunkEnd;
            // Step back from where this chunk actually ended (not from the nominal
            // window end), otherwise text after an early word break would be skipped.
            let nextPosition = chunkEnd - opts.overlapSize;
            if (!(nextPosition > position)) {
                nextPosition = chunkEnd; // Overlap would not advance: continue without overlap
            }
            if (!(nextPosition > position)) {
                nextPosition = position + 1; // Degenerate options: still guarantee progress
            }
            position = nextPosition;
        }
        return chunks;
    }

    /**
     * Build a chunk made of the title plus the first paragraph of the note.
     *
     * A leading frontmatter block (a `---` line as the first non-blank line,
     * closed by a later `---` line) and one leading `#` heading line are skipped.
     * Note that a document opening with a `---` rule and containing a later
     * `---` line is indistinguishable from frontmatter here.
     *
     * @param {string} content - Full note text.
     * @param {string} title - Note title; becomes the first line of the chunk.
     * @param {string} notePath - Prefix for the chunk id (`${notePath}#title`).
     * @returns {object|null} TITLE chunk, or null when the first paragraph has
     *   fewer than 50 characters. `endLine` is the index of the line following
     *   the collected paragraph text.
     */
    static createTitleChunk(content, title, notePath) {
        const lines = content.split('\n');
        const isBlank = (line) => line.trim() === '';
        const isFence = (line) => line.trim() === '---';
        let startLine = 0;
        while (startLine < lines.length && isBlank(lines[startLine])) {
            startLine++;
        }
        // Skip the whole frontmatter block, not only its fence lines
        if (startLine < lines.length && isFence(lines[startLine])) {
            const openingLine = startLine;
            const closingLine = lines.findIndex((line, index) => index > openingLine && isFence(line));
            if (closingLine !== -1) {
                startLine = closingLine + 1;
            }
        }
        while (startLine < lines.length && (isBlank(lines[startLine]) || isFence(lines[startLine]))) {
            startLine++;
        }
        // Skip title line if it exists
        if (startLine < lines.length && lines[startLine].startsWith('#')) {
            startLine++;
        }
        // Collect first meaningful paragraph
        let firstParagraph = '';
        let endLine = lines.length; // Used when the paragraph runs to the end of the file
        for (let i = startLine; i < lines.length; i++) {
            const line = lines[i].trim();
            if (line === '') {
                if (firstParagraph.trim().length > 0) {
                    endLine = i;
                    break;
                }
                continue;
            }
            firstParagraph += line + ' ';
            // Stop if we have enough content
            if (firstParagraph.length > 300) {
                endLine = i + 1;
                break;
            }
        }
        if (firstParagraph.trim().length < 50) {
            return null; // Not enough meaningful content
        }
        return {
            id: `${notePath}#title`,
            content: `${title}\n\n${firstParagraph.trim()}`,
            startLine: 0,
            endLine,
            headingContext: [title],
            chunkType: types_1.ChunkType.TITLE
        };
    }

    /**
     * Create one chunk per heading section; oversized sections are delegated
     * to splitLargeSection().
     *
     * A section spans from its heading's `lineNumber` (used as a 0-based index
     * into `lines`) to the line before the next heading, or to the last line.
     * Lines before the first heading are not part of any section.
     *
     * @param {string[]} lines - Note content split into lines.
     * @param {Array<{lineNumber: number, level: number, text: string, slug: string}>} headings
     * @param {string} notePath - Prefix for chunk ids (`${notePath}#${slug}`).
     * @param {{minChunkSize: number, maxChunkSize: number, overlapSize: number}} options
     * @returns {Array<object>} HEADING_SECTION chunks with inclusive line ranges.
     */
    static createHeadingChunks(lines, headings, notePath, options) {
        const chunks = [];
        for (let i = 0; i < headings.length; i++) {
            const heading = headings[i];
            const nextHeading = i + 1 < headings.length ? headings[i + 1] : null;
            const startLine = heading.lineNumber;
            const endLine = nextHeading ? nextHeading.lineNumber - 1 : lines.length - 1;
            const sectionContent = lines.slice(startLine, endLine + 1).join('\n').trim();
            if (sectionContent.length < options.minChunkSize) {
                continue; // Skip very small sections
            }
            const headingContext = this.buildHeadingContext(heading, headings);
            if (sectionContent.length <= options.maxChunkSize) {
                chunks.push({
                    id: `${notePath}#${heading.slug}`,
                    content: sectionContent,
                    startLine,
                    endLine,
                    headingContext,
                    chunkType: types_1.ChunkType.HEADING_SECTION
                });
            }
            else {
                chunks.push(...this.splitLargeSection(sectionContent, startLine, headingContext, notePath, heading.slug, options));
            }
        }
        return chunks;
    }

    /**
     * Pack blank-line-separated paragraphs into chunks.
     *
     * Paragraphs are joined with a blank line. A chunk is closed *before* a
     * paragraph that would push it past `maxChunkSize`, provided the chunk
     * already has at least `minChunkSize` characters; otherwise the paragraph
     * is merged so no text is dropped. A single paragraph longer than
     * `maxChunkSize` is not split.
     *
     * @param {string[]} lines - Note content split into lines.
     * @param {string} notePath - Prefix for chunk ids (`${notePath}#paraN`).
     * @param {{minChunkSize: number, maxChunkSize: number, overlapSize: number}} options
     * @returns {Array<object>} PARAGRAPH chunks; line numbers are 0-based indexes
     *   into `lines`, `endLine` inclusive (the final chunk ends at the last line).
     */
    static createParagraphChunks(lines, notePath, options) {
        const chunks = [];
        let chunkIndex = 0;
        let pending = '';
        let pendingStartLine = 0;
        let pendingEndLine = 0;
        const emit = (endLine) => {
            chunks.push({
                id: `${notePath}#para${chunkIndex}`,
                content: pending.trim(),
                startLine: pendingStartLine,
                endLine,
                headingContext: [],
                chunkType: types_1.ChunkType.PARAGRAPH
            });
            chunkIndex++;
        };
        let lineIndex = 0;
        while (lineIndex < lines.length) {
            if (lines[lineIndex].trim() === '') {
                lineIndex++;
                continue;
            }
            // Consume one paragraph: a run of consecutive non-blank lines
            const paragraphStart = lineIndex;
            while (lineIndex < lines.length && lines[lineIndex].trim() !== '') {
                lineIndex++;
            }
            const paragraph = lines.slice(paragraphStart, lineIndex).join('\n');
            const wouldOverflow = pending.length + 2 + paragraph.length > options.maxChunkSize;
            if (pending === '') {
                pending = paragraph;
                pendingStartLine = paragraphStart;
            }
            else if (wouldOverflow && pending.trim().length >= options.minChunkSize) {
                emit(pendingEndLine);
                // Start new chunk with overlap taken from the end of the emitted one
                const overlap = this.extractOverlap(pending, options.overlapSize);
                if (overlap) {
                    pendingStartLine = Math.max(0, pendingEndLine + 1 - this.estimateOverlapLines(overlap));
                    pending = `${overlap}\n\n${paragraph}`;
                }
                else {
                    pendingStartLine = paragraphStart;
                    pending = paragraph;
                }
            }
            else {
                pending += `\n\n${paragraph}`;
            }
            pendingEndLine = lineIndex - 1;
        }
        // Add final chunk if there's content
        if (pending.trim().length >= options.minChunkSize) {
            emit(lines.length - 1);
        }
        return chunks;
    }

    /**
     * Split one oversized heading section into chunks along `\n\n` boundaries.
     *
     * Line numbers are derived from the position of each paragraph inside
     * `content` (offset by `startLine`); `endLine` is inclusive, matching
     * createHeadingChunks(). The start line of a chunk that begins with
     * overlap text is estimated from the overlap's line count.
     *
     * @param {string} content - Section text.
     * @param {number} startLine - Line number of the first line of `content`.
     * @param {string[]} headingContext - Copied onto every produced chunk.
     * @param {string} notePath - Prefix for chunk ids.
     * @param {string} headingSlug - Ids are `${notePath}#${headingSlug}-N`.
     * @param {{minChunkSize: number, maxChunkSize: number, overlapSize: number}} options
     * @returns {Array<object>} HEADING_SECTION chunks.
     */
    static splitLargeSection(content, startLine, headingContext, notePath, headingSlug, options) {
        const chunks = [];
        let chunkIndex = 0;
        let pending = '';
        let pendingStartLine = startLine;
        let pendingEndLine = startLine;
        let nextParagraphLine = startLine;
        const emit = () => {
            chunks.push({
                id: `${notePath}#${headingSlug}-${chunkIndex}`,
                content: pending.trim(),
                startLine: pendingStartLine,
                endLine: pendingEndLine,
                headingContext,
                chunkType: types_1.ChunkType.HEADING_SECTION
            });
            chunkIndex++;
        };
        for (const paragraph of content.split('\n\n')) {
            const paragraphStart = nextParagraphLine;
            const paragraphEnd = paragraphStart + this.estimateLines(paragraph) - 1;
            nextParagraphLine = paragraphEnd + 2; // +1 for the blank separator line
            if (pending.length + paragraph.length + 2 > options.maxChunkSize) {
                // A pending chunk too small to emit is carried over whole so none of it is lost
                let carried = pending;
                if (pending.trim().length >= options.minChunkSize) {
                    emit();
                    carried = this.extractOverlap(pending, options.overlapSize);
                    if (carried) {
                        pendingStartLine = Math.max(startLine, pendingEndLine + 1 - this.estimateLines(carried));
                    }
                }
                if (carried) {
                    pending = `${carried}\n\n${paragraph}`;
                }
                else {
                    pending = paragraph;
                    pendingStartLine = paragraphStart;
                }
            }
            else {
                if (!pending) {
                    pendingStartLine = paragraphStart;
                }
                pending = pending ? `${pending}\n\n${paragraph}` : paragraph;
            }
            pendingEndLine = paragraphEnd;
        }
        // Add final chunk
        if (pending.trim().length >= options.minChunkSize) {
            emit();
        }
        return chunks;
    }

    /**
     * Build the heading context for `currentHeading`: ancestor heading texts
     * followed by the heading's own text.
     *
     * Walking backwards, a heading counts as an ancestor only if it is
     * shallower than the most recently accepted ancestor, so headings from an
     * unrelated earlier branch are not included.
     *
     * NOTE(review): as in the original, the walk stops as soon as a heading
     * exactly one level above the current one is found, so a properly nested
     * H3 yields [H2, H3] rather than the full path - confirm this is intended.
     *
     * @param {{level: number, text: string}} currentHeading - Must be an element of `allHeadings`.
     * @param {Array<{level: number, text: string}>} allHeadings - Headings in document order.
     * @returns {string[]}
     */
    static buildHeadingContext(currentHeading, allHeadings) {
        const context = [];
        let ceiling = currentHeading.level;
        for (let i = allHeadings.indexOf(currentHeading) - 1; i >= 0; i--) {
            const heading = allHeadings[i];
            if (heading.level < ceiling) {
                context.unshift(heading.text);
                // Stop at immediate parent for this level
                if (heading.level === currentHeading.level - 1) {
                    break;
                }
                ceiling = heading.level;
            }
        }
        context.push(currentHeading.text);
        return context;
    }

    /**
     * Return the trailing part of `content` to repeat at the start of the
     * next chunk.
     *
     * At most `overlapSize` characters are returned. If the overlap window
     * contains a sentence boundary (`". "`) early enough that at least half
     * of the window remains after it, the overlap starts at that sentence;
     * otherwise the raw window is returned.
     *
     * @param {string} content
     * @param {number} overlapSize
     * @returns {string} Empty string when `overlapSize` is not positive.
     */
    static extractOverlap(content, overlapSize) {
        if (!(overlapSize > 0)) {
            return ''; // slice(-0) would otherwise return the entire content
        }
        if (content.length <= overlapSize) {
            return content;
        }
        const overlapText = content.slice(-overlapSize);
        const sentenceEnd = overlapText.indexOf('. ');
        if (sentenceEnd !== -1 && sentenceEnd + 2 <= overlapSize * 0.5) {
            return overlapText.slice(sentenceEnd + 2);
        }
        return overlapText;
    }

    /**
     * Drop chunks whose word-set similarity to an already kept chunk exceeds
     * 0.85. TITLE chunks are always kept and never compared against.
     *
     * @param {Array<object>} chunks
     * @param {object} options - Unused; kept for interface compatibility.
     * @returns {Array<object>} Kept chunks in their original order.
     */
    static deduplicateChunks(chunks, options) {
        const uniqueChunks = [];
        const keptWordSets = []; // Word sets of the kept non-title chunks, built once each
        for (const chunk of chunks) {
            if (chunk.chunkType === types_1.ChunkType.TITLE) {
                uniqueChunks.push(chunk);
                continue;
            }
            const words = ChunkingService.#toWordSet(chunk.content);
            const isDuplicate = keptWordSets.some((kept) => ChunkingService.#jaccard(words, kept) > 0.85);
            if (!isDuplicate) {
                uniqueChunks.push(chunk);
                keptWordSets.push(words);
            }
        }
        return uniqueChunks;
    }

    /**
     * Jaccard index of the case-insensitive, whitespace-separated word sets
     * of two strings.
     *
     * @param {string} str1
     * @param {string} str2
     * @returns {number} Value in [0, 1]; 1 when neither string contains a word.
     */
    static calculateStringSimilarity(str1, str2) {
        return ChunkingService.#jaccard(ChunkingService.#toWordSet(str1), ChunkingService.#toWordSet(str2));
    }

    /**
     * Number of `\n`-separated lines in `text` (1 for an empty string).
     */
    static estimateLines(text) {
        return text.split('\n').length;
    }

    /**
     * Number of lines spanned by overlap text, never less than 1.
     */
    static estimateOverlapLines(text) {
        return Math.max(1, text.split('\n').length);
    }

    /** Lower-cased set of whitespace-separated words; empty tokens are discarded. */
    static #toWordSet(text) {
        return new Set(text.toLowerCase().split(/\s+/).filter(Boolean));
    }

    /** Jaccard index |A ∩ B| / |A ∪ B| of two sets; two empty sets count as identical. */
    static #jaccard(setA, setB) {
        if (setA.size === 0 && setB.size === 0) {
            return 1;
        }
        let shared = 0;
        for (const word of setA) {
            if (setB.has(word)) {
                shared++;
            }
        }
        return shared / (setA.size + setB.size - shared);
    }
}
exports.ChunkingService = ChunkingService;
//# sourceMappingURL=ChunkingService.js.map