@knath2000/codebase-indexing-mcp
Version:
MCP server for codebase indexing with Voyage AI embeddings and Qdrant vector storage
872 lines • 36.4 kB
JavaScript
import Parser from 'tree-sitter';
import { readFileSync } from 'fs';
import { extname } from 'path';
import { createHash } from 'crypto';
import { ChunkType } from '../types.js';
// Dynamic imports for tree-sitter language grammars with error handling
/**
 * Dynamically load the Tree-sitter grammar for a supported language.
 *
 * The five grammar packages expose their grammar object under differing
 * export shapes (CJS default export, named export, or nested under the
 * default export), so each registry entry carries a `pick` function that
 * probes the known locations. This replaces five near-identical
 * copy-pasted switch cases with one table-driven path.
 *
 * @param {string} language - 'javascript' | 'typescript' | 'tsx' | 'python' | 'markdown'
 * @returns {Promise<object>} The grammar object to pass to Parser.setLanguage().
 * @throws {Error} When the language is unsupported, the package fails to
 *     import, or no grammar object is found in the module.
 */
const loadLanguage = async (language) => {
    // Registry: npm package to import + how to locate the grammar in it.
    // `typeof m === 'function'` guards against CJS interop where the module
    // namespace itself is the grammar factory.
    const grammarSources = {
        javascript: {
            pkg: 'tree-sitter-javascript',
            pick: (m) => m.default ?? (typeof m === 'function' ? m : undefined) ?? m.javascript
        },
        typescript: {
            // The typescript package bundles both grammars; default export
            // may nest them as { typescript, tsx }.
            pkg: 'tree-sitter-typescript',
            pick: (m) => m.typescript ?? m.default?.typescript
        },
        tsx: {
            pkg: 'tree-sitter-typescript',
            pick: (m) => m.tsx ?? m.default?.tsx
        },
        python: {
            pkg: 'tree-sitter-python',
            pick: (m) => m.default ?? (typeof m === 'function' ? m : undefined) ?? m.python
        },
        markdown: {
            pkg: 'tree-sitter-markdown',
            pick: (m) => m.default ?? (typeof m === 'function' ? m : undefined) ?? m.markdown
        }
    };
    try {
        const source = grammarSources[language];
        if (!source) {
            throw new Error(`Language parser not available for: ${language}`);
        }
        const mod = await import(source.pkg);
        // Diagnostic trace of the module's export shape (kept from original).
        console.log(`${language} module loaded:`, {
            hasDefault: !!mod.default,
            keys: Object.keys(mod),
            defaultType: typeof mod.default,
            defaultKeys: mod.default && typeof mod.default === 'object' ? Object.keys(mod.default) : []
        });
        const grammar = source.pick(mod);
        if (!grammar) {
            throw new Error(`${language} grammar not found in module. Available: ${Object.keys(mod)}, Default: ${mod.default ? Object.keys(mod.default) : 'none'}`);
        }
        return grammar;
    }
    catch (error) {
        // Log for visibility, but let callers decide how to degrade.
        console.warn(`Failed to load Tree-sitter language grammar for ${language}:`, error);
        throw error;
    }
};
export class CodeParser {
constructor() {
    // Single reusable Tree-sitter parser; setLanguage() is called before each
    // parse. NOTE(review): the instance is shared across async parseFile /
    // parseContent calls, so concurrent parses could interleave
    // setLanguage()/parse() — confirm callers parse files sequentially.
    this.parser = new Parser();
    // language name -> chunking configuration (populated below)
    this.languageConfigs = new Map();
    this.initializeLanguageConfigs();
}
/**
 * Parse a file from disk and extract code chunks.
 *
 * Fallback ladder:
 *   1. Unknown extension -> generic line-window chunking.
 *   2. Known language -> Tree-sitter parse via its grammar.
 *   3. Grammar yields zero chunks -> append generic chunks so every file
 *      is represented in the index.
 *   4. Grammar load or parse throws -> generic chunking.
 *
 * @param {string} filePath - Path read with readFileSync; also recorded in chunk metadata.
 * @returns {Promise<Array>} Extracted chunk objects.
 */
async parseFile(filePath) {
    const content = readFileSync(filePath, 'utf-8');
    const language = this.getLanguageFromFile(filePath);
    console.log(`[DEBUG] Parsing file: ${filePath} (language: ${language})`);
    if (!language) {
        // Unsupported extension: line-window chunking only.
        const genericChunks = this.parseGenericFile(filePath, content);
        console.log(`[DEBUG] [${filePath}] No language detected. Generic chunk count: ${genericChunks.length}`);
        return genericChunks;
    }
    try {
        const grammar = await loadLanguage(language);
        if (!grammar) {
            throw new Error(`Grammar is null or undefined for language: ${language}`);
        }
        this.parser.setLanguage(grammar);
        const tree = this.parser.parse(content);
        const chunks = this.extractChunks(tree.rootNode, content, filePath, language);
        console.log(`[DEBUG] [${filePath}] Chunks extracted with language parser: ${chunks.length}`);
        if (chunks.length === 0) {
            // Fallback: generic chunking to ensure every file is represented
            const genericChunks = this.parseGenericContent(content, filePath);
            console.log(`[DEBUG] [${filePath}] Fallback to generic chunking. Generic chunk count: ${genericChunks.length}`);
            chunks.push(...genericChunks);
        }
        return chunks;
    }
    catch (error) {
        // Grammar load or parse failure: degrade gracefully to generic chunks.
        console.warn(`[DEBUG] [${filePath}] Failed to parse with ${language} parser, falling back to generic:`, error);
        const genericChunks = this.parseGenericFile(filePath, content);
        console.log(`[DEBUG] [${filePath}] Exception fallback. Generic chunk count: ${genericChunks.length}`);
        return genericChunks;
    }
}
/**
* Parse content directly without file I/O
*/
async parseContent(content, filePath, language) {
const detectedLanguage = language || this.getLanguageFromFile(filePath);
if (!detectedLanguage) {
return this.parseGenericContent(content, filePath);
}
try {
const grammar = await loadLanguage(detectedLanguage);
this.parser.setLanguage(grammar);
const tree = this.parser.parse(content);
const chunks = this.extractChunks(tree.rootNode, content, filePath, detectedLanguage);
if (chunks.length === 0) {
// Fallback: generic chunking to ensure every file is represented
const genericChunks = this.parseGenericContent(content, filePath);
chunks.push(...genericChunks);
}
return chunks;
}
catch (error) {
console.warn(`Failed to parse content with ${detectedLanguage} parser, falling back to generic:`, error);
return this.parseGenericContent(content, filePath);
}
}
/**
* Extract code chunks from a tree-sitter node
*/
extractChunks(node, content, filePath, language) {
const chunks = [];
const config = this.languageConfigs.get(language);
if (!config) {
return this.parseGenericContent(content, filePath);
}
const lines = content.split('\n');
this.traverseNode(node, content, filePath, language, config, chunks, lines);
// Include any deferred sub-chunks generated for large nodes
if (this._deferredChunks && this._deferredChunks.length > 0) {
chunks.push(...this._deferredChunks);
delete this._deferredChunks;
}
return chunks;
}
/**
* Recursively traverse tree-sitter nodes
*/
traverseNode(node, content, filePath, language, config, chunks, lines) {
// Check if this node matches any chunk strategies
for (const strategy of config.chunkStrategies) {
if (node.type === strategy.nodeType) {
const chunk = this.createChunkFromNode(node, content, filePath, language, strategy, lines);
if (chunk) {
chunks.push(chunk);
}
}
}
// Recursively process children
for (const child of node.children) {
this.traverseNode(child, content, filePath, language, config, chunks, lines);
}
}
/**
* Create a code chunk from a tree-sitter node
*/
createChunkFromNode(node, content, filePath, language, strategy, _lines) {
const startLine = node.startPosition.row + 1;
const endLine = node.endPosition.row + 1;
let chunkContent = node.text;
// Privacy-focused chunk size enforcement (100-1000 characters)
const MIN_CHUNK_SIZE = 100;
const MAX_CHUNK_SIZE = 1000;
// Skip if chunk is too small
if (chunkContent.length < MIN_CHUNK_SIZE) {
return null;
}
// Truncate if chunk is too large (privacy protection)
if (chunkContent.length > MAX_CHUNK_SIZE) {
chunkContent = chunkContent.substring(0, MAX_CHUNK_SIZE);
console.log(`🔒 Privacy: Truncated chunk in ${filePath}:${startLine} to ${MAX_CHUNK_SIZE} chars`);
}
// Apply strategy-specific size limits (secondary validation)
if (strategy.minSize && chunkContent.length < strategy.minSize) {
return null;
}
if (strategy.maxSize && chunkContent.length > strategy.maxSize) {
chunkContent = chunkContent.substring(0, strategy.maxSize);
}
// Extract name if strategy provides name extractor
let name;
if (strategy.nameExtractor) {
name = strategy.nameExtractor(this.nodeToParser(node));
}
const metadata = {
fileSize: content.length,
lastModified: Date.now(),
language,
extension: extname(filePath),
relativePath: filePath,
isTest: this.isTestFile(filePath),
complexity: this.calculateComplexity(chunkContent),
dependencies: this.extractDependencies(chunkContent, language),
exports: this.extractExports(chunkContent, language),
imports: this.extractImports(chunkContent, language)
};
// If the chunk is larger than MAX_CHUNK_SIZE, split it into multiple overlapping sub-chunks
if (chunkContent.length > MAX_CHUNK_SIZE) {
const SUB_CHUNK_OVERLAP = 100; // 100 char overlap for context preservation
const subChunks = [];
for (let offset = 0; offset < chunkContent.length; offset += MAX_CHUNK_SIZE - SUB_CHUNK_OVERLAP) {
const subContent = chunkContent.slice(offset, offset + MAX_CHUNK_SIZE);
if (subContent.length < MIN_CHUNK_SIZE) {
continue; // skip tiny trailing slice
}
// Estimate line numbers inside the parent chunk for metadata
const offsetLines = chunkContent.slice(0, offset).split('\n').length - 1;
const subStartLine = startLine + offsetLines;
const subEndLine = Math.min(subStartLine + subContent.split('\n').length - 1, endLine);
const subMetadata = {
...metadata,
complexity: this.calculateComplexity(subContent)
};
subChunks.push({
id: this.generateChunkId(filePath, subStartLine, subEndLine, strategy.chunkType),
content: subContent,
filePath,
language,
startLine: subStartLine,
endLine: subEndLine,
chunkType: strategy.chunkType,
functionName: strategy.chunkType === ChunkType.FUNCTION ? name : undefined,
className: strategy.chunkType === ChunkType.CLASS ? name : undefined,
moduleName: strategy.chunkType === ChunkType.MODULE ? name : undefined,
contentHash: this.generateContentHash(subContent),
metadata: subMetadata
});
}
// Return null here; traverseNode will handle pushing subChunks separately
this._deferredChunks = (this._deferredChunks || []).concat(subChunks);
return null;
}
return {
id: this.generateChunkId(filePath, startLine, endLine, strategy.chunkType),
content: chunkContent,
filePath,
language,
startLine,
endLine,
chunkType: strategy.chunkType,
functionName: strategy.chunkType === ChunkType.FUNCTION ? name : undefined,
className: strategy.chunkType === ChunkType.CLASS ? name : undefined,
moduleName: strategy.chunkType === ChunkType.MODULE ? name : undefined,
contentHash: this.generateContentHash(chunkContent),
metadata
};
}
/**
 * Parse generic files (non-code files or unsupported languages).
 * Thin wrapper over parseGenericContent with the argument order swapped.
 */
parseGenericFile(filePath, content) {
    return this.parseGenericContent(content, filePath);
}
/**
 * Split arbitrary text into fixed-size line windows.
 *
 * Windows are 50 lines with a 5-line overlap between consecutive chunks.
 * Chunks under 100 characters are skipped; chunks over 1000 characters are
 * truncated. Markdown files are routed to the structure-aware fallback
 * parser instead of windowed.
 *
 * @param {string} content - Full file text.
 * @param {string} filePath - Used for markdown detection, ids, and metadata.
 * @returns {Array} Chunks with chunkType GENERIC and language 'text'.
 */
parseGenericContent(content, filePath) {
    const ext = extname(filePath).toLowerCase();
    // Special handling for markdown files without tree-sitter
    if (ext === '.md' || ext === '.markdown') {
        return this.parseMarkdownContentFallback(content, filePath);
    }
    const chunks = [];
    const lines = content.split('\n');
    const chunkSize = 50; // Lines per chunk
    const overlap = 5;
    // Privacy-focused chunk size enforcement
    const MIN_CHUNK_SIZE = 100;
    const MAX_CHUNK_SIZE = 1000;
    // Step by (chunkSize - overlap) = 45 lines so neighbors share 5 lines.
    for (let i = 0; i < lines.length; i += chunkSize - overlap) {
        const chunkLines = lines.slice(i, i + chunkSize);
        const startLine = i + 1;
        const endLine = Math.min(i + chunkSize, lines.length);
        let chunkContent = chunkLines.join('\n');
        if (chunkContent.trim().length === 0) {
            continue;
        }
        // Privacy protection: enforce size limits
        if (chunkContent.length < MIN_CHUNK_SIZE) {
            continue; // Skip chunks that are too small
        }
        if (chunkContent.length > MAX_CHUNK_SIZE) {
            chunkContent = chunkContent.substring(0, MAX_CHUNK_SIZE);
            console.log(`🔒 Privacy: Truncated generic chunk in ${filePath}:${startLine} to ${MAX_CHUNK_SIZE} chars`);
        }
        const metadata = {
            fileSize: content.length,
            lastModified: Date.now(),
            language: 'text',
            extension: extname(filePath),
            relativePath: filePath,
            isTest: this.isTestFile(filePath)
        };
        chunks.push({
            id: this.generateChunkId(filePath, startLine, endLine, ChunkType.GENERIC),
            content: chunkContent,
            filePath,
            language: 'text',
            startLine,
            endLine,
            chunkType: ChunkType.GENERIC,
            contentHash: this.generateContentHash(chunkContent),
            metadata
        });
    }
    return chunks;
}
/**
* Parse markdown content without tree-sitter (fallback)
*/
parseMarkdownContentFallback(content, filePath) {
const chunks = [];
const lines = content.split('\n');
let currentChunk = '';
let currentStartLine = 1;
let currentChunkType = ChunkType.PARAGRAPH;
let currentName = '';
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const lineNumber = i + 1;
// Check for ATX headings (# ## ### etc)
const atxHeadingMatch = line.match(/^(#{1,6})\s*(.+?)(?:\s*#*)?$/);
if (atxHeadingMatch) {
// Save previous chunk if it exists
if (currentChunk.trim()) {
chunks.push(this.createMarkdownChunk(currentChunk.trim(), filePath, currentStartLine, lineNumber - 1, currentChunkType, currentName, content));
}
// Start new section chunk
currentChunk = line;
currentStartLine = lineNumber;
currentChunkType = ChunkType.SECTION;
currentName = atxHeadingMatch[2].trim();
continue;
}
// Check for setext headings (underlined with = or -)
if (i > 0 && line.match(/^[=\-]{3,}$/)) {
const prevLine = lines[i - 1];
if (prevLine.trim()) {
// Save previous chunk if it exists and is not the heading line
if (currentChunk.trim() && currentChunk.trim() !== prevLine.trim()) {
chunks.push(this.createMarkdownChunk(currentChunk.trim(), filePath, currentStartLine, lineNumber - 2, currentChunkType, currentName, content));
}
// Create section chunk with heading and underline
const sectionContent = prevLine + '\n' + line;
chunks.push(this.createMarkdownChunk(sectionContent, filePath, lineNumber - 1, lineNumber, ChunkType.SECTION, prevLine.trim(), content));
currentChunk = '';
currentStartLine = lineNumber + 1;
currentChunkType = ChunkType.PARAGRAPH;
currentName = '';
continue;
}
}
// Check for fenced code blocks
if (line.match(/^```/)) {
// Save previous chunk if it exists
if (currentChunk.trim()) {
chunks.push(this.createMarkdownChunk(currentChunk.trim(), filePath, currentStartLine, lineNumber - 1, currentChunkType, currentName, content));
}
// Find the end of the code block
const langMatch = line.match(/^```\s*([a-zA-Z0-9_+-]*)/);
const language = langMatch && langMatch[1] ? langMatch[1] : 'code';
let codeBlockContent = line + '\n';
let j = i + 1;
while (j < lines.length && !lines[j].match(/^```\s*$/)) {
codeBlockContent += lines[j] + '\n';
j++;
}
if (j < lines.length) {
codeBlockContent += lines[j]; // Add closing ```
}
chunks.push(this.createMarkdownChunk(codeBlockContent.trim(), filePath, lineNumber, j + 1, ChunkType.CODE_BLOCK, language, content));
i = j; // Skip to after the code block
currentChunk = '';
currentStartLine = j + 2;
currentChunkType = ChunkType.PARAGRAPH;
currentName = '';
continue;
}
// Add line to current chunk
currentChunk += line + '\n';
}
// Save final chunk if it exists
if (currentChunk.trim()) {
chunks.push(this.createMarkdownChunk(currentChunk.trim(), filePath, currentStartLine, lines.length, currentChunkType, currentName, content));
}
return chunks;
}
/**
 * Create a markdown chunk with proper metadata.
 *
 * @returns {object|null} The chunk, or null when the content is shorter
 *     than the 100-character privacy minimum. Callers MUST discard a null
 *     result — it is not filtered here.
 */
createMarkdownChunk(content, filePath, startLine, endLine, chunkType, name, fullContent) {
    // Privacy-focused chunk size enforcement
    const MIN_CHUNK_SIZE = 100;
    const MAX_CHUNK_SIZE = 1000;
    // Too small to index: signal the caller to drop it.
    if (content.length < MIN_CHUNK_SIZE) {
        return null;
    }
    // Truncate if content is too large (privacy protection).
    // Note: reassigns the `content` parameter in place.
    if (content.length > MAX_CHUNK_SIZE) {
        content = content.substring(0, MAX_CHUNK_SIZE);
        console.log(`🔒 Privacy: Truncated markdown chunk in ${filePath}:${startLine} to ${MAX_CHUNK_SIZE} chars`);
    }
    const metadata = {
        fileSize: fullContent.length,
        lastModified: Date.now(),
        language: 'markdown',
        extension: extname(filePath),
        relativePath: filePath,
        isTest: this.isTestFile(filePath),
        complexity: this.calculateComplexity(content)
    };
    return {
        id: this.generateChunkId(filePath, startLine, endLine, chunkType),
        content,
        filePath,
        language: 'markdown',
        startLine,
        endLine,
        chunkType,
        // `name` doubles as code-block language or section title depending on type.
        functionName: chunkType === ChunkType.CODE_BLOCK ? name : undefined,
        className: chunkType === ChunkType.SECTION ? name : undefined,
        moduleName: undefined,
        contentHash: this.generateContentHash(content),
        metadata
    };
}
/**
* Get language from file extension
*/
getLanguageFromFile(filePath) {
const ext = extname(filePath).toLowerCase();
const languageMap = {
'.js': 'javascript',
'.jsx': 'javascript',
'.ts': 'typescript',
'.tsx': 'tsx',
'.py': 'python',
'.md': 'markdown',
'.markdown': 'markdown'
};
return languageMap[ext] || null;
}
/**
* Convert tree-sitter node to our ParsedNode interface
*/
nodeToParser(node) {
return {
type: node.type,
startPosition: {
row: node.startPosition.row,
column: node.startPosition.column
},
endPosition: {
row: node.endPosition.row,
column: node.endPosition.column
},
text: node.text,
depth: this.calculateNodeDepth(node),
children: node.children.map(child => this.nodeToParser(child))
};
}
/**
* Calculate node depth in the AST
*/
calculateNodeDepth(node) {
let depth = 0;
let current = node.parent;
while (current) {
depth++;
current = current.parent;
}
return depth;
}
/**
 * SHA-256 hex digest of the chunk content, used for change detection.
 */
generateContentHash(content) {
    return createHash('sha256').update(content).digest('hex');
}
/**
* Generate unique chunk ID compatible with Qdrant (UUID format)
*/
generateChunkId(filePath, startLine, endLine, chunkType) {
// Create a deterministic UUID based on the chunk data
const input = `${filePath}:${startLine}:${endLine}:${chunkType}`;
const hash = this.simpleHash(input);
// Convert hash to UUID format (8-4-4-4-12 hex digits)
const hex = hash.padStart(32, '0').substring(0, 32);
return `${hex.substring(0, 8)}-${hex.substring(8, 12)}-${hex.substring(12, 16)}-${hex.substring(16, 20)}-${hex.substring(20, 32)}`;
}
/**
* Enhanced hash function that produces hex output suitable for UUID generation
*/
simpleHash(str) {
let hash = 0;
for (let i = 0; i < str.length; i++) {
const char = str.charCodeAt(i);
hash = ((hash << 5) - hash) + char;
hash = hash & hash; // Convert to 32-bit integer
}
// Convert to positive hex and pad to ensure we have enough digits
const hashStr = Math.abs(hash).toString(16);
// Create a longer hex string by repeating and hashing if needed
let result = hashStr;
while (result.length < 32) {
// Add more entropy by hashing the current result with original string
let newHash = 0;
const combined = result + str;
for (let i = 0; i < combined.length; i++) {
const char = combined.charCodeAt(i);
newHash = ((newHash << 5) - newHash) + char;
newHash = newHash & newHash;
}
result += Math.abs(newHash).toString(16);
}
return result;
}
/**
 * Heuristic test-file detection by path pattern.
 *
 * NOTE(review): the bare /test/ and /spec/ patterns match ANY path merely
 * containing those substrings (e.g. "latest.js", "inspector.py"), so this
 * flag can be a false positive — confirm whether tightening to
 * path-segment matches is acceptable before relying on it.
 */
isTestFile(filePath) {
    const testPatterns = [
        /\.test\./,
        /\.spec\./,
        /test/,
        /spec/,
        /__tests__/
    ];
    return testPatterns.some(pattern => pattern.test(filePath));
}
/**
* Calculate code complexity (simple metric)
*/
calculateComplexity(content) {
const complexityKeywords = [
'if', 'else', 'for', 'while', 'do', 'switch', 'case',
'try', 'catch', 'finally', 'throw', 'return'
];
let complexity = 1; // Base complexity
for (const keyword of complexityKeywords) {
const matches = content.match(new RegExp(`\\b${keyword}\\b`, 'g'));
if (matches) {
complexity += matches.length;
}
}
return complexity;
}
/**
* Extract dependencies/imports from content
*/
extractDependencies(content, language) {
const deps = [];
switch (language) {
case 'javascript':
case 'typescript':
const jsImports = content.match(/import\s+.*?\s+from\s+['"]([^'"]+)['"]/g);
if (jsImports) {
jsImports.forEach(imp => {
const match = imp.match(/from\s+['"]([^'"]+)['"]/);
if (match)
deps.push(match[1]);
});
}
break;
case 'python':
const pyImports = content.match(/(?:from\s+(\S+)\s+import|import\s+(\S+))/g);
if (pyImports) {
pyImports.forEach(imp => {
const match = imp.match(/(?:from\s+(\S+)\s+import|import\s+(\S+))/);
if (match)
deps.push(match[1] || match[2]);
});
}
break;
}
return deps;
}
/**
* Extract exports from content
*/
extractExports(content, language) {
const exports = [];
switch (language) {
case 'javascript':
case 'typescript':
const jsExports = content.match(/export\s+(?:default\s+)?(?:function\s+|class\s+|const\s+|let\s+|var\s+)?([a-zA-Z_$][a-zA-Z0-9_$]*)/g);
if (jsExports) {
jsExports.forEach(exp => {
const match = exp.match(/export\s+(?:default\s+)?(?:function\s+|class\s+|const\s+|let\s+|var\s+)?([a-zA-Z_$][a-zA-Z0-9_$]*)/);
if (match)
exports.push(match[1]);
});
}
break;
}
return exports;
}
/**
 * Extract imports from content.
 * Currently an alias of extractDependencies (same regex-based extraction).
 */
extractImports(content, language) {
    return this.extractDependencies(content, language);
}
/**
 * Initialize language configurations.
 *
 * Registers one chunking config per supported language. Note that the
 * single jsConfig object is shared by the 'javascript', 'typescript' and
 * 'tsx' keys (its `name` field stays 'javascript' for all three).
 */
initializeLanguageConfigs() {
    // JavaScript/TypeScript configuration
    const jsConfig = {
        name: 'javascript',
        extensions: ['.js', '.jsx', '.ts', '.tsx'],
        grammar: 'javascript',
        // Node types whose matches become chunks, with optional name extraction.
        chunkStrategies: [
            {
                nodeType: 'function_declaration',
                chunkType: ChunkType.FUNCTION,
                nameExtractor: (node) => this.extractFunctionName(node)
            },
            {
                nodeType: 'class_declaration',
                chunkType: ChunkType.CLASS,
                nameExtractor: (node) => this.extractClassName(node)
            },
            {
                nodeType: 'interface_declaration',
                chunkType: ChunkType.INTERFACE,
                nameExtractor: (node) => this.extractInterfaceName(node)
            },
            {
                nodeType: 'method_definition',
                chunkType: ChunkType.FUNCTION,
                nameExtractor: (node) => this.extractMethodName(node)
            },
            {
                nodeType: 'arrow_function',
                chunkType: ChunkType.FUNCTION,
                nameExtractor: (node) => this.extractFunctionName(node)
            }
        ],
        keywords: ['function', 'class', 'interface', 'const', 'let', 'var', 'import', 'export'],
        commentPatterns: ['//', '/*', '*/'],
        astNodeMappings: {},
        contextualChunking: false,
        supportsSparseSearch: true
    };
    // Python configuration
    const pyConfig = {
        name: 'python',
        extensions: ['.py'],
        grammar: 'python',
        chunkStrategies: [
            {
                nodeType: 'function_definition',
                chunkType: ChunkType.FUNCTION,
                nameExtractor: (node) => this.extractFunctionName(node)
            },
            {
                nodeType: 'class_definition',
                chunkType: ChunkType.CLASS,
                nameExtractor: (node) => this.extractClassName(node)
            }
        ],
        keywords: ['def', 'class', 'import', 'from', 'if', 'else', 'for', 'while'],
        commentPatterns: ['#'],
        astNodeMappings: {},
        contextualChunking: false,
        supportsSparseSearch: true
    };
    // Markdown configuration (structure-aware: headings, code blocks, lists, ...)
    const markdownConfig = {
        name: 'markdown',
        extensions: ['.md', '.markdown'],
        grammar: 'markdown',
        chunkStrategies: [
            {
                nodeType: 'atx_heading',
                chunkType: ChunkType.SECTION,
                nameExtractor: (node) => this.extractMarkdownHeading(node),
                includeContext: true
            },
            {
                nodeType: 'setext_heading',
                chunkType: ChunkType.SECTION,
                nameExtractor: (node) => this.extractMarkdownHeading(node),
                includeContext: true
            },
            {
                nodeType: 'fenced_code_block',
                chunkType: ChunkType.CODE_BLOCK,
                nameExtractor: (node) => this.extractCodeBlockLanguage(node)
            },
            {
                nodeType: 'indented_code_block',
                chunkType: ChunkType.CODE_BLOCK,
                nameExtractor: () => 'code'
            },
            {
                nodeType: 'paragraph',
                chunkType: ChunkType.PARAGRAPH,
                minSize: 50, // Only chunk substantial paragraphs
                maxSize: 2000
            },
            {
                nodeType: 'list',
                chunkType: ChunkType.LIST,
                minSize: 30
            },
            {
                nodeType: 'table',
                chunkType: ChunkType.TABLE,
                nameExtractor: () => 'table'
            },
            {
                nodeType: 'block_quote',
                chunkType: ChunkType.BLOCKQUOTE,
                minSize: 30
            }
        ],
        keywords: ['#', '##', '###', '####', '#####', '######', '```', '---', '***'],
        commentPatterns: ['<!--', '-->'],
        astNodeMappings: {
            'atx_heading': ChunkType.SECTION,
            'setext_heading': ChunkType.SECTION,
            'fenced_code_block': ChunkType.CODE_BLOCK,
            'indented_code_block': ChunkType.CODE_BLOCK,
            'paragraph': ChunkType.PARAGRAPH,
            'list': ChunkType.LIST,
            'table': ChunkType.TABLE,
            'block_quote': ChunkType.BLOCKQUOTE
        },
        contextualChunking: true,
        supportsSparseSearch: true
    };
    this.languageConfigs.set('javascript', jsConfig);
    this.languageConfigs.set('typescript', jsConfig);
    this.languageConfigs.set('tsx', jsConfig);
    this.languageConfigs.set('python', pyConfig);
    this.languageConfigs.set('markdown', markdownConfig);
}
/**
* Extract function name from node
*/
extractFunctionName(node) {
// Simple name extraction - in real implementation, this would be more sophisticated
const nameMatch = node.text.match(/(?:function\s+|def\s+)([a-zA-Z_$][a-zA-Z0-9_$]*)/);
return nameMatch ? nameMatch[1] : 'anonymous';
}
/**
* Extract class name from node
*/
extractClassName(node) {
const nameMatch = node.text.match(/class\s+([a-zA-Z_$][a-zA-Z0-9_$]*)/);
return nameMatch ? nameMatch[1] : 'anonymous';
}
/**
* Extract interface name from node
*/
extractInterfaceName(node) {
const nameMatch = node.text.match(/interface\s+([a-zA-Z_$][a-zA-Z0-9_$]*)/);
return nameMatch ? nameMatch[1] : 'anonymous';
}
/**
* Extract method name from node
*/
extractMethodName(node) {
const nameMatch = node.text.match(/([a-zA-Z_$][a-zA-Z0-9_$]*)\s*\(/);
return nameMatch ? nameMatch[1] : 'anonymous';
}
/**
* Extract markdown heading text from node
*/
extractMarkdownHeading(node) {
// Extract text from ATX headings (# ## ### etc) or setext headings (=== --- underlines)
const text = node.text.trim();
// For ATX headings, remove the # symbols
const atxMatch = text.match(/^#{1,6}\s*(.+?)(?:\s*#*)?$/);
if (atxMatch) {
return atxMatch[1].trim();
}
// For setext headings, take the first line
const setextMatch = text.match(/^(.+?)\n[=\-]+/);
if (setextMatch) {
return setextMatch[1].trim();
}
// Fallback to first line
return text.split('\n')[0].trim();
}
/**
* Extract code block language from fenced code block
*/
extractCodeBlockLanguage(node) {
const text = node.text.trim();
// Extract language from fenced code block (```language)
const langMatch = text.match(/^```\s*([a-zA-Z0-9_+-]*)/);
if (langMatch && langMatch[1]) {
return langMatch[1];
}
return 'code';
}
}
//# sourceMappingURL=code-parser.js.map