UNPKG

remcode

Version:

Turn your AI assistant into a codebase expert. Intelligent code analysis, semantic search, and software engineering guidance through MCP integration.

372 lines (371 loc) 16.3 kB
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); exports.ChunkingManager = void 0; const logger_1 = require("../../utils/logger"); const path = __importStar(require("path")); const documents_1 = require("@langchain/core/documents"); const text_splitter_1 = require("langchain/text_splitter"); const text_splitter_2 = require("langchain/text_splitter"); const logger = (0, logger_1.getLogger)('ChunkingManager'); /** * Manages code chunking with various strategies tailored to different code types */ class ChunkingManager { /** * Creates a new ChunkingManager with the specified strategy * @param strategy Chunking strategy configuration */ constructor(strategy) { this.strategy = strategy; // Map file extensions to LangChain supported languages this.languageMap = { ts: 'js', // Use 'js' for TypeScript (LangChain's closest match) js: 'js', jsx: 'js', tsx: 'js', py: 'python', java: 'java', cs: 'java', // Map C# to Java as approximation cpp: 'cpp', c: 'cpp', go: 'go', rb: 'ruby', php: 'php', rs: 'rust', swift: 'swift', kt: 'scala', // Map Kotlin to Scala as approximation scala: 'scala', md: 'markdown', html: 'html', css: 'html', // Map CSS to HTML as approximation json: 'markdown', // Map JSON to markdown as approximation yaml: 'markdown', // Map YAML to markdown as approximation yml: 'markdown', sql: 'markdown', // Map SQL to markdown as approximation sh: 'markdown', // Map shell scripts to markdown as approximation bash: 'markdown', }; } /** * Chunks a file's content based on the specified strategy * @param content The file content to chunk * @param strategy The chunking strategy to apply * @param fileInfo Information about the file * @returns An array of code chunks */ async chunkFile(content, strategy, fileInfo) { logger.info(`Chunking file with strategy: ${strategy} - ${fileInfo.file_path}`); if (!content || content.trim() === '') { logger.warn(`Empty content for file: ${fileInfo.file_path}`); return []; } // Determine language for better chunking const extension = path.extname(fileInfo.file_path).substring(1).toLowerCase(); const language = this.languageMap[extension] || 'text'; try { // Apply the appropriate chunking strategy switch (strategy) { case 'function_level': return await this.chunkByFunction(content, fileInfo, language); case 'class_level': return await this.chunkByClass(content, fileInfo, language); case 'file_level': return await this.chunkAsFile(content, fileInfo, language); case 'sliding_window': return await this.chunkBySlidingWindow(content, fileInfo, language, 0.1); // 10% overlap case 'sliding_window_with_overlap': return await this.chunkBySlidingWindow(content, fileInfo, language, 0.25); // 25% overlap case 'sliding_window_with_high_overlap': return await this.chunkBySlidingWindow(content, fileInfo, language, 0.5); // 50% overlap default: logger.warn(`Unknown chunking strategy: ${strategy}, falling back to sliding window`); return await this.chunkBySlidingWindow(content, fileInfo, language, 0.1); } } catch (error) { logger.error(`Error chunking file ${fileInfo.file_path}: ${error instanceof Error ? error.message : String(error)}`); // Fallback to a simple chunking approach if the advanced methods fail return this.fallbackChunking(content, fileInfo); } } /** * Chunks code by function boundaries */ async chunkByFunction(content, fileInfo, language) { const chunks = []; // Use regex patterns to identify function boundaries based on language const functionPatterns = { typescript: /(?:export\s+)?(?:async\s+)?function\s+([\w$]+)\s*\([^)]*\)\s*(?::\s*[^{]+)?\s*\{([\s\S]*?)\}/g, javascript: /(?:export\s+)?(?:async\s+)?function\s+([\w$]+)\s*\([^)]*\)\s*\{([\s\S]*?)\}/g, python: /def\s+([\w_]+)\s*\([^)]*\)\s*(?:->\s*[^:]+)?\s*:([\s\S]*?)(?=\n\s*def|\n\s*class|$)/g, java: /(?:public|private|protected|static|final|abstract|synchronized|native)*\s+(?:[\w<>\[\]]+)\s+([\w$]+)\s*\([^)]*\)\s*(?:throws\s+[\w\s,]+)?\s*\{([\s\S]*?)\}/g, 'default': /function\s+([\w$]+)\s*\([^)]*\)\s*\{([\s\S]*?)\}/g }; const pattern = functionPatterns[language] || functionPatterns['default']; // Extract methods/functions using regex let match; let lastIndex = 0; while ((match = pattern.exec(content)) !== null) { const functionName = match[1]; const functionBody = match[0]; // Full function including signature const startIndex = match.index; const endIndex = pattern.lastIndex; // Calculate line numbers const contentBeforeFunction = content.substring(0, startIndex); const startLine = contentBeforeFunction.split('\n').length; const endLine = startLine + functionBody.split('\n').length - 1; chunks.push({ content: functionBody, metadata: { ...fileInfo, file_path: fileInfo.file_path, strategy: 'function_level', language, start_line: startLine, end_line: endLine, function_name: functionName, chunk_type: 'function' } }); lastIndex = endIndex; } // If no functions were found or there's significant code outside functions, // add the remaining content as a separate chunk if (chunks.length === 0 || content.length - lastIndex > 200) { const remainingContent = content.substring(lastIndex); if (remainingContent.trim().length > 0) { const startLine = content.substring(0, lastIndex).split('\n').length; const endLine = content.split('\n').length; chunks.push({ content: remainingContent, metadata: { ...fileInfo, file_path: fileInfo.file_path, strategy: 'function_level', language, start_line: startLine, end_line: endLine, chunk_type: 'code_segment' } }); } } return chunks; } /** * Chunks code by class boundaries */ async chunkByClass(content, fileInfo, language) { const chunks = []; // Class patterns for different languages const classPatterns = { typescript: /(?:export\s+)?(?:abstract\s+)?class\s+([\w$]+)(?:\s+extends\s+[\w$.]+)?(?:\s+implements\s+[\w$.]+(?:\s*,\s*[\w$.]+)*)?\s*\{([\s\S]*?)\}/g, javascript: /(?:export\s+)?class\s+([\w$]+)(?:\s+extends\s+[\w$.]+)?\s*\{([\s\S]*?)\}/g, python: /class\s+([\w_]+)\s*(?:\([^)]*\))?\s*:([\s\S]*?)(?=\n\s*(?:class|def|$))/g, java: /(?:public|private|protected|abstract|final)*\s+class\s+([\w$]+)(?:\s+extends\s+[\w$.]+)?(?:\s+implements\s+[\w$.]+(?:\s*,\s*[\w$.]+)*)?\s*\{([\s\S]*?)\}/g, 'default': /class\s+([\w$]+)(?:\s+extends\s+[\w$.]+)?\s*\{([\s\S]*?)\}/g }; const pattern = classPatterns[language] || classPatterns['default']; // Extract classes using regex let match; let lastIndex = 0; while ((match = pattern.exec(content)) !== null) { const className = match[1]; const classBody = match[0]; // Full class including declaration const startIndex = match.index; const endIndex = pattern.lastIndex; // Calculate line numbers const contentBeforeClass = content.substring(0, startIndex); const startLine = contentBeforeClass.split('\n').length; const endLine = startLine + classBody.split('\n').length - 1; chunks.push({ content: classBody, metadata: { ...fileInfo, file_path: fileInfo.file_path, strategy: 'class_level', language, start_line: startLine, end_line: endLine, class_name: className, chunk_type: 'class' } }); lastIndex = endIndex; } // If no classes were found or there's significant code outside classes, // add the remaining content as a separate chunk if (chunks.length === 0 || content.length - lastIndex > 200) { const remainingContent = content.substring(lastIndex); if (remainingContent.trim().length > 0) { const startLine = content.substring(0, lastIndex).split('\n').length; const endLine = content.split('\n').length; chunks.push({ content: remainingContent, metadata: { ...fileInfo, file_path: fileInfo.file_path, strategy: 'class_level', language, start_line: startLine, end_line: endLine, chunk_type: 'code_segment' } }); } } return chunks; } /** * Treats the entire file as one chunk */ async chunkAsFile(content, fileInfo, language) { return [{ content, metadata: { ...fileInfo, file_path: fileInfo.file_path, strategy: 'file_level', language, start_line: 1, end_line: content.split('\n').length, chunk_type: 'file' } }]; } /** * Chunks content using a sliding window approach with configurable overlap */ async chunkBySlidingWindow(content, fileInfo, language, overlap) { const chunkSize = this.determineChunkSize(content, language); // Use LangChain's text splitters for code-aware chunking let splitter; const langchainLanguage = language; // Make sure the language is actually supported if (Object.values(this.languageMap).includes(langchainLanguage) && ['cpp', 'go', 'java', 'js', 'php', 'proto', 'python', 'rst', 'ruby', 'rust', 'scala', 'swift', 'markdown', 'latex', 'html', 'sol'].includes(langchainLanguage)) { // Use the language-specific recursive character text splitter splitter = text_splitter_1.RecursiveCharacterTextSplitter.fromLanguage(langchainLanguage, { chunkSize, chunkOverlap: Math.floor(chunkSize * overlap) }); } else { // Fall back to token-based splitting for unsupported languages splitter = new text_splitter_2.TokenTextSplitter({ chunkSize: Math.floor(chunkSize / 4), // Tokens are roughly 4 chars chunkOverlap: Math.floor((chunkSize / 4) * overlap) }); } // Create document from content const doc = new documents_1.Document({ pageContent: content, metadata: { ...fileInfo, file_path: fileInfo.file_path, language, } }); // Split the document const docs = await splitter.splitDocuments([doc]); // Convert LangChain documents to our CodeChunk format return docs.map((doc, index) => { // Calculate approximate line numbers const docStartIndex = content.indexOf(doc.pageContent); const contentBeforeChunk = content.substring(0, docStartIndex); const startLine = contentBeforeChunk.split('\n').length; const endLine = startLine + doc.pageContent.split('\n').length - 1; return { content: doc.pageContent, metadata: { ...fileInfo, ...doc.metadata, file_path: fileInfo.file_path, strategy: `sliding_window_${overlap === 0.1 ? '' : overlap === 0.25 ? 'with_overlap' : 'with_high_overlap'}`, language, start_line: startLine, end_line: endLine, chunk_index: index, chunk_type: 'sliding_window' } }; }); } /** * Simple fallback chunking strategy when advanced methods fail */ fallbackChunking(content, fileInfo) { const chunks = []; const lines = content.split('\n'); const chunkSize = 50; // Lines per chunk for (let i = 0; i < lines.length; i += chunkSize) { const chunkLines = lines.slice(i, i + chunkSize); const chunkContent = chunkLines.join('\n'); if (chunkContent.trim().length > 0) { chunks.push({ content: chunkContent, metadata: { ...fileInfo, file_path: fileInfo.file_path, strategy: 'fallback', start_line: i + 1, end_line: Math.min(i + chunkSize, lines.length), chunk_type: 'fallback' } }); } } return chunks; } /** * Determines appropriate chunk size based on content and language */ determineChunkSize(content, language) { const totalLength = content.length; // Base chunk size on content length, with limits if (totalLength < 5000) { return 1000; // Small files get smaller chunks } else if (totalLength < 20000) { return 1500; // Medium files } else { return 2000; // Large files } } } exports.ChunkingManager = ChunkingManager;