mcp-repl
Version:
MCP REPL with code execution, semantic code search, and comprehensive ast-grep integration
1,445 lines (1,222 loc) • 58.5 kB
JavaScript
#!/usr/bin/env node
// Pure JavaScript implementation of code indexing and vector search.
// This implementation avoids native dependencies for better Windows compatibility.
// Force use of the WASM backend to avoid the onnxruntime-node dependency.
process.env.TFJS_BACKEND = 'wasm';
process.env.SHARP = 'false';
// Detect ARM64 architecture for compatibility handling.
// process.arch alone decides this: the former extra "linux && arm64" clause
// could only be true when the first comparison was already true.
const isARM64 = process.arch === 'arm64';
import { createHash } from 'crypto';
import fs from 'fs/promises';
import { existsSync, readFileSync, writeFileSync, mkdirSync, readdirSync } from 'fs';
import path from 'path';
import ignore from 'ignore';
// Global embedding engine variables.
// `pipeline` and `env` are filled from '@xenova/transformers' inside initialize()
// (and reset to null if that import fails); `embeddingEngineRef` holds whichever
// engine object initialize() ended up selecting.
let pipeline, env, embeddingEngineRef;
// Configuration constants
// Default directory (relative to the working directory) where the index is stored.
const INDEX_DIR = './code_search_index';
// Model name handed to the transformers 'feature-extraction' pipeline.
const DEFAULT_MODEL = 'Xenova/all-MiniLM-L6-v2';
const DEFAULT_DIM = 384; // Dimension size for the chosen model
// File extensions (without the leading dot, lowercase) that are eligible for indexing.
const DEFAULT_EXTS = [
'js', 'ts', 'jsx', 'tsx', 'mjs', 'cjs',
'py', 'java', 'go', 'rs', 'c', 'cpp', 'h', 'hpp',
'rb', 'php', 'cs', 'swift', 'kt', 'scala', 'clj',
'sh', 'bash', 'zsh', 'fish', 'ps1', 'bat',
'sql', 'json', 'yaml', 'yml', 'toml', 'xml', 'html',
'css', 'scss', 'sass', 'less', 'styl', 'vue',
'md', 'markdown', 'txt', 'cfg', 'ini', 'conf'
];
// Patterns always added to the ignore filter. The same list is also used as a
// plain name list to skip directories while searching for .gitignore files.
const DEFAULT_IGNORES = [
'node_modules', '.git', '.node_modules',
'dist', 'build', 'coverage', '.nyc_output',
'tmp', 'temp', '.tmp', '.cache', '.parcel-cache',
'.next', '.nuxt', '.vuepress', '.docusaurus',
'public', 'static', 'assets', 'images', 'img',
'.vscode', '.idea', '.DS_Store', 'Thumbs.db',
// Additional build folders
'out', 'output', 'generated', 'gen',
'.angular', '.react', '.svelte-kit',
'storybook-static', 'docs-build', 'build-docs',
'.vite', '.turbo', '.nx', '.swc',
// Common dependency folders
'bower_components', 'jspm_packages', '.pnp',
// Test and coverage folders
'__tests__', '__mocks__', '__snapshots__',
'.jest', '.mocha', '.cypress', '.playwright',
// Lock files and package managers
'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
'.npmrc', '.yarnrc', '.pnpmrc'
];
// File name (inside the index directory) holding the JSON-serialized code chunks.
const INDEX_FILE = 'code_index.json';
// NOTE(review): not referenced in the visible part of this file — confirm it is still used.
const VECTOR_INDEX_FILE = 'vector_index.json';
// Global state
// embedder deprecated - using embeddingEngineRef instead
// In-memory index: `codeChunks` holds the chunk objects and `chunkIds` holds
// their ids at the same positions (it is rebuilt from codeChunks on load).
let codeChunks = [];
let chunkIds = [];
// Set to true by initialize() once the engine and index have been set up.
let isInitialized = false;
/**
 * Compute the cosine similarity between two numeric vectors.
 *
 * A vector with zero magnitude has no direction, so the similarity is
 * reported as 0 instead of the NaN that a division by zero would produce
 * (an all-zero vector is what the hybrid embedding yields for empty text).
 *
 * @param {number[]} vecA - First vector; its length drives the dot product.
 * @param {number[]} vecB - Second vector; expected to have the same length as vecA.
 * @returns {number} Cosine of the angle between the vectors, or 0 when either
 *   vector has zero magnitude.
 */
function cosineSimilarity(vecA, vecB) {
  let dotProduct = 0;
  let squaredA = 0;
  for (let i = 0; i < vecA.length; i++) {
    dotProduct += vecA[i] * vecB[i];
    squaredA += vecA[i] * vecA[i];
  }
  let squaredB = 0;
  for (const b of vecB) {
    squaredB += b * b;
  }
  const denominator = Math.sqrt(squaredA) * Math.sqrt(squaredB);
  // Guard against 0/0 -> NaN, which would poison any ranking built on the score.
  return denominator === 0 ? 0 : dotProduct / denominator;
}
/**
 * Build an ignore filter seeded with DEFAULT_IGNORES plus the contents of every
 * .gitignore file found under rootDir.
 *
 * Sub-directories whose name starts with '.' or appears in DEFAULT_IGNORES are
 * not descended into. Read failures (directory or .gitignore) are ignored.
 *
 * @param {string} rootDir - Directory where the .gitignore search starts.
 * @returns {object} The populated filter created by ignore().
 */
function createIgnoreFilter(rootDir) {
  const filter = ignore();
  filter.add(DEFAULT_IGNORES);

  // Depth-first walk that feeds each .gitignore it meets into the filter.
  function collectGitignores(currentDir) {
    try {
      for (const dirent of readdirSync(currentDir, { withFileTypes: true })) {
        const entryPath = path.join(currentDir, dirent.name);

        if (dirent.isFile() && dirent.name === '.gitignore') {
          try {
            filter.add(readFileSync(entryPath, 'utf8'));
          } catch (readError) {
            // An unreadable .gitignore is skipped on purpose.
          }
          continue;
        }

        const isHidden = dirent.name.startsWith('.');
        const isDefaultIgnored = DEFAULT_IGNORES.includes(dirent.name);
        if (dirent.isDirectory() && !isHidden && !isDefaultIgnored) {
          collectGitignores(entryPath);
        }
      }
    } catch (walkError) {
      // An unreadable directory is skipped on purpose.
    }
  }

  collectGitignores(rootDir);
  return filter;
}
/**
 * Decide whether a file should be indexed.
 *
 * A file qualifies when its extension is in allowedExtensions and its name is
 * not one of the known generated/config/documentation files.
 *
 * Exclusions are matched structurally rather than by substring, so names that
 * merely contain an excluded fragment (e.g. "index.mapper.js" containing
 * ".map", or "licenseChecker.js" containing "license") are no longer rejected:
 *   - suffix patterns must end the file name;
 *   - whole-name patterns must equal the file name or be followed by a dot
 *     (so "license" still covers "license.md" / "license.txt").
 *
 * @param {string} filePath - Path of the candidate file.
 * @param {string[]} allowedExtensions - Lowercase extensions without the dot.
 * @returns {boolean} true when the file should be indexed.
 */
function shouldIndexFile(filePath, allowedExtensions) {
  const ext = path.extname(filePath).slice(1).toLowerCase();
  if (!ext || !allowedExtensions.includes(ext)) {
    return false;
  }

  const filename = path.basename(filePath).toLowerCase();

  // Matched against the end of the file name.
  const excludedSuffixes = [
    '.min.js', '.bundle.js', '.pack.js', // Minified/bundled JS
    '.d.ts', '.d.tsx', // TypeScript definitions
    '.map', '.css.map', '.js.map' // Source maps
  ];
  // Matched against the whole file name (optionally followed by ".<something>").
  const excludedNames = [
    'package.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml', // Package files
    'tsconfig.json', 'jsconfig.json', // Config files
    '.eslintrc.json', '.prettierrc.json', // Linter configs
    'license', 'readme.md', 'changelog.md', // Documentation
    'dockerfile', 'docker-compose.yml' // Docker files
  ];

  if (excludedSuffixes.some((suffix) => filename.endsWith(suffix))) {
    return false;
  }
  return !excludedNames.some(
    (name) => filename === name || filename.startsWith(`${name}.`)
  );
}
/**
 * Initialize the in-memory index and the embedding engine.
 *
 * Engines are tried in this order and the first one that loads is kept:
 *   1. '@tensorflow/tfjs' + universal-sentence-encoder (skipped when isARM64);
 *   2. '@xenova/transformers' feature-extraction pipeline for DEFAULT_MODEL,
 *      restricted to local model files;
 *   3. a dependency-free "hybrid" engine that hashes word/bigram/trigram
 *      scores into a 384-element unit vector.
 * The chosen engine is stored in embeddingEngineRef. Afterwards any existing
 * INDEX_FILE in indexDir is loaded into codeChunks/chunkIds; an unreadable or
 * unparsable file results in an empty index.
 *
 * @param {string} [indexDir=INDEX_DIR] - Directory holding the persisted index;
 *   created if missing.
 * @returns {Promise<boolean>} true when initialization completed (or had
 *   already completed), false when an unexpected error was caught.
 */
export async function initialize(indexDir = INDEX_DIR) {
// Already initialized: nothing to redo.
if (isInitialized) return true;
try {
// Create index directory if it doesn't exist
if (!existsSync(indexDir)) {
mkdirSync(indexDir, { recursive: true });
}
// Initialize compatible embedding engine for all architectures
// Use safer approach for ARM64 to prevent memory corruption
let embeddingEngine = null;
// Skip TensorFlow.js on ARM64 to avoid memory corruption issues
if (!isARM64) {
try {
const tf = await import('@tensorflow/tfjs');
await tf.setBackend('wasm');
const use = await import('@tensorflow-models/universal-sentence-encoder');
// NOTE(review): the vector length produced by this model is not checked
// against DEFAULT_DIM or the 384 used by the hybrid engine — confirm.
embeddingEngine = {
type: 'tfjs',
model: await use.load(),
embed: async (text) => {
const embeddings = await embeddingEngine.model.embed([text]);
return embeddings.arraySync()[0];
}
};
console.log('[DEBUG] TensorFlow.js embedding engine loaded');
} catch (tfError) {
// Any import/backend/model failure just moves on to the next engine.
console.log(`[DEBUG] TensorFlow.js not available: ${tfError.message}`);
}
} else {
console.log('[DEBUG] Skipping TensorFlow.js on ARM64 to prevent memory issues');
}
// Fallback to transformers with ARM64-safe configuration
if (!embeddingEngine) {
try {
const transformers = await import('@xenova/transformers');
pipeline = transformers.pipeline;
env = transformers.env;
// ARM64-safe configuration for transformers based on official ONNX Runtime docs
if (env) {
env.backends = env.backends || {};
env.backends.onnx = env.backends.onnx || {};
env.backends.onnx.wasm = env.backends.onnx.wasm || {};
env.backends.onnx.wasm.numThreads = isARM64 ? 1 : 4; // Single thread on ARM64
env.backends.onnx.wasm.simd = !isARM64; // Disable SIMD on ARM64
env.allowRemoteModels = false; // Use local models only
env.allowLocalModels = true;
env.useBrowserCache = false; // Disable cache to prevent corruption
env.cacheDir = null; // No cache directory
// Critical ARM64 fixes for memory corruption
// (assigned on every architecture, not only when isARM64 is true)
env.sessionOptions = {
enableCpuMemArena: false, // Disable memory arena to prevent corruption
enableMemPattern: false, // Disable memory pattern optimization
graphOptimizationLevel: 'basic' // Use basic optimization only
};
}
embeddingEngine = {
type: 'transformers',
pipeline: pipeline,
model: await pipeline('feature-extraction', DEFAULT_MODEL, {
quantized: false, // Avoid quantization issues
device: 'wasm',
cache_dir: null, // No cache to prevent corruption
local_files_only: true, // Use only local models
trust_remote_code: false, // Security precaution
// Pass the ARM64 session options to prevent memory corruption
session_options: isARM64 ? {
enableCpuMemArena: false,
enableMemPattern: false,
graphOptimizationLevel: 'basic'
} : undefined
}),
embed: async (text) => {
try {
const result = await embeddingEngine.model(text, {
pooling: 'mean',
normalize: true
});
// NOTE(review): a non-array result is returned as-is, so callers may
// receive the pipeline's own result object rather than a plain array — confirm.
return Array.isArray(result) ? result[0] : result;
} catch (embedError) {
throw new Error(`Transformers embedding failed: ${embedError.message}`);
}
}
};
console.log('[DEBUG] Transformers embedding engine loaded with safe config');
} catch (transformerError) {
// Import or model load failed: clear the globals and fall through to the hybrid engine.
console.log(`[DEBUG] Transformers not available: ${transformerError.message}`);
pipeline = null;
env = null;
}
}
// High-performance fallback embedding engine
if (!embeddingEngine) {
console.log('[DEBUG] Using advanced hybrid embedding engine');
embeddingEngine = {
type: 'hybrid',
embed: async (text) => {
// Advanced hybrid embedding combining multiple techniques
// Lowercase, turn every non-word/non-space character into a space, collapse whitespace.
const processedText = text.toLowerCase()
.replace(/[^\w\s]/g, ' ')
.replace(/\s+/g, ' ')
.trim();
// Extract semantic features
// Single-character words are dropped.
const words = processedText.split(' ').filter(w => w.length > 1);
const wordFreq = {};
const bigrams = [];
const trigrams = [];
// Generate n-grams
for (let i = 0; i < words.length; i++) {
const word = words[i];
wordFreq[word] = (wordFreq[word] || 0) + 1;
if (i < words.length - 1) {
bigrams.push(`${words[i]} ${words[i + 1]}`);
}
if (i < words.length - 2) {
trigrams.push(`${words[i]} ${words[i + 1]} ${words[i + 2]}`);
}
}
// TF-IDF-like scoring with position weighting
const features = new Map();
const totalWords = words.length;
// Process unigrams with TF-IDF weighting
Object.entries(wordFreq).forEach(([word, freq]) => {
const tf = freq / totalWords;
const idf = Math.log(1 + (8 / word.length)); // Simple IDF approximation
const positionBoost = words.indexOf(word) / totalWords; // Grows with the index of the word's first occurrence, so words first seen later get the larger boost (NOTE(review): the previous comment claimed earlier words were boosted)
const score = tf * idf * (1 + positionBoost * 0.2);
features.set(word, score);
});
// Process bigrams with higher weight
const bigramFreq = {};
bigrams.forEach(bg => {
bigramFreq[bg] = (bigramFreq[bg] || 0) + 1;
});
Object.entries(bigramFreq).forEach(([bigram, freq]) => {
const score = (freq / bigrams.length) * 1.5; // Boost bigrams
features.set(bigram, score);
});
// Process trigrams with highest weight
const trigramFreq = {};
trigrams.forEach(tg => {
trigramFreq[tg] = (trigramFreq[tg] || 0) + 1;
});
Object.entries(trigramFreq).forEach(([trigram, freq]) => {
const score = (freq / trigrams.length) * 2.0; // Boost trigrams
features.set(trigram, score);
});
// Create 384-dimensional embedding with intelligent distribution
// (384 is hard-coded here; it equals DEFAULT_DIM but does not reference it)
const vector = new Array(384).fill(0);
const featureArray = Array.from(features.entries()).sort((a, b) => b[1] - a[1]);
// Distribute features using multiple hash functions for better spread
featureArray.forEach(([feature, score], index) => {
// Use multiple hash functions to distribute feature importance
const hash1 = simpleHash(feature, 1);
const hash2 = simpleHash(feature, 2);
const hash3 = simpleHash(feature, 3);
// Primary position
const pos1 = hash1 % 384;
vector[pos1] += score;
// Secondary positions for better semantic spread
const pos2 = (hash2 % 192) + 96; // Indices 96..287
vector[pos2] += score * 0.7;
// Tertiary positions
const pos3 = (hash3 % 96) + 288; // Indices 288..383
vector[pos3] += score * 0.5;
});
// Apply activation function (ReLU-like)
for (let i = 0; i < vector.length; i++) {
vector[i] = Math.max(0, vector[i]);
}
// Normalize to unit vector
// (text with no usable words leaves the vector all zeros, which is returned unchanged)
const magnitude = Math.sqrt(vector.reduce((sum, val) => sum + val * val, 0));
return magnitude > 0 ? vector.map(v => v / magnitude) : vector;
}
};
}
// Simple hash function for feature distribution
// Declared after the code that references it; this works because function
// declarations are hoisted within their enclosing block.
function simpleHash(str, seed = 1) {
let hash = seed;
for (let i = 0; i < str.length; i++) {
const char = str.charCodeAt(i);
hash = ((hash << 5) - hash) + char;
hash = hash & hash; // Convert to 32-bit integer
}
return Math.abs(hash);
}
// Store embedding engine globally
embeddingEngineRef = embeddingEngine;
// Old embedder initialization removed - now using embeddingEngineRef
// Load existing index if available
const indexPath = path.join(indexDir, INDEX_FILE);
if (existsSync(indexPath)) {
try {
const data = readFileSync(indexPath, 'utf8');
codeChunks = JSON.parse(data);
chunkIds = codeChunks.map(chunk => chunk.id);
} catch (error) {
// Unreadable or invalid index file: start from an empty index.
codeChunks = [];
chunkIds = [];
}
} else {
codeChunks = [];
chunkIds = [];
}
isInitialized = true;
return true;
} catch (error) {
// The error is not logged; isInitialized stays false so a later call retries.
return false;
}
}
/**
 * Recursively collect the files under `dir` that should be indexed.
 *
 * Ignore rules are evaluated against the path relative to the directory the
 * walk started from (`rootDir`), written with '/' separators, and directories
 * are tested with a trailing '/'. This is the form gitignore-style matchers
 * expect; testing only the bare entry name meant path-based patterns
 * (e.g. "src/generated") and directory-only patterns (e.g. "build/") never
 * matched below the top level.
 *
 * Files are kept when shouldIndexFile() accepts them and they are at most
 * 200KB. A file that cannot be stat'ed is skipped without abandoning the rest
 * of its directory; an unreadable directory contributes no files.
 *
 * @param {string} dir - Directory to scan.
 * @param {string[]} [exts=DEFAULT_EXTS] - Allowed extensions (no dot).
 * @param {string[]} [ignores=DEFAULT_IGNORES] - Kept for interface
 *   compatibility and forwarded on recursion; the filtering itself is done by
 *   `ignoreFilter`.
 * @param {object|null} [ignoreFilter=null] - Object exposing `ignores(path)`;
 *   created via createIgnoreFilter(dir) when omitted.
 * @param {string} [rootDir=dir] - Directory that ignore patterns are relative
 *   to. Callers normally omit it; it is threaded through the recursion.
 * @returns {Promise<string[]>} Paths of the files to index.
 */
export async function gatherFiles(dir, exts = DEFAULT_EXTS, ignores = DEFAULT_IGNORES, ignoreFilter = null, rootDir = dir) {
  const MAX_FILE_SIZE = 200 * 1024; // 200KB limit
  const results = [];
  try {
    // Create robust ignore filter if not provided
    const filter = ignoreFilter || createIgnoreFilter(dir);
    const entries = await fs.readdir(dir, { withFileTypes: true });

    for (const entry of entries) {
      const full = path.join(dir, entry.name);
      const isDirectory = entry.isDirectory();

      // Root-relative, '/'-separated path; fall back to the bare name if the
      // entry is somehow outside rootDir (matchers reject "../" paths).
      let relativePath = path.relative(rootDir, full).split(path.sep).join('/');
      if (!relativePath || relativePath.startsWith('..')) {
        relativePath = entry.name;
      }
      if (filter.ignores(isDirectory ? `${relativePath}/` : relativePath)) {
        continue;
      }

      if (isDirectory) {
        // Recurse with the same filter and the same root.
        const subDirFiles = await gatherFiles(full, exts, ignores, filter, rootDir);
        results.push(...subDirFiles);
        continue;
      }

      // Check if this file should be indexed based on extension and content type
      if (!shouldIndexFile(full, exts)) {
        continue;
      }

      // Skip files larger than the size limit; a failing stat skips only this file.
      try {
        const stat = await fs.stat(full);
        if (stat.size > MAX_FILE_SIZE) {
          continue;
        }
      } catch (statError) {
        continue;
      }
      results.push(full);
    }
  } catch (error) {
    // Best effort: an unreadable directory simply yields what was gathered so far.
  }
  return results;
}
/**
 * Collect the comment text found directly above a position in the source.
 *
 * Looks at up to five lines ending at `position` (the partial line before the
 * position included), walking upwards. Blank lines are skipped, comment lines
 * ("//", "/*", "*" prefixes) are gathered with their markers removed, and the
 * first other line stops the walk.
 *
 * @param {string} content - Full file text.
 * @param {number} position - Offset of the element being documented.
 * @returns {string} The gathered comment text in top-to-bottom order, or ''.
 */
function extractDocComment(content, position) {
  const precedingLines = content.substring(0, position).split('\n');
  const lowestIndex = Math.max(0, precedingLines.length - 5);
  const isCommentLine = (text) =>
    text.startsWith('//') || text.startsWith('/*') || text.startsWith('*');

  const collected = [];
  for (let idx = precedingLines.length - 1; idx >= lowestIndex; idx--) {
    const text = precedingLines[idx].trim();
    if (text === '') {
      continue;
    }
    if (!isCommentLine(text)) {
      break;
    }
    // Walking upwards, so prepend to keep the natural reading order.
    collected.unshift(text.replace(/^\/\/|\*\/|\*|\/\*\*?/g, '').trim());
  }
  return collected.join(' ').trim();
}
/**
 * Parse the parameter list out of a function or method signature.
 *
 * The text between the first "(" and the next ")" is split on commas. For each
 * piece, braces/brackets are stripped, anything after "=" is dropped, and a
 * ":" separates the name from an optional type annotation.
 *
 * @param {string} signature - Signature text containing a parenthesized list.
 * @returns {{name: string, type: string}[]} One entry per named parameter.
 */
function extractParameters(signature) {
  const listMatch = signature.match(/\((.*?)\)/);
  if (!listMatch || !listMatch[1]) {
    return [];
  }
  const rawList = listMatch[1].trim();
  if (!rawList) {
    return [];
  }

  const parsed = [];
  for (const piece of rawList.split(',')) {
    // Drop destructuring punctuation, then any default value.
    const withoutBrackets = piece.trim().replace(/[{}[\]]/g, '');
    const beforeDefault = withoutBrackets.split('=')[0].trim();
    // A colon introduces a TypeScript-style type annotation.
    const segments = beforeDefault.split(':');
    const name = segments[0].trim();
    const type = segments.length > 1 ? segments[1].trim() : '';
    if (name && name !== '') {
      parsed.push({ name, type });
    }
  }
  return parsed;
}
/**
 * Determine a function's return type from its signature (TypeScript style).
 *
 * @param {string} signature - Signature text; a ": Type" after the first ")"
 *   is treated as the declared return type.
 * @param {string} code - Function source, scanned for `return <expr>` when no
 *   annotation is present.
 * @returns {string} The declared type, 'inferred' when the code has at least
 *   one value-returning statement, otherwise ''.
 */
function extractReturnType(signature, code) {
  // Declared annotation takes precedence.
  const annotation = signature.match(/\)(?:\s*:\s*([^{]+))?/);
  const declaredType = annotation ? annotation[1] : undefined;
  if (declaredType) {
    return declaredType.trim();
  }

  // Otherwise only report that something is returned; no type inference is attempted.
  const returnStatements = code.match(/return\s+([^;]+)/g);
  const hasReturns = Boolean(returnStatements) && returnStatements.length > 0;
  return hasReturns ? 'inferred' : '';
}
/**
 * Report whether the declaration at `position` sits on a line carrying the
 * `export` keyword.
 *
 * The whole source line containing `position` is inspected. Looking only at
 * the text before `position` misses the common case where `position` is the
 * offset of the `export` keyword itself (e.g. the start of a match for
 * "export function foo"), which made exported declarations report false.
 *
 * @param {string} content - Full file text.
 * @param {number} position - Offset of the declaration within `content`.
 * @returns {boolean} true when the line contains `export` followed by whitespace.
 */
function isExported(content, position) {
  const lineStart = position > 0 ? content.lastIndexOf('\n', position - 1) + 1 : 0;
  const newlineAfter = content.indexOf('\n', position);
  const lineEnd = newlineAfter === -1 ? content.length : newlineAfter;
  const currentLine = content.substring(lineStart, lineEnd);
  return /\bexport\s/.test(currentLine);
}
/**
 * Approximate the number of tokens in a piece of code.
 *
 * Counts word-like runs plus individual punctuation/operator characters from a
 * fixed set. Non-string or blank input counts as zero.
 *
 * @param {string} code - Source text to measure.
 * @returns {number} Approximate token count.
 */
function calculateTokenCount(code) {
  const isUsable = Boolean(code) && typeof code === 'string';
  if (!isUsable) {
    return 0;
  }
  const trimmed = code.trim();
  if (trimmed.length === 0) {
    return 0;
  }
  // Words/numbers, or a single meaningful punctuation character.
  const found = trimmed.match(/\b\w+\b|[{}();,=+\-*\/\[\]<>!&|.]/g);
  return found === null ? 0 : found.length;
}
/**
 * Extract the code structure of a file as a flat list of chunks.
 *
 * Regex-based extraction produces, in this order: one 'file' chunk, then
 * 'function' chunks, then for each class a 'class' chunk followed by its
 * 'method' and 'property' chunks, then 'import'/'export' chunks. Chunks that
 * have relationship data (calls, dependencies, inheritance, imports) get a
 * `relationships` field.
 *
 * Fixes relative to the previous implementation:
 *   - Chunk ids are now a 16-hex-character SHA-1 digest of the full key
 *     (kind, names and file path). Previously the id was the first 16 base64
 *     characters of the key, i.e. only its first ~12 bytes, so for example
 *     every method of a class shared one id. Ids stored by the old scheme will
 *     not match the new ones.
 *   - An accessor keyword (get/set) is only consumed when followed by
 *     whitespace, so a method called `getName` is no longer recorded as `Name`.
 *   - Control-flow headers inside method bodies (`if (...) {`, `for (...) {`,
 *     ...) are no longer recorded as methods.
 *
 * NOTE(review): brace matching counts every "{" / "}" character, including
 * ones inside strings and comments, and the property regex also matches
 * ordinary `name =` / `name;` statements inside method bodies.
 *
 * @param {string} filePath - Path of the file to analyze.
 * @returns {Promise<object[]>} The extracted chunks, or [] when the file
 *   cannot be read or processed.
 */
export async function extractChunks(filePath) {
  // Short, stable id derived from the whole key rather than its prefix.
  const makeId = (key) => createHash('sha1').update(key).digest('hex').substring(0, 16);
  // Keywords the method regex would otherwise capture as method names.
  const NON_METHOD_KEYWORDS = new Set(['if', 'for', 'while', 'switch', 'catch', 'function', 'with']);

  try {
    const content = await fs.readFile(filePath, 'utf-8');
    const stat = await fs.stat(filePath);
    const chunks = [];
    const fileName = path.basename(filePath);
    const fileScope = {
      id: makeId(`file-${filePath}`),
      type: 'file',
      name: fileName,
      path: filePath,
      children: [],
      exports: []
    };
    // Map of element IDs to their relationship data
    const relationships = new Map();

    // Zero-based line index of a character offset.
    const lineIndexAt = (offset) => content.substring(0, offset).split('\n').length - 1;

    // Given the offset just after an opening "{", return the offset just after
    // its matching "}". Returns bodyStart unchanged when no match is found.
    const findBlockEnd = (bodyStart) => {
      let openBraces = 1;
      for (let i = bodyStart; i < content.length; i++) {
        if (content[i] === '{') openBraces++;
        else if (content[i] === '}') openBraces--;
        if (openBraces === 0) {
          return i + 1;
        }
      }
      return bodyStart;
    };

    // Extract functions
    const funcRegex = /(?:export\s+)?(?:async\s+)?function\s+(\w+)\s*\([^)]*\)\s*(?::\s*[^{]+)?\s*{/g;
    let funcMatch;
    while ((funcMatch = funcRegex.exec(content)) !== null) {
      const funcName = funcMatch[1];
      const funcStart = funcMatch.index;
      const funcEnd = findBlockEnd(funcStart + funcMatch[0].length);
      const funcCode = content.substring(funcStart, funcEnd);
      const lines = funcCode.split('\n').length;
      const startPos = lineIndexAt(funcStart);
      const endPos = startPos + lines - 1;
      const docComment = extractDocComment(content, funcStart);
      const isExportedFunc = isExported(content, funcStart);
      const parameters = extractParameters(funcMatch[0]);
      const returnType = extractReturnType(funcMatch[0], funcCode);
      const funcChunk = {
        id: makeId(`function-${funcName}-${filePath}`),
        type: 'function',
        name: funcName,
        qualifiedName: funcName,
        file: filePath,
        startLine: startPos,
        endLine: endPos,
        lines,
        tokens: calculateTokenCount(funcCode),
        code: funcCode,
        mtime: stat.mtimeMs,
        doc: docComment,
        isExported: isExportedFunc,
        parameters,
        returnType,
        complexity: calculateComplexity(funcCode)
      };
      chunks.push(funcChunk);
      fileScope.children.push(funcChunk.id);
      if (isExportedFunc) {
        fileScope.exports.push(funcChunk.id);
      }
      // Store relationships
      relationships.set(funcChunk.id, {
        calls: extractFunctionCalls(funcCode),
        dependencies: extractDependencies(funcCode)
      });
    }

    // Extract classes
    const classRegex = /(?:export\s+)?class\s+(\w+)(?:\s+extends\s+(\w+))?\s*{/g;
    let classMatch;
    while ((classMatch = classRegex.exec(content)) !== null) {
      const className = classMatch[1];
      const extendedClass = classMatch[2] || null;
      const classStart = classMatch.index;
      const classEnd = findBlockEnd(classStart + classMatch[0].length);
      const classCode = content.substring(classStart, classEnd);
      const lines = classCode.split('\n').length;
      const startPos = lineIndexAt(classStart);
      const endPos = startPos + lines - 1;
      const docComment = extractDocComment(content, classStart);
      const isExportedClass = isExported(content, classStart);
      const classChunk = {
        id: makeId(`class-${className}-${filePath}`),
        type: 'class',
        name: className,
        qualifiedName: className,
        parentClass: extendedClass,
        file: filePath,
        startLine: startPos,
        endLine: endPos,
        lines,
        tokens: calculateTokenCount(classCode),
        code: classCode,
        mtime: stat.mtimeMs,
        doc: docComment,
        isExported: isExportedClass,
        methods: [],
        properties: []
      };
      chunks.push(classChunk);
      fileScope.children.push(classChunk.id);
      if (isExportedClass) {
        fileScope.exports.push(classChunk.id);
      }

      // Extract class methods. The accessor keyword requires trailing
      // whitespace so it cannot swallow the start of names like "getName".
      const methodRegex = /(?:async\s+)?(?:static\s+)?(?:(?:get|set)\s+)?(\w+)\s*\([^)]*\)\s*(?::\s*[^{]+)?\s*{/g;
      let methodMatch;
      const methodIds = [];
      while ((methodMatch = methodRegex.exec(classCode)) !== null) {
        const methodName = methodMatch[1];
        // Skip constructor, private methods and control-flow headers
        if (
          methodName === 'constructor' ||
          !methodName.match(/^[a-zA-Z]/) ||
          methodName.startsWith('_') ||
          NON_METHOD_KEYWORDS.has(methodName)
        ) {
          continue;
        }
        const methodStart = classStart + methodMatch.index;
        // Find the method end
        let openBraces = 1;
        let methodEnd = methodStart + methodMatch[0].length;
        for (let i = methodEnd; i < classEnd; i++) {
          if (content[i] === '{') openBraces++;
          else if (content[i] === '}') openBraces--;
          if (openBraces === 0) {
            methodEnd = i + 1;
            break;
          }
          // Don't go past the class boundary
          if (i >= classEnd - 1) {
            methodEnd = classEnd - 1;
            break;
          }
        }
        const methodCode = content.substring(methodStart, methodEnd);
        const methodLines = methodCode.split('\n').length;
        const methodStartPos = lineIndexAt(methodStart);
        const methodEndPos = methodStartPos + methodLines - 1;
        const methodDocComment = extractDocComment(content, methodStart);
        const parameters = extractParameters(methodMatch[0]);
        const returnType = extractReturnType(methodMatch[0], methodCode);
        const isStatic = methodMatch[0].includes('static ');
        const methodId = makeId(`method-${className}-${methodName}-${filePath}`);
        methodIds.push(methodId);
        const methodChunk = {
          id: methodId,
          type: 'method',
          name: methodName,
          qualifiedName: `${className}.${methodName}`,
          parentClass: className,
          parentClassId: classChunk.id,
          file: filePath,
          startLine: methodStartPos,
          endLine: methodEndPos,
          lines: methodLines,
          tokens: calculateTokenCount(methodCode),
          code: methodCode,
          mtime: stat.mtimeMs,
          doc: methodDocComment,
          parameters,
          returnType,
          isStatic,
          complexity: calculateComplexity(methodCode)
        };
        chunks.push(methodChunk);
        classChunk.methods.push(methodId);
        // Store relationships
        relationships.set(methodId, {
          calls: extractFunctionCalls(methodCode),
          dependencies: extractDependencies(methodCode)
        });
      }

      // Extract class properties
      const propertyRegex = /(?:static\s+)?(?:readonly\s+)?(\w+)\s*(?::\s*([^;=]+))?\s*(?:=|;)/g;
      let propertyMatch;
      while ((propertyMatch = propertyRegex.exec(classCode)) !== null) {
        const propName = propertyMatch[1];
        // Skip private properties
        if (propName.startsWith('_') || !propName.match(/^[a-zA-Z]/)) continue;
        const propStart = classStart + propertyMatch.index;
        let propEnd = propStart + propertyMatch[0].length;
        // Find property end (could be a complex assignment)
        if (content[propEnd - 1] !== ';') {
          for (let i = propEnd; i < classEnd; i++) {
            if (content[i] === ';') {
              propEnd = i + 1;
              break;
            }
          }
        }
        const propCode = content.substring(propStart, propEnd);
        const propLines = propCode.split('\n').length;
        const propStartPos = lineIndexAt(propStart);
        const propEndPos = propStartPos + propLines - 1;
        const propType = propertyMatch[2] ? propertyMatch[2].trim() : '';
        const isStatic = propertyMatch[0].includes('static ');
        const propId = makeId(`property-${className}-${propName}-${filePath}`);
        const propChunk = {
          id: propId,
          type: 'property',
          name: propName,
          qualifiedName: `${className}.${propName}`,
          parentClass: className,
          parentClassId: classChunk.id,
          file: filePath,
          startLine: propStartPos,
          endLine: propEndPos,
          lines: propLines,
          tokens: calculateTokenCount(propCode),
          code: propCode,
          mtime: stat.mtimeMs,
          propertyType: propType,
          isStatic
        };
        chunks.push(propChunk);
        classChunk.properties.push(propId);
      }

      // Add inheritance relationships
      if (extendedClass) {
        relationships.set(classChunk.id, {
          inheritsFrom: extendedClass,
          methods: methodIds
        });
      }
    }

    // Extract imports/exports
    const importExportRegex = /(import|export)[\s\S]+?;/g;
    let importMatch;
    while ((importMatch = importExportRegex.exec(content)) !== null) {
      const code = importMatch[0];
      const type = importMatch[1] === 'import' ? 'import' : 'export';
      const lines = code.split('\n').length;
      const startPos = lineIndexAt(importMatch.index);
      const endPos = startPos + lines - 1;
      // Extract imported/exported elements and module
      let modulePath = '';
      let elements = [];
      if (type === 'import') {
        const moduleMatch = code.match(/from\s+['"]([^'"]+)['"]/);
        if (moduleMatch) {
          modulePath = moduleMatch[1];
        }
        const elementsMatch = code.match(/{\s*([^}]+)\s*}/);
        if (elementsMatch) {
          elements = elementsMatch[1].split(',').map(e => e.trim());
        } else {
          // Default import
          const defaultMatch = code.match(/import\s+(\w+)/);
          if (defaultMatch) {
            elements = [defaultMatch[1] + ' (default)'];
          }
        }
      } else {
        // Export
        const namedExport = code.match(/{\s*([^}]+)\s*}/);
        if (namedExport) {
          elements = namedExport[1].split(',').map(e => e.trim());
        } else {
          const defaultExport = code.match(/export\s+default\s+(\w+)/);
          if (defaultExport) {
            elements = [defaultExport[1] + ' (default)'];
          }
        }
      }
      const chunk = {
        // The match offset keeps several imports/exports in one file distinct.
        id: makeId(`${type}-${importMatch.index}-${filePath}`),
        type,
        qualifiedName: code.trim(),
        file: filePath,
        startLine: startPos,
        endLine: endPos,
        lines,
        tokens: calculateTokenCount(code),
        code,
        mtime: stat.mtimeMs,
        doc: '',
        modulePath,
        elements
      };
      chunks.push(chunk);
      if (type === 'import') {
        // Add dependency relationships
        relationships.set(chunk.id, {
          dependsOn: modulePath,
          imports: elements
        });
      }
    }

    // Add file metadata chunk as the first item
    const totalLines = content.split('\n').length;
    const fileChunk = {
      id: fileScope.id,
      type: 'file',
      name: fileName,
      qualifiedName: filePath,
      file: filePath,
      startLine: 0,
      endLine: totalLines - 1,
      lines: totalLines,
      tokens: calculateTokenCount(content),
      code: content.substring(0, Math.min(150, content.length)) + '...',
      mtime: stat.mtimeMs,
      doc: extractFileHeader(content),
      children: fileScope.children,
      exports: fileScope.exports
    };
    chunks.unshift(fileChunk);

    // Add relationships data to chunks
    for (const chunk of chunks) {
      if (relationships.has(chunk.id)) {
        chunk.relationships = relationships.get(chunk.id);
      }
    }
    return chunks;
  } catch (error) {
    // Best effort: a file that cannot be read or parsed contributes no chunks.
    return [];
  }
}
/**
 * Compute a simplified complexity score for a piece of code.
 *
 * Starts at 1, adds 0.5 per control-flow keyword and 0.3 per `&&` / `||`.
 * Keywords are matched as whole words only; without word boundaries,
 * identifiers such as "notify", "format" or "entry" were counted as
 * `if`, `for` and `try`.
 *
 * @param {string} code - Source text to score.
 * @returns {number} Score rounded to one decimal place.
 */
function calculateComplexity(code) {
  let complexity = 1; // Base complexity
  // Count control flow statements (whole-word matches only)
  const controlFlow = (code.match(/\b(?:if|else|for|while|switch|case|catch|try|return|throw)\b/g) || []).length;
  complexity += controlFlow * 0.5;
  // Count logical operators
  const logicalOps = (code.match(/&&|\|\|/g) || []).length;
  complexity += logicalOps * 0.3;
  return parseFloat(complexity.toFixed(1));
}
/**
 * List the distinct names that appear directly before an opening parenthesis.
 *
 * @param {string} code - Source text to scan.
 * @returns {string[]} Called names in order of first appearance, excluding the
 *   keywords if/for/while/switch/catch/function.
 */
function extractFunctionCalls(code) {
  // Keywords that can precede "(" without being calls.
  const nonCallKeywords = ['if', 'for', 'while', 'switch', 'catch', 'function'];
  const seen = new Set();
  for (const hit of code.matchAll(/(\w+)\s*\(/g)) {
    const candidate = hit[1];
    if (nonCallKeywords.includes(candidate)) {
      continue;
    }
    // A Set keeps first-insertion order, which also removes duplicates.
    seen.add(candidate);
  }
  return Array.from(seen);
}
/**
 * List the distinct word tokens in the code that look like variable usage.
 *
 * A word is taken when it is not followed by "(" (optionally after spaces) and
 * not immediately followed by ":"; a small set of keywords and literals is
 * excluded.
 *
 * @param {string} code - Source text to scan.
 * @returns {string[]} Distinct names in order of first appearance.
 */
function extractDependencies(code) {
  // Keywords and common primitives that are never reported.
  const excludedWords = [
    'let', 'const', 'var', 'function', 'class', 'if', 'else', 'return',
    'true', 'false', 'null', 'undefined', 'this', 'super'
  ];
  const names = new Set();
  for (const hit of code.matchAll(/(\b\w+\b)(?!\s*\(|:)/g)) {
    const word = hit[1];
    if (excludedWords.includes(word)) {
      continue;
    }
    names.add(word);
  }
  return Array.from(names);
}
/**
 * Extract the leading comment block of a file.
 *
 * Only the first ten lines are examined. Comment lines ("//", "/*", "*"
 * prefixes) are collected with their markers stripped. Once at least one
 * comment line was collected, blank lines are skipped and the first
 * non-comment line ends the scan; non-comment lines before any comment are
 * passed over.
 *
 * @param {string} content - Full file text.
 * @returns {string} Collected comment text joined with single spaces.
 */
function extractFileHeader(content) {
  const collected = [];
  for (const rawLine of content.split('\n').slice(0, 10)) {
    const text = rawLine.trim();
    const isComment =
      text.startsWith('//') || text.startsWith('/*') || text.startsWith('*');

    if (isComment) {
      collected.push(text.replace(/^\/\/|\*\/|\*|\/\*\*?/g, '').trim());
      continue;
    }
    // Nothing collected yet: keep looking for the header.
    if (collected.length === 0) {
      continue;
    }
    // Header already started: blank lines are tolerated, code ends it.
    if (text !== '') {
      break;
    }
  }
  return collected.join(' ');
}
/**
 * Build the full text representation of a chunk for embedding.
 *
 * Labelled segments are emitted in a fixed order (type/name, documentation,
 * parent class, parameters, return type, complexity, exported flag, calls,
 * inheritance, code) and joined with single spaces. Segments whose source
 * field is falsy are omitted.
 *
 * @param {object} chunk - Chunk object as produced by the extractor.
 * @returns {string} Space-joined description of the chunk.
 */
function createEmbeddingText(chunk) {
  const segments = [`${chunk.type}: ${chunk.name || chunk.qualifiedName || ''}`];
  const addWhen = (condition, buildSegment) => {
    if (condition) {
      segments.push(buildSegment());
    }
  };

  addWhen(chunk.doc, () => `Documentation: ${chunk.doc}`);
  addWhen(chunk.parentClass, () => `In class: ${chunk.parentClass}`);

  // Add structural info
  addWhen(chunk.parameters, () => {
    const described = chunk.parameters.map((param) =>
      param.type ? `${param.name}: ${param.type}` : param.name
    );
    return `Parameters: ${described.join(', ')}`;
  });
  addWhen(chunk.returnType, () => `Returns: ${chunk.returnType}`);
  addWhen(chunk.complexity, () => `Complexity: ${chunk.complexity}`);
  addWhen(chunk.isExported, () => 'Exported: true');

  const related = chunk.relationships;
  if (related) {
    addWhen(related.calls && related.calls.length > 0, () => `Calls: ${related.calls.join(', ')}`);
    addWhen(related.inheritsFrom, () => `Inherits from: ${related.inheritsFrom}`);
  }

  // Strip punctuation so the code segment is mostly identifiers.
  addWhen(chunk.code, () => {
    const withoutPunctuation = chunk.code.replace(/[{};,=()[\]]/g, ' ');
    const collapsed = withoutPunctuation.replace(/\s+/g, ' ').trim();
    return `Code: ${collapsed}`;
  });

  return segments.join(' ');
}
/**
 * Build a compact text representation of a chunk for embedding.
 *
 * Chunks without doc, parameters and code collapse to "<type> <name>".
 * Otherwise the result combines type/name, the first 100 characters of the
 * doc, parameter names (only when there are 1-9 of them) and up to 150
 * characters of punctuation-stripped code (only when the code is shorter than
 * 500 characters), capped at 300 characters overall.
 *
 * @param {object} chunk - Chunk object as produced by the extractor.
 * @returns {string} Text to feed to the embedding engine.
 */
function prepareTextForEmbedding(chunk) {
  const { type, name, doc, parameters, code } = chunk;

  // Fast path for simple chunks
  const hasDetails = Boolean(doc) || Boolean(parameters) || Boolean(code);
  if (!hasDetails) {
    return `${type || ''} ${name || ''}`.trim();
  }

  const segments = [];

  // Identity first, then documentation.
  if (type && name) {
    segments.push(`${type} ${name}`);
  }
  if (doc) {
    segments.push(doc.substring(0, 100));
  }

  // Parameter names only; types are left out.
  if (parameters && parameters.length > 0 && parameters.length < 10) {
    const paramNames = parameters.map((param) => param.name);
    segments.push(paramNames.join(' '));
  }

  // A short excerpt of small code bodies with syntax noise blanked out.
  if (code && code.length < 500) {
    const withoutNoise = code.replace(/[{};,()[\]]/g, ' ');
    const normalized = withoutNoise.replace(/\s+/g, ' ');
    segments.push(normalized.substring(0, 150));
  }

  return segments.join(' ').substring(0, 300);
}
/**
 * Derive up to ten lowercase keywords from the identifiers in a code snippet.
 *
 * Identifiers shorter than three characters or recognized by
 * isCommonKeyword() are skipped. The rest are split on camelCase boundaries
 * and underscores; resulting words are kept when they have at least three
 * characters and are not common keywords.
 *
 * @param {string} code - Source text to scan.
 * @returns {string[]} Distinct keywords in order of first appearance (max 10).
 */
function extractSemanticKeywords(code) {
  const identifiers = code.match(/\b[a-zA-Z_$][a-zA-Z0-9_$]*\b/g) || [];
  const isMeaningful = (word) => word.length >= 3 && !isCommonKeyword(word);

  const collected = new Set();
  identifiers
    .filter((identifier) => !(identifier.length < 3 || isCommonKeyword(identifier)))
    .forEach((identifier) => {
      // Split camelCase and snake_case
      const spaced = identifier.replace(/([a-z])([A-Z])/g, '$1 $2').replace(/_/g, ' ');
      for (const part of spaced.toLowerCase().split(' ')) {
        if (isMeaningful(part)) {
          collected.add(part);
        }
      }
    });

  return [...collected].slice(0, 10);
}
/**
 * Reduce code to a short, comment-free string for embedding.
 *
 * Removes block and line comments, collapses whitespace runs, blanks out
 * braces/parentheses/semicolons/commas, trims, and keeps at most 200
 * characters.
 *
 * @param {string} code - Source text to clean.
 * @returns {string} Cleaned text, at most 200 characters long.
 */
function cleanCodeForEmbedding(code) {
  // Comments go first so their contents never reach the later steps.
  const withoutBlockComments = code.replace(/\/\*[\s\S]*?\*\//g, '');
  const withoutComments = withoutBlockComments.replace(/\/\/.*$/gm, '');
  // Whitespace is collapsed before punctuation is blanked, so the blanks
  // introduced by the last replacement are kept as-is.
  const singleSpaced = withoutComments.replace(/\s+/g, ' ');
  const withoutNoise = singleSpaced.replace(/[{}();,]/g, ' ');
  return withoutNoise.trim().substring(0, 200);
}
// Common programming keywords and built-in type names, stored lowercase.
// Built once at module load instead of on every isCommonKeyword() call, which
// runs once or more per identifier during keyword extraction.
const COMMON_KEYWORD_SET = new Set([
  'const', 'let', 'var', 'function', 'class', 'if', 'else', 'for', 'while',
  'return', 'import', 'export', 'from', 'async', 'await', 'try', 'catch',
  'throw', 'new', 'this', 'super', 'extends', 'implements', 'interface',
  'type', 'enum', 'public', 'private', 'protected', 'static', 'readonly',
  'true', 'false', 'null', 'undefined', 'void', 'any', 'string', 'number',
  'boolean', 'object', 'array', 'map', 'set', 'date', 'error', 'promise'
]);

/**
 * Check if a word is a common programming keyword (case-insensitive).
 *
 * @param {string} word - Word to test.
 * @returns {boolean} true when the lowercased word is in the keyword set.
 */
function isCommonKeyword(word) {
  return COMMON_KEYWORD_SET.has(word.toLowerCase());
}
/**
 * Produce an embedding through the globally selected embedding engine.
 *
 * @param {string} text - Text to embed when no chunk is supplied.
 * @param {object|null} [chunk=null] - When given, the text is derived from the
 *   chunk via prepareTextForEmbedding() and `text` is ignored.
 * @returns {Promise<*>} Whatever the engine's embed() resolves to, or null
 *   when no engine is set or embedding fails (the failure is logged).
 */
async function generateEmbedding(text, chunk = null) {
  if (!embeddingEngineRef) {
    return null;
  }
  try {
    // Chunks get the compact preparation; plain text is used verbatim.
    const input = chunk ? prepareTextForEmbedding(chunk) : text;
    return await embeddingEngineRef.embed(input);
  } catch (failure) {
    console.log(`[DEBUG] Embedding generation failed: ${failure.message}`);
    return null;
  }
}
/**
 * Save the in-memory chunk index to disk as JSON.
 *
 * The target directory is created first when it does not exist, so saving to
 * a directory that was never set up no longer fails silently.
 *
 * @param {string} [indexDir=INDEX_DIR] - Directory that receives INDEX_FILE.
 * @returns {Promise<boolean>} true when the file was written, false on any error.
 */
async function saveIndex(indexDir = INDEX_DIR) {
  try {
    // No-op when the directory already exists.
    await fs.mkdir(indexDir, { recursive: true });
    // Save code chunks
    const indexPath = path.join(indexDir, INDEX_FILE);
    await fs.writeFile(indexPath, JSON.stringify(codeChunks));
    return true;
  } catch (error) {
    return false;
  }
}
/**
 * Synchronize the in-memory index with the files found under `folders`.
 *
 * Every chunk extracted from the gathered files is compared with the indexed
 * chunk of the same id: chunks that are new or whose mtime differs are
 * (re)embedded and stored, and indexed chunks whose id was not seen in this
 * run are removed. The index is saved only when something changed.
 *
 * `codeChunks` and `chunkIds` are kept as parallel arrays (chunkIds[i] is the
 * id of codeChunks[i]) and are mutated in place.
 *
 * @param {string[]|string} folders - Folder(s) to scan; a single path string
 *   is accepted and treated as a one-element list.
 * @param {string[]} [exts=DEFAULT_EXTS] - Passed through to gatherFiles.
 * @param {string[]} [ignores=DEFAULT_IGNORES] - Passed through to gatherFiles.
 * @returns {Promise<{total: number, new: number, deleted: number}>} Index size
 *   after the sync, number of added/updated chunks, number of removed chunks.
 */
export async function syncIndex(folders, exts = DEFAULT_EXTS, ignores = DEFAULT_IGNORES) {
  if (!isInitialized) {
    await initialize();
  }

  // A bare string would otherwise be iterated one character at a time.
  const folderList = typeof folders === 'string' ? [folders] : folders;

  // Gather all files
  const files = [];
  for (const folder of folderList) {
    const folderFiles = await gatherFiles(folder, exts, ignores);
    for (const file of folderFiles) {
      files.push(file);
    }
  }

  // id -> position of its first occurrence, so lookups are O(1) instead of
  // an indexOf() scan per chunk.
  const positionById = new Map();
  chunkIds.forEach((id, position) => {
    if (!positionById.has(id)) {
      positionById.set(id, position);
    }
  });

  const seenChunkIds = new Set();
  const changedChunks = [];
  const chunksNeedingEmbeddings = [];

  // First pass: extract all chunks and identify what is new or modified
  for (const file of files) {
    try {
      const fileChunks = await extractChunks(file);
      for (const chunk of fileChunks) {
        seenChunkIds.add(chunk.id);
        const position = positionById.get(chunk.id);
        if (position !== undefined && codeChunks[position].mtime === chunk.mtime) {
          continue; // unchanged since it was indexed
        }
        changedChunks.push(chunk);
        if (embeddingEngineRef) {
          chunksNeedingEmbeddings.push(chunk);
        } else {
          chunk.embedding = null;
        }
      }
    } catch (error) {
      // Deliberately best effort: one unreadable file must not abort the sync.
      // NOTE(review): ids of a file that fails here are never marked as seen,
      // so its previously indexed chunks are removed below.
    }
  }

  // Second pass: generate embeddings in small concurrent batches
  if (embeddingEngineRef && chunksNeedingEmbeddings.length > 0) {
    const BATCH_SIZE = 20;
    for (let start = 0; start < chunksNeedingEmbeddings.length; start += BATCH_SIZE) {
      const batch = chunksNeedingEmbeddings.slice(start, start + BATCH_SIZE);
      await Promise.all(batch.map(async (chunk) => {
        // Prepare the text once and pass only the text; handing the chunk to
        // generateEmbedding as well would make it prepare the text again.
        const text = prepareTextForEmbedding(chunk);
        chunk.embedding = await generateEmbedding(text);
      }));
    }
  }

  // Indexed chunks whose id did not show up in this run are stale
  const deletedCount = codeChunks.reduce(
    (count, chunk) => (seenChunkIds.has(chunk.id) ? count : count + 1),
    0
  );

  if (changedChunks.length > 0 || deletedCount > 0) {
    // Drop stale chunks by compacting both arrays in place in a single pass,
    // rebuilding the id -> position lookup for the surviving entries.
    positionById.clear();
    const previousLength = codeChunks.length;
    let kept = 0;
    for (let read = 0; read < previousLength; read++) {
      const chunk = codeChunks[read];
      if (!seenChunkIds.has(chunk.id)) {
        continue;
      }
      codeChunks[kept] = chunk;
      chunkIds[kept] = chunk.id;
      if (!positionById.has(chunk.id)) {
        positionById.set(chunk.id, kept);
      }
      kept++;
    }
    codeChunks.length = kept;
    chunkIds.length = kept;

    // Replace modified chunks in place and append new ones
    for (const chunk of changedChunks) {
      const position = positionById.get(chunk.id);
      if (position !== undefined) {
        codeChunks[position] = chunk;
      } else {
        positionById.set(chunk.id, codeChunks.length);
        codeChunks.push(chunk);
        chunkIds.push(chunk.id);
      }
    }

    // Save the updated index
    await saveIndex();
  }

  return {
    total: codeChunks.length,
    new: changedChunks.length,
    deleted: deletedCount
  };
}
/**
 * Score how well a chunk matches a natural-language or identifier query.
 *
 * Contributions, in order: exact name match (2.0, counted once), otherwise
 * partial name matches; a documentation bonus; code-context and intent scores
 * from the helper functions. The sum is multiplied by the type boost and
 * capped at 2.0.
 *
 * @param {string} query - The user's search query.
 * @param {object} chunk - Indexed chunk; reads `code`, `name`,
 *   `qualifiedName`, `doc` and `type`.
 * @returns {number} Relevance score in the range [0, 2.0].
 */
function textMatchScore(query, chunk) {
  // Normalize query and text
  const normalizedQuery = query.toLowerCase();
  const normalizedCode = chunk.code ? chunk.code.toLowerCase() : '';
  const normalizedName = chunk.name ? chunk.name.toLowerCase() : '';
  const normalizedQualifiedName = chunk.qualifiedName ? chunk.qualifiedName.toLowerCase() : '';
  const normalizedDoc = chunk.doc ? chunk.doc.toLowerCase() : '';
  let score = 0;

  // Empty variants are dropped: '' would "exactly" match every unnamed chunk
  // and is a substring of every name.
  const queryVariants = generateQueryVariants(query).filter((variant) => variant.length > 0);
  const nameVariants = new Set(generateNameVariants(chunk.name || ''));

  // Exact match (highest priority). Counted once no matter how many spellings
  // of the query hit the name, so the type boost below still differentiates.
  if (queryVariants.some((variant) => nameVariants.has(variant))) {
    score += 2.0;
  }

  // Partial name matches with intelligent scoring
  if (score === 0) {
    for (const queryVariant of queryVariants) {
      if (normalizedName.includes(queryVariant) || normalizedQualifiedName.includes(queryVariant)) {
        // Longer matches relative to the name, and matches at the start, score higher
        const matchQuality = queryVariant.length / Math.max(normalizedName.length, normalizedQualifiedName.length);
        const isAtStart = normalizedName.startsWith(queryVariant) || normalizedQualifiedName.startsWith(queryVariant);
        score += (isAtStart ? 1.2 : 0.8) * matchQuality;
      }
    }
  }

  // Documentation and comments scoring (for natural language)
  // NOTE(review): the bonus only applies when the whole query appears verbatim
  // in the doc text, in which case every query word is found and the ratio is 1.
  if (normalizedDoc.includes(normalizedQuery)) {
    const docWords = normalizedDoc.split(/\s+/);
    const queryWords = normalizedQuery.split(/\s+/);
    const matchRatio = queryWords.filter(word => docWords.some(docWord => docWord.includes(word))).length / queryWords.length;
    score += 0.8 * matchRatio;
  }

  // Code context scoring
  score += calculateCodeContextScore(normalizedQuery, normalizedCode, chunk);

  // Natural language intent scoring
  score += calculateIntentScore(query, chunk);

  // Type-specific boosting
  score *= getTypeBoost(query, chunk.type);

  return Math.min(score, 2.0);
}
/**
 * Build the list of lower-cased spellings a query is matched under.
 *
 * Order of the result (duplicates removed, first occurrence kept): the query
 * itself, the query with whitespace removed, the snake_case and kebab-case
 * joins, then each individual word of at least two characters.
 *
 * @param {string} query - The raw search query.
 * @returns {string[]} Unique lower-cased variants.
 */
function generateQueryVariants(query) {
  const lowered = query.toLowerCase();
  const joinWordsWith = (separator) => lowered.replace(/\s+/g, separator);

  const variants = new Set([
    lowered,
    // The camelCase form is compared lower-cased, which leaves just the
    // query with its whitespace stripped.
    query.replace(/\s+/g, '').toLowerCase(),
    joinWordsWith('_'),
    joinWordsWith('-'),
  ]);

  for (const word of lowered.split(/\s+/)) {
    if (word.length >= 2) {
      variants.add(word);
    }
  }

  return Array.from(variants);
}
/**
 * Build the lower-cased spellings of an identifier used for exact matching:
 * the name itself plus versions with camelCase humps, underscores and hyphens
 * each turned into spaces (one kind of separator per variant).
 *
 * @param {string} name - Identifier to expand; falsy input yields [''].
 * @returns {string[]} Unique lower-cased variants, in the order listed above.
 */
function generateNameVariants(name) {
  if (!name) {
    return [''];
  }

  const spellings = [
    name,
    name.replace(/([a-z])([A-Z])/g, '$1 $2'),
    name.replace(/_/g, ' '),
    name.replace(/-/g, ' '),
  ].map((spelling) => spelling.toLowerCase());

  return Array.from(new Set(spellings));
}
/**
 * Score how strongly a chunk's code text relates to the query.
 *
 * Adds 0.4 when the code contains the query verbatim, plus up to 0.05 per
 * query word based on the share of code identifiers that contain that word
 * (or are contained in it).
 *
 * @param {string} query - Query text (callers pass it lower-cased).
 * @param {string} code - Code text (callers pass it lower-cased).
 * @param {object} chunk - Unused; kept for interface compatibility.
 * @returns {number} Score contribution; 0 for blank or non-string input.
 */
function calculateCodeContextScore(query, code, chunk) {
  // A blank query is a substring of everything and would earn the full
  // inclusion bonus for every chunk.
  if (typeof query !== 'string' || typeof code !== 'string' || query.trim() === '') {
    return 0;
  }

  let score = 0;

  // Direct code inclusion
  if (code.includes(query)) {
    score += 0.4;
  }

  // Lower-case each word once up front instead of for every word pair.
  const queryWords = query
    .split(/\s+/)
    .filter((word) => word.length >= 2)
    .map((word) => word.toLowerCase());
  const codeWords = code
    .split(/[^a-zA-Z0-9_$]/)
    .filter((word) => word.length >= 2)
    .map((word) => word.toLowerCase());

  for (const queryWord of queryWords) {
    let matchCount = 0;
    for (const codeWord of codeWords) {
      if (codeWord.includes(queryWord) || queryWord.includes(codeWord)) {
        matchCount++;
      }
    }
    if (matchCount > 0) {
      // The share of matching identifiers is capped so that one query word
      // contributes at most 0.05.
      score += 0.1 * Math.min(matchCount / codeWords.length, 0.5);
    }
  }

  return score;
}
/**
 * Score a chunk against the action and domain hinted at by the query.
 *
 * For every action group the query mentions: +0.3 when the chunk is a
 * function whose name contains the group's label, +0.2 when the chunk's code
 * contains any word of the group. For every domain group the query mentions:
 * +0.15 when the file path contains the domain label or the code contains any
 * word of the group. All checks are lower-cased substring tests.
 *
 * @param {string} query - The raw search query.
 * @param {object} chunk - Indexed chunk; reads `type`, `name`, `code`, `file`.
 * @returns {number} Sum of the bonuses above.
 */
function calculateIntentScore(query, chunk) {
  const lowerQuery = query.toLowerCase();
  const queryMentionsAny = (terms) => terms.some((term) => lowerQuery.includes(term));
  // Evaluated only after a group matched the query, like the inline lookups it replaces.
  const lowerCode = () => chunk.code?.toLowerCase() || '';

  // Action intent patterns
  const actionGroups = [
    ['find', ['find', 'search', 'get', 'retrieve', 'fetch']],
    ['create', ['create', 'make', 'generate', 'build', 'add']],
    ['update', ['update', 'modify', 'change', 'edit', 'set']],
    ['delete', ['delete', 'remove', 'destroy', 'clear']],
    ['validate', ['validate', 'check', 'verify', 'test']],
    ['parse', ['parse', 'process', 'analyze', 'extract']],
    ['format', ['format', 'transform', 'convert', 'serialize']],
  ];

  // Domain-specific patterns
  const domainGroups = [
    ['api', ['api', 'endpoint', 'request', 'response', 'http']],
    ['data', ['data', 'model', 'schema', 'database', 'sql']],
    ['ui', ['ui', 'component', 'render', 'display', 'view']],
    ['util', ['util', 'helper', 'utility', 'common', 'shared']],
  ];

  let total = 0;

  for (const [intent, terms] of actionGroups) {
    if (!queryMentionsAny(terms)) {
      continue;
    }
    // Boost functions whose name carries the intent
    const isIntentNamedFunction =
      chunk.type === 'function' && chunk.name && chunk.name.toLowerCase().includes(intent);
    if (isIntentNamedFunction) {
      total += 0.3;
    }
    // Boost code that uses related vocabulary
    const code = lowerCode();
    if (terms.some((term) => code.includes(term))) {
      total += 0.2;
    }
  }

  for (const [domain, terms] of domainGroups) {
    if (!queryMentionsAny(terms)) {
      continue;
    }
    const fileName = chunk.file?.toLowerCase() || '';
    const code = lowerCode();
    if (fileName.includes(domain) || terms.some((term) => code.includes(term))) {
      total += 0.15;
    }
  }

  return total;
}
/**
 * Multiplier applied to a chunk's score depending on its type.
 *
 * If the query names a kind of symbol ("function", "class", "method",
 * "variable", "file") and the chunk is of the corresponding type, the
 * explicit boost wins. Otherwise a default weight per type is used, and
 * unknown types get 1.0.
 *
 * @param {string} query - The raw search query.
 * @param {string} type - The chunk's type.
 * @returns {number} Score multiplier.
 */
function getTypeBoost(query, type) {
  const lowerQuery = query.toLowerCase();

  // [word in query, chunk type it refers to, boost] -- first hit wins
  const explicitRequests = [
    ['function', 'function', 1.3],
    ['class', 'class', 1.3],
    ['method', 'method', 1.3],
    ['variable', 'property', 1.2],
    ['file', 'file', 1.2],
  ];
  const requested = explicitRequests.find(
    ([word, chunkType]) => type === chunkType && lowerQuery.includes(word)
  );
  if (requested) {
    return requested[2];
  }

  // Default weights by how likely a type is the search target
  const defaultBoosts = new Map([
    ['function', 1.1], // Functions are often what users search for
    ['class', 1.05],
    ['method', 1.0],
    ['export', 0.95],
    ['property', 0.9],
    ['import', 0.8],
    ['file', 0.7], // Files are less likely to be direct search targets
  ]);
  return defaultBoosts.has(type) ? defaultBoosts.get(type) : 1.0;
}
// Main search function that tools expect
export async function searchSemantic(query, options = {}) {
const {
workingDirectory = process.cwd(),
topK = 8,
extensions = DEFAULT_EX