// mcp-repl
// Version: (not specified)
// MCP REPL with code execution, semantic code search, and comprehensive ast-grep integration
// 241 lines (198 loc) • 8.6 kB
// JavaScript
// Lightweight semantic search for ARM64 compatibility
// Uses NLP techniques without heavy ML dependencies
import fs from 'fs/promises';
import path from 'path';
// Simple text similarity: cosine similarity over raw term-frequency vectors (no IDF weighting is applied)
/**
 * Keyword-based code search: files are reduced to a bag of keywords and
 * ranked against a query by cosine similarity of raw term-frequency vectors
 * (no IDF weighting is applied), plus a small bonus for keyword overlap.
 */
class LightSemanticSearch {
  constructor() {
    // Indexed documents; a document's `id` is its position in this array.
    this.documents = [];
    // Relative path -> document, used to avoid indexing the same file twice.
    this.documentMap = new Map();
    // Set to true once indexFiles() has completed at least once.
    this.isInitialized = false;
  }

  /**
   * Extract lowercase keywords from source text or a query string.
   *
   * Three passes contribute keywords, and duplicates are kept on purpose so
   * that they act as term frequencies:
   *   1. names following `function`/`class`/`const`/`let`/`var`,
   *   2. the receiver of each `receiver.method(` call,
   *   3. every identifier-like word longer than two characters that is not a
   *      common English stop word and starts with a letter.
   * A declared name therefore counts once for pass 1 and again for pass 3.
   *
   * @param {string} text - Text to analyse.
   * @returns {string[]} Keywords (with repeats); empty for non-string input.
   */
  extractKeywords(text) {
    if (typeof text !== 'string' || text.length === 0) {
      return [];
    }

    const keywords = [];

    // The leading \b stops words that merely end in a keyword (e.g. "outlet x")
    // from being read as declarations.
    for (const [, name] of text.matchAll(/\b(?:function|class|const|let|var)\s+(\w+)/g)) {
      keywords.push(name.toLowerCase());
    }

    for (const [, receiver] of text.matchAll(/(\w+)\.\w+\(/g)) {
      keywords.push(receiver.toLowerCase());
    }

    const commonWords = new Set(['the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'as', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'must', 'can']);
    const words = text.toLowerCase().match(/\b[a-zA-Z_][a-zA-Z0-9_]*\b/g) || [];
    for (const word of words) {
      // /^[a-z]/ rejects identifiers that start with an underscore.
      if (word.length > 2 && !commonWords.has(word) && /^[a-z]/.test(word)) {
        keywords.push(word);
      }
    }

    return keywords;
  }

  /**
   * Cosine similarity between the term-frequency vectors of two keyword lists.
   *
   * @param {string[]} queryKeywords - Keywords of the query (repeats count).
   * @param {string[]} documentKeywords - Keywords of the document (repeats count).
   * @returns {number} Value in [0, 1]; 0 when either list is empty.
   */
  calculateSimilarity(queryKeywords, documentKeywords) {
    const countTerms = (terms) => {
      const counts = new Map();
      for (const term of terms) {
        counts.set(term, (counts.get(term) || 0) + 1);
      }
      return counts;
    };

    const queryCounts = countTerms(queryKeywords);
    const docCounts = countTerms(documentKeywords);
    if (queryCounts.size === 0 || docCounts.size === 0) {
      return 0;
    }

    // Only terms present in the query can contribute to the dot product.
    let dotProduct = 0;
    let querySquares = 0;
    for (const [term, count] of queryCounts) {
      querySquares += count * count;
      dotProduct += count * (docCounts.get(term) || 0);
    }

    let docSquares = 0;
    for (const count of docCounts.values()) {
      docSquares += count * count;
    }

    return dotProduct / (Math.sqrt(querySquares) * Math.sqrt(docSquares));
  }

  /**
   * Index every matching file under the given folders, one folder at a time.
   *
   * NOTE(review): progress is written with console.log (stdout); if this
   * module runs inside a process that uses stdout as a protocol channel,
   * confirm that is acceptable.
   *
   * @param {string[]} folders - Directories to scan.
   * @param {string[]} [exts] - File extensions to index (case-insensitive, leading dot optional).
   * @param {string[]} [ignores] - Ignore patterns, see scanDirectory().
   * @returns {Promise<void>}
   */
  async indexFiles(folders, exts = ['js', 'ts'], ignores = []) {
    console.log('[DEBUG] Starting lightweight semantic indexing...');

    for (const folder of folders) {
      await this.scanDirectory(folder, exts, ignores);
    }

    this.isInitialized = true;
    console.log(`[DEBUG] Indexed ${this.documents.length} files`);
  }

  /**
   * Recursively index the files of one directory.
   *
   * Directories whose name starts with '.' or equals 'node_modules' are never
   * entered, and entries that are neither regular files nor directories are
   * skipped. Failures are logged and otherwise ignored (best effort).
   *
   * @param {string} dir - Directory to scan.
   * @param {string[]} exts - Extensions to index (case-insensitive, leading dot optional).
   * @param {string[]} ignores - Patterns ending in '/' are matched as a path
   *   prefix, all others as a substring, against both the cwd-relative and the
   *   joined path. Empty patterns are disregarded.
   * @param {number} [depth] - Current recursion depth; scanning stops below depth 5.
   * @returns {Promise<void>}
   */
  async scanDirectory(dir, exts, ignores, depth = 0) {
    if (depth > 5) return; // Bound the recursion depth

    try {
      const wantedExts = new Set(exts.map((ext) => String(ext).replace(/^\./, '').toLowerCase()));
      // An empty pattern would match every path and silently ignore the whole tree.
      const activeIgnores = ignores.filter((pattern) => typeof pattern === 'string' && pattern.length > 0);

      const entries = await fs.readdir(dir, { withFileTypes: true });
      for (const entry of entries) {
        const fullPath = path.join(dir, entry.name);
        const relativePath = path.relative(process.cwd(), fullPath);

        const isIgnored = activeIgnores.some((pattern) => (
          pattern.endsWith('/')
            ? relativePath.startsWith(pattern) || fullPath.startsWith(pattern)
            : relativePath.includes(pattern) || fullPath.includes(pattern)
        ));
        if (isIgnored) {
          continue;
        }

        if (entry.isDirectory() && !entry.name.startsWith('.') && entry.name !== 'node_modules') {
          await this.scanDirectory(fullPath, exts, ignores, depth + 1);
          continue;
        }
        if (!entry.isFile()) {
          continue;
        }

        // A name without any dot has no extension; do not mistake a file
        // literally called "js" for a JavaScript file.
        const dotIndex = entry.name.lastIndexOf('.');
        if (dotIndex === -1) {
          continue;
        }
        const ext = entry.name.slice(dotIndex + 1).toLowerCase();
        if (!wantedExts.has(ext)) {
          continue;
        }

        try {
          const content = await fs.readFile(fullPath, 'utf8');
          const existing = this.documentMap.get(relativePath);
          const doc = {
            id: existing ? existing.id : this.documents.length,
            path: relativePath,
            fullPath,
            keywords: this.extractKeywords(content),
            content: content.substring(0, 5000), // Keep only the first 5000 characters for context
            language: ext
          };

          // Re-indexing a known path replaces its entry instead of appending a duplicate.
          if (existing) {
            this.documents[existing.id] = doc;
          } else {
            this.documents.push(doc);
          }
          this.documentMap.set(relativePath, doc);
        } catch (err) {
          console.log(`[DEBUG] Failed to index ${relativePath}:`, err.message);
        }
      }
    } catch (err) {
      console.log(`[DEBUG] Failed to scan directory ${dir}:`, err.message);
    }
  }

  /**
   * Rank the indexed documents against a query.
   *
   * Documents are ordered by `score + 0.1 * keywordOverlap`; the returned
   * `score` is the plain cosine similarity. Context snippets come from the
   * stored (truncated) content of each document.
   *
   * @param {string} query - Free-text or code query.
   * @param {number} [topK] - Maximum number of results.
   * @returns {Promise<Array<{file: string, score: number, keywordOverlap: number, context: Array, language: string}>>}
   * @throws {Error} If indexFiles() has not completed yet.
   */
  async search(query, topK = 8) {
    if (!this.isInitialized) {
      throw new Error('Search not initialized. Call indexFiles first.');
    }

    const queryKeywords = this.extractKeywords(query);
    const queryKeywordSet = new Set(queryKeywords);

    const ranked = this.documents.map((doc) => {
      const score = this.calculateSimilarity(queryKeywords, doc.keywords);
      // Counts every occurrence in the document of a keyword that is in the query.
      const keywordOverlap = doc.keywords.filter((keyword) => queryKeywordSet.has(keyword)).length;
      return { document: doc, score, keywordOverlap, rank: score + (keywordOverlap * 0.1) };
    });

    ranked.sort((a, b) => b.rank - a.rank);

    return ranked.slice(0, topK).map((result) => ({
      file: result.document.path,
      score: result.score,
      keywordOverlap: result.keywordOverlap,
      context: this.extractRelevantContext(result.document.content, queryKeywords),
      language: result.document.language
    }));
  }

  /**
   * Collect up to three snippets around lines that contain a keyword.
   *
   * Each snippet spans two lines before and two lines after the matching
   * line (clamped to the content). Snippets of neighbouring matches may overlap.
   *
   * @param {string} content - Text to search.
   * @param {string[]} keywords - Lowercase keywords to look for (case-insensitive substring match).
   * @returns {Array<{lineNumbers: number[], text: string}>} `lineNumbers` is the
   *   1-based [first, last] line of the snippet.
   */
  extractRelevantContext(content, keywords) {
    const maxContexts = 3;
    const lines = content.split('\n');
    const contexts = [];

    // Stop as soon as enough snippets were found instead of scanning everything.
    for (let i = 0; i < lines.length && contexts.length < maxContexts; i++) {
      const lowered = lines[i].toLowerCase();
      if (!keywords.some((keyword) => lowered.includes(keyword))) {
        continue;
      }

      const start = Math.max(0, i - 2);
      const end = Math.min(lines.length, i + 3); // exclusive 0-based == inclusive 1-based
      contexts.push({
        lineNumbers: [start + 1, end],
        text: lines.slice(start, end).join('\n')
      });
    }

    return contexts;
  }
}
// Module-wide search instance, created on first use
let searchInstance = null;

/**
 * Return the shared LightSemanticSearch, constructing it on the first call.
 *
 * @returns {Promise<LightSemanticSearch>} The same instance on every call.
 */
async function getLightSearchInstance() {
  if (searchInstance) {
    return searchInstance;
  }
  searchInstance = new LightSemanticSearch();
  return searchInstance;
}
// Folders/extensions/ignores the shared index was last built from (JSON-encoded).
let lastIndexSignature = null;

/**
 * Run a keyword search over a working directory and format the hits as text.
 *
 * The shared search instance is indexed on first use and re-indexed whenever
 * the resolved folders, extensions or ignore patterns differ from the ones
 * the current index was built from, so a call never returns results from a
 * different directory tree.
 *
 * @param {object} args
 * @param {string} args.query - Search query (required).
 * @param {string} args.workingDirectory - Base directory (required).
 * @param {string} [args.folders] - Comma-separated folders relative to workingDirectory; defaults to workingDirectory itself.
 * @param {string} [args.extensions] - Comma-separated extensions; defaults to js,ts.
 * @param {string} [args.ignores] - Comma-separated ignore patterns; defaults to node_modules,.git,.next,dist,build.
 * @param {number} [args.topK] - Maximum number of results; defaults to 8.
 * @returns {Promise<string>} Human-readable result listing.
 * @throws {Error} If query or workingDirectory is missing.
 */
export async function lightweightSearch(args = {}) {
  const { query, workingDirectory, folders, extensions, ignores, topK } = args || {};

  if (!query || !workingDirectory) {
    throw new Error('Query and working directory are required');
  }

  const search = await getLightSearchInstance();

  // Split a comma-separated option, dropping blank items ("a, ,b" or a trailing
  // comma); a blank ignore pattern would otherwise match every path.
  const splitList = (csv) => (csv ? String(csv).split(',').map((item) => item.trim()).filter((item) => item.length > 0) : []);

  const folderNames = splitList(folders);
  const searchFolders = folderNames.length > 0
    ? folderNames.map((folder) => path.join(workingDirectory, folder))
    : [workingDirectory];

  const extList = splitList(extensions);
  const searchExts = extList.length > 0 ? extList : ['js', 'ts'];

  const ignoreList = splitList(ignores);
  const searchIgnores = ignoreList.length > 0 ? ignoreList : ['node_modules', '.git', '.next', 'dist', 'build'];

  const searchTopK = topK || 8;

  const signature = JSON.stringify([searchFolders, searchExts, searchIgnores]);
  if (!search.isInitialized || signature !== lastIndexSignature) {
    if (search.isInitialized) {
      // The existing index was built for other settings: discard it.
      search.documents = [];
      search.documentMap = new Map();
      search.isInitialized = false;
    }
    await search.indexFiles(searchFolders, searchExts, searchIgnores);
    lastIndexSignature = signature;
  }

  const results = await search.search(query, searchTopK);

  let output = `Lightweight semantic search results for "${query}":\n\n`;
  results.forEach((result, index) => {
    output += `${index + 1}. ${result.file} (score: ${result.score.toFixed(3)}, keyword overlap: ${result.keywordOverlap})\n`;
    output += ` Language: ${result.language}\n`;
    if (result.context.length > 0) {
      output += ' Context:\n';
      for (const ctx of result.context) {
        output += ` Lines ${ctx.lineNumbers[0]}-${ctx.lineNumbers[1]}:\n`;
        output += ` ${ctx.text}\n\n`;
      }
    }
    output += '\n';
  });

  return output;
}