mcp-repl
MCP REPL with code execution, semantic code search, and comprehensive ast-grep integration
JavaScript
// ARM64-compatible vector search using transformers.js
import fs from 'fs/promises';
import { existsSync, readFileSync, writeFileSync, mkdirSync, readdirSync } from 'fs';
import path from 'path';
import { pipeline } from '@xenova/transformers';
import ignore from 'ignore';
// Configuration constants
const INDEX_DIR = './code_search_index';
const DEFAULT_MODEL = 'Xenova/all-MiniLM-L6-v2';
const DEFAULT_DIM = 384; // Dimension size for the chosen model
const DEFAULT_EXTS = ['js', 'ts'];
const DEFAULT_IGNORES = [
'node_modules', '.git', '.node_modules',
'dist', 'build', 'coverage', '.nyc_output',
'tmp', 'temp', '.tmp', '.cache', '.parcel-cache',
'.next', '.nuxt', '.vuepress', '.docusaurus',
'public', 'static', 'assets', 'images', 'img',
'.vscode', '.idea', '.DS_Store', 'Thumbs.db',
'out', 'output', 'generated', 'gen',
'.angular', '.react', '.svelte-kit',
'storybook-static', 'docs-build', 'build-docs',
'.vite', '.turbo', '.nx', '.swc',
'bower_components', 'jspm_packages', '.pnp',
'__tests__', '__mocks__', '__snapshots__',
'.jest', '.mocha', '.cypress', '.playwright',
'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
'.npmrc', '.yarnrc', '.pnpmrc'
];
const INDEX_FILE = 'code_index.json';
const VECTOR_INDEX_FILE = 'vector_index.json';
// Global state
let codeChunks = [];
let embeddingExtractor = null;
let isInitialized = false;
// Create robust ignore filter using the ignore library
function createIgnoreFilter(rootDir, extraIgnores = []) {
  const ig = ignore();
  // Add default ignore patterns plus any caller-supplied patterns
  ig.add(DEFAULT_IGNORES);
  if (extraIgnores.length) {
    ig.add(extraIgnores);
  }
// Find and add all .gitignore files in the directory tree
const addGitignoreFiles = (dir) => {
try {
const entries = readdirSync(dir, { withFileTypes: true });
for (const entry of entries) {
const fullPath = path.join(dir, entry.name);
if (entry.isFile() && entry.name === '.gitignore') {
try {
const content = readFileSync(fullPath, 'utf8');
ig.add(content);
} catch (error) {
// Silently handle .gitignore read errors
}
} else if (entry.isDirectory() && !entry.name.startsWith('.') && !DEFAULT_IGNORES.includes(entry.name)) {
// Recursively add .gitignore files from subdirectories
addGitignoreFiles(fullPath);
}
}
} catch (error) {
// Silently handle directory read errors
}
};
addGitignoreFiles(rootDir);
return ig;
}
// Check if a file should be indexed based on extension
function shouldIndexFile(filePath, allowedExtensions) {
const ext = path.extname(filePath).slice(1).toLowerCase();
if (!ext || !allowedExtensions.includes(ext)) {
return false;
}
// Additional filtering for known non-code files even with allowed extensions
const filename = path.basename(filePath);
const excludedFiles = [
// Minified/bundled JS
'*.min.js', '*.bundle.js', '*.pack.js',
// TypeScript definitions
'*.d.ts', '*.d.tsx',
// Source maps
'*.map', '*.css.map',
// Package files
'package.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
// Config files
'tsconfig.json', 'jsconfig.json',
// Linter configs
'.eslintrc.*', '.prettierrc.*',
// Documentation
'LICENSE*', 'README*', '*.md', 'CHANGELOG*',
// Docker
'Dockerfile*', 'docker-compose*.yml'
];
  // Convert the glob-style patterns to anchored regexes (escape regex
  // metacharacters, then expand '*' to '.*') so e.g. '*.map' does not
  // accidentally match names like 'sourcemap.js'
  return !excludedFiles.some(pattern => {
    const regex = new RegExp(
      '^' + pattern.replace(/[.+^${}()|[\]\\]/g, '\\$&').replace(/\*/g, '.*') + '$'
    );
    return regex.test(filename);
  });
}
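// Illustrative behaviour of shouldIndexFile (the file names below are assumptions):
// shouldIndexFile('src/utils.test.js', ['js', 'ts']) returns true, while
// shouldIndexFile('vendor.min.js', ['js', 'ts']) returns false because the
// basename matches the '*.min.js' exclusion pattern.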
// Initialize the embedding model
export async function initialize(indexDir = INDEX_DIR) {
if (isInitialized) return true;
try {
console.log('[DEBUG] Initializing ARM64-compatible vector search...');
// Create index directory if it doesn't exist
if (!existsSync(indexDir)) {
mkdirSync(indexDir, { recursive: true });
}
// Initialize embedding extractor with ARM64-safe configuration
if (!embeddingExtractor) {
console.log('[DEBUG] Loading embedding model...');
embeddingExtractor = await pipeline('feature-extraction', DEFAULT_MODEL, {
device: 'wasm' // ARM64-safe configuration
});
console.log('[DEBUG] Embedding model loaded successfully');
}
isInitialized = true;
console.log('[DEBUG] Vector search initialized successfully');
return true;
} catch (error) {
console.error(`[DEBUG] Vector search initialization failed: ${error.message}`);
return false;
}
}
// Process code files into chunks
function processCodeIntoChunks(content, filePath) {
const chunks = [];
const lines = content.split('\n');
// Split into logical chunks (functions, classes, blocks)
let currentChunk = '';
let inFunction = false;
let inClass = false;
let braceCount = 0;
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
const trimmedLine = line.trim();
// Start of function or class
if (trimmedLine.match(/^(function|class)\s+\w/)) {
if (currentChunk.trim()) {
chunks.push({
content: currentChunk.trim(),
file: filePath,
startLine: Math.max(0, i - currentChunk.split('\n').length),
endLine: i
});
}
currentChunk = line;
      // Count both opening and closing braces on the declaration line so a
      // one-line function/class does not leave braceCount permanently unbalanced
      braceCount = (line.match(/{/g) || []).length - (line.match(/}/g) || []).length;
inFunction = trimmedLine.startsWith('function');
inClass = trimmedLine.startsWith('class');
} else {
currentChunk += '\n' + line;
// Track braces for proper chunking
braceCount += (line.match(/{/g) || []).length;
braceCount -= (line.match(/}/g) || []).length;
// End chunk when brace count reaches zero
if (braceCount === 0 && (inFunction || inClass)) {
chunks.push({
content: currentChunk.trim(),
file: filePath,
startLine: Math.max(0, i - currentChunk.split('\n').length),
endLine: i
});
currentChunk = '';
inFunction = false;
inClass = false;
}
}
}
// Add remaining content
if (currentChunk.trim()) {
chunks.push({
content: currentChunk.trim(),
file: filePath,
startLine: Math.max(0, lines.length - currentChunk.split('\n').length),
endLine: lines.length - 1
});
}
return chunks;
}
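// Illustrative output of processCodeIntoChunks (the input file is an assumption):
// for a file whose entire content is `function add(a, b) { return a + b; }`,
// the result is a single chunk roughly of the form
//   { content: 'function add(a, b) { return a + b; }', file: 'src/math.js', startLine: 0, endLine: 0 }
// where 'src/math.js' is the filePath passed in.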
// Synchronize the index with the file system
export async function syncIndex(folders, exts = DEFAULT_EXTS, ignores = DEFAULT_IGNORES) {
if (!isInitialized) {
await initialize();
}
const files = [];
  const ignoreFilter = createIgnoreFilter(process.cwd(), ignores);
for (const folder of folders) {
await scanDirectory(folder, ignoreFilter, files, exts);
}
// Process files into chunks
const newChunks = [];
for (const file of files) {
try {
const content = await fs.readFile(file, 'utf8');
const chunks = processCodeIntoChunks(content, file);
newChunks.push(...chunks);
} catch (error) {
console.error(`Error reading file ${file}:`, error);
}
}
codeChunks = newChunks;
console.log(`[DEBUG] Indexed ${codeChunks.length} code chunks`);
// Save index
const indexData = {
timestamp: Date.now(),
chunks: codeChunks.map(c => ({
file: c.file,
content: c.content,
startLine: c.startLine,
endLine: c.endLine
}))
};
writeFileSync(path.join(INDEX_DIR, INDEX_FILE), JSON.stringify(indexData, null, 2));
return codeChunks.length;
}
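// Illustrative contents of code_search_index/code_index.json (values are assumptions):
// {
//   "timestamp": 1700000000000,
//   "chunks": [
//     { "file": "/abs/path/src/app.js", "content": "function start() {...}", "startLine": 12, "endLine": 40 }
//   ]
// }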
// Helper function to scan directories with proper .gitignore support
async function scanDirectory(dir, ignoreFilter, files, exts) {
try {
const entries = await fs.readdir(dir, { withFileTypes: true });
for (const entry of entries) {
const fullPath = path.join(dir, entry.name);
const relativePath = path.relative(process.cwd(), fullPath);
      // Check if the file/directory should be ignored using the ignore library;
      // it only accepts cwd-relative paths, so skip the check for anything outside cwd
      if (relativePath && !relativePath.startsWith('..') && ignoreFilter.ignores(relativePath)) {
        continue;
      }
if (entry.isDirectory()) {
// Recursively scan subdirectories
await scanDirectory(fullPath, ignoreFilter, files, exts);
} else if (entry.isFile()) {
// Check if this file should be indexed based on extension and content type
if (shouldIndexFile(fullPath, exts)) {
// Check file size - skip files larger than 200KB
try {
const stat = await fs.stat(fullPath);
if (stat.size <= 200 * 1024) { // 200KB limit
files.push(fullPath);
}
} catch (error) {
// Skip files we can't stat
}
}
}
}
} catch (error) {
// Skip directories we can't read
}
}
// Query the index with semantic search
export async function queryIndex(query, topK = 8) {
if (!isInitialized) {
await initialize();
}
if (codeChunks.length === 0) {
return [];
}
try {
// Generate query embedding
const queryEmbedding = await embeddingExtractor(query, {
pooling: 'mean',
normalize: true
});
    // Calculate similarity with all chunks (note: chunk embeddings are
    // recomputed on every query; nothing is cached between queries)
const results = [];
for (let i = 0; i < codeChunks.length; i++) {
const chunk = codeChunks[i];
try {
const chunkEmbedding = await embeddingExtractor(chunk.content, {
pooling: 'mean',
normalize: true
});
// Calculate cosine similarity
const similarity = calculateCosineSimilarity(queryEmbedding.data, chunkEmbedding.data);
results.push({
file: chunk.file,
content: chunk.content,
startLine: chunk.startLine,
endLine: chunk.endLine,
similarity: similarity
});
} catch (error) {
// Skip chunks that can't be embedded
}
}
// Sort by similarity and return topK results
return results
.sort((a, b) => b.similarity - a.similarity)
.slice(0, topK)
.map(r => ({
file: r.file,
content: r.content,
startLine: r.startLine,
endLine: r.endLine,
score: r.similarity
}));
} catch (error) {
console.error('[DEBUG] Query failed:', error);
return [];
}
}
// Calculate cosine similarity between two vectors
function calculateCosineSimilarity(vecA, vecB) {
if (vecA.length !== vecB.length) return 0;
let dotProduct = 0;
let normA = 0;
let normB = 0;
for (let i = 0; i < vecA.length; i++) {
dotProduct += vecA[i] * vecB[i];
normA += vecA[i] * vecA[i];
normB += vecB[i] * vecB[i];
}
  const denominator = Math.sqrt(normA) * Math.sqrt(normB);
  // Guard against zero vectors to avoid returning NaN
  return denominator === 0 ? 0 : dotProduct / denominator;
}
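// Worked example (assumed vectors): for a = [1, 0] and b = [0.6, 0.8],
// the dot product is 0.6 and both norms are 1, so
// calculateCosineSimilarity(a, b) returns 0.6 (up to floating-point rounding).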
// Search with enhanced natural language support
export async function searchCode(query, workingDirectory, folders = ['.'], extensions = DEFAULT_EXTS, topK = 8) {
try {
// Validate working directory
if (!existsSync(workingDirectory)) {
console.error(`Working directory does not exist: ${workingDirectory}`);
return [];
}
// Initialize if needed
const initPromise = isInitialized ? Promise.resolve() : initialize();
await Promise.race([
initPromise,
new Promise((_, reject) => setTimeout(() => reject(new Error('Initialization timeout')), 10000))
]);
// Convert folder paths to absolute paths
const absFolders = folders.map(f => path.resolve(workingDirectory, f));
// Sync index and get results
await syncIndex(absFolders, extensions);
return await queryIndex(query, topK);
} catch (error) {
console.error('[DEBUG] Search failed:', error);
return [];
}
}
export async function searchSemantic(query, options = {}) {
const { workingDirectory, folders = ['.'], extensions = DEFAULT_EXTS, topK = 8 } = options;
return await searchCode(query, workingDirectory, folders, extensions, topK);
}
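// Example usage (illustrative sketch; the import path, query, and folder names
// below are assumptions, not part of this module):
//
//   import { searchSemantic } from './vector-search.js';
//
//   const results = await searchSemantic('where is the gitignore filter built?', {
//     workingDirectory: process.cwd(),
//     folders: ['src'],
//     extensions: ['js', 'ts'],
//     topK: 5
//   });
//   for (const r of results) {
//     console.log(`${r.file}:${r.startLine}-${r.endLine} score=${r.score.toFixed(3)}`);
//   }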