remcode
Version:
Turn your AI assistant into a codebase expert. Intelligent code analysis, semantic search, and software engineering guidance through MCP integration.
339 lines (338 loc) • 14.1 kB
JavaScript
// Compiler-emitted interop helper: exposes property `k` of module `m` on
// object `o` under the name `k2` (defaults to `k`). An already-installed
// `this.__createBinding` is reused when present.
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // Legacy engines: plain value copy, no live binding.
        return function (o, m, k, k2) {
            if (k2 === undefined) {
                k2 = k;
            }
            o[k2] = m[k];
        };
    }
    return function (o, m, k, k2) {
        if (k2 === undefined) {
            k2 = k;
        }
        var descriptor = Object.getOwnPropertyDescriptor(m, k);
        // Decide whether the source descriptor can be reused verbatim or must
        // be replaced by a forwarding getter.
        var needsGetter;
        if (!descriptor) {
            needsGetter = true;
        }
        else if ("get" in descriptor) {
            needsGetter = !m.__esModule;
        }
        else {
            needsGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsGetter) {
            descriptor = {
                enumerable: true,
                get: function () {
                    return m[k];
                }
            };
        }
        Object.defineProperty(o, k2, descriptor);
    };
})();
// Compiler-emitted interop helper: attaches `v` to `o` as its "default"
// export. An already-installed `this.__setModuleDefault` is reused when present.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (o, v) {
            var defaultDescriptor = { enumerable: true, value: v };
            Object.defineProperty(o, "default", defaultDescriptor);
        };
    }
    // Legacy engines: plain assignment.
    return function (o, v) {
        o["default"] = v;
    };
})();
// Compiler-emitted interop helper behind `import * as x` style requires.
// Reuses `this.__importStar` when one is already installed; otherwise builds
// the helper inside an IIFE so `ownKeys` stays private.
var __importStar = (this && this.__importStar) || (function () {
// `ownKeys` replaces itself on first call: it rebinds to
// Object.getOwnPropertyNames when available, else to a for-in fallback
// filtered by hasOwnProperty, then delegates to the chosen implementation.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
// Modules flagged with __esModule are returned untouched.
if (mod && mod.__esModule) return mod;
var result = {};
// Bind every own key except "default" onto the result via __createBinding;
// the module object itself then becomes the result's "default" export.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.VectorizationPipeline = void 0;
const logger_1 = require("../utils/logger");
const manager_1 = require("./chunkers/manager");
const manager_2 = require("./embedders/manager");
const pinecone_1 = require("./storage/pinecone");
const fs = __importStar(require("fs"));
const path = __importStar(require("path"));
const dotenv = __importStar(require("dotenv"));
// Load environment variables
dotenv.config();
const logger = (0, logger_1.getLogger)('VectorizationPipeline');
/**
 * Turn any thrown value into a printable message.
 *
 * @param {unknown} error - Value caught in a `catch` clause.
 * @returns {string} `error.message` for Error instances, `String(error)` otherwise.
 */
function describeError(error) {
    return error instanceof Error ? error.message : String(error);
}
/**
 * Extension (lower-case, with leading dot) to language name, used by
 * `createFileInfo`. Extensions not listed here are reported as 'text'.
 */
const LANGUAGE_BY_EXTENSION = Object.freeze({
    '.ts': 'typescript',
    '.js': 'javascript',
    '.jsx': 'javascript',
    '.tsx': 'typescript',
    '.py': 'python',
    '.java': 'java',
    '.go': 'go',
    '.rb': 'ruby',
    '.php': 'php',
    '.cpp': 'cpp',
    '.c': 'c',
    '.cs': 'csharp',
    '.rs': 'rust',
    '.swift': 'swift',
    '.kt': 'kotlin',
    '.scala': 'scala'
});
/**
 * Main vectorization pipeline that orchestrates chunking, embedding, and storage.
 *
 * Typical use: construct, `await initialize()`, then call `processFile`,
 * `processDirectory` or `searchSimilarCode`. Every public method except the
 * constructor throws if `initialize()` has not completed successfully.
 */
class VectorizationPipeline {
    /**
     * @param {object} options - Overrides for the defaults below. Keys read in
     *   this class: embeddingModel, fallbackModel, batchSize, maxFileSize,
     *   includeExtensions, excludeExtensions, excludePaths, chunkingStrategy,
     *   pineconeNamespace, pineconeApiKey, pineconeIndexName, huggingfaceToken.
     */
    constructor(options) {
        this.initialized = false;
        const defaultChunkingStrategy = {
            clean_modules: 'function_level',
            complex_modules: 'class_level',
            monolithic_files: 'sliding_window_with_overlap'
        };
        this.options = {
            // Default values with working models
            embeddingModel: 'microsoft/codebert-base',
            fallbackModel: 'microsoft/graphcodebert-base',
            batchSize: 5, // Reduced for stability
            maxFileSize: 1024 * 1024, // 1MB
            includeExtensions: ['.ts', '.js', '.py', '.java', '.go', '.rb', '.php', '.cpp', '.c', '.cs', '.rs'],
            excludeExtensions: ['.min.js', '.bundle.js', '.test.js', '.spec.js'],
            excludePaths: ['node_modules', '.git', 'dist', 'build', '__pycache__', '.pytest_cache'],
            pineconeNamespace: 'default',
            ...options,
            // Merge key by key so a partial override keeps the remaining
            // defaults; a shallow spread alone would leave the missing
            // strategies undefined.
            chunkingStrategy: {
                ...defaultChunkingStrategy,
                ...(options && options.chunkingStrategy)
            }
        };
        // The embedder and the index must agree on vector size, so the value
        // is defined once and passed to both.
        const embeddingDimension = 768; // CodeBERT dimension
        // Initialize managers with correct configurations
        this.chunkingManager = new manager_1.ChunkingManager(this.options.chunkingStrategy);
        this.embeddingManager = new manager_2.EmbeddingManager({
            primary: this.options.embeddingModel,
            fallback: this.options.fallbackModel,
            batchSize: this.options.batchSize,
            token: this.options.huggingfaceToken,
            dimension: embeddingDimension
        });
        this.storage = new pinecone_1.PineconeStorage({
            apiKey: this.options.pineconeApiKey,
            indexName: this.options.pineconeIndexName,
            namespace: this.options.pineconeNamespace,
            dimension: embeddingDimension,
            metric: 'cosine'
        });
    }
    /**
     * Initialize the vectorization pipeline by initializing the storage backend.
     *
     * @returns {Promise<void>}
     * @throws {Error} "Initialization failed: ..." with the original error as `cause`.
     */
    async initialize() {
        logger.info('Initializing vectorization pipeline...');
        try {
            // Initialize Pinecone storage
            await this.storage.initialize();
            this.initialized = true;
            logger.info('Vectorization pipeline initialized successfully');
        }
        catch (error) {
            const errorMessage = describeError(error);
            logger.error(`Failed to initialize vectorization pipeline: ${errorMessage}`);
            throw new Error(`Initialization failed: ${errorMessage}`, { cause: error });
        }
    }
    /**
     * Process a single file: read, chunk, embed and store it.
     *
     * @param {string} filePath - Path of the file to read.
     * @param {string} [relativePath] - Path recorded as `relative_path`;
     *   defaults to the file's basename.
     * @returns {Promise<Array>} All chunks returned by the embedding manager,
     *   including any without an `embedding` (those are not stored).
     * @throws {Error} "File processing failed: ..." with the original error as `cause`.
     */
    async processFile(filePath, relativePath) {
        if (!this.initialized) {
            throw new Error('Pipeline not initialized. Call initialize() first.');
        }
        logger.debug(`Processing file: ${filePath}`);
        try {
            // 1. Read file content
            const content = await fs.promises.readFile(filePath, 'utf8');
            // 2. Create file info
            const fileInfo = this.createFileInfo(filePath, relativePath);
            // 3. Determine chunking strategy based on file characteristics
            const strategy = this.determineChunkingStrategy(content, fileInfo);
            // 4. Chunk the file
            const chunks = await this.chunkingManager.chunkFile(content, strategy, fileInfo);
            if (chunks.length === 0) {
                logger.warn(`No chunks created for file: ${filePath}`);
                return [];
            }
            logger.debug(`Created ${chunks.length} chunks for file: ${filePath}`);
            // 5. Generate embeddings for chunks
            const embeddedChunks = await this.embeddingManager.embedChunks(chunks);
            // 6. Store vectors in Pinecone
            // One timestamp per file so all of its chunks carry the same value.
            const processedAt = new Date().toISOString();
            const vectorData = embeddedChunks
                .filter(chunk => chunk.embedding) // Only store chunks with embeddings
                .map(chunk => ({
                    embedding: chunk.embedding,
                    metadata: {
                        ...chunk.metadata,
                        content: chunk.content, // Store the actual content in metadata
                        processed_at: processedAt
                    }
                }));
            if (vectorData.length > 0) {
                await this.storage.storeVectors(vectorData);
                logger.debug(`Stored ${vectorData.length} vectors for file: ${filePath}`);
            }
            logger.info(`Successfully processed file: ${filePath} (${chunks.length} chunks, ${vectorData.length} stored)`);
            return embeddedChunks;
        }
        catch (error) {
            const errorMessage = describeError(error);
            logger.error(`Error processing file ${filePath}: ${errorMessage}`);
            throw new Error(`File processing failed: ${errorMessage}`, { cause: error });
        }
    }
    /**
     * Search for similar code using vector similarity.
     *
     * @param {string} query - Text to embed and search with.
     * @param {number} [topK=10] - Forwarded to the storage query.
     * @param {object} [filter] - Forwarded to the storage query.
     * @returns {Promise<Array>} Whatever `storage.queryVectors` resolves to.
     * @throws {Error} "Search failed: ..." with the original error as `cause`.
     */
    async searchSimilarCode(query, topK = 10, filter) {
        if (!this.initialized) {
            throw new Error('Pipeline not initialized. Call initialize() first.');
        }
        try {
            // Only mark the logged query as truncated when it actually is.
            const preview = query.length > 50 ? `${query.substring(0, 50)}...` : query;
            logger.info(`Searching for similar code with query: "${preview}"`);
            // 1. Generate embedding for the query
            const queryChunk = {
                content: query,
                metadata: {
                    file_path: '',
                    strategy: 'query',
                    chunk_type: 'query'
                }
            };
            const embeddedQuery = await this.embeddingManager.embedChunks([queryChunk]);
            // Guard against an empty result as well as a missing embedding.
            const embeddedFirst = embeddedQuery[0];
            if (!embeddedFirst || !embeddedFirst.embedding) {
                throw new Error('Failed to generate embedding for query');
            }
            // 2. Search Pinecone for similar vectors
            const results = await this.storage.queryVectors(embeddedFirst.embedding, topK, filter, this.options.pineconeNamespace);
            logger.info(`Found ${results.length} similar code matches`);
            return results;
        }
        catch (error) {
            const errorMessage = describeError(error);
            logger.error(`Error searching for similar code: ${errorMessage}`);
            throw new Error(`Search failed: ${errorMessage}`, { cause: error });
        }
    }
    /**
     * Get storage statistics for the configured namespace.
     *
     * @returns {Promise<*>} Whatever `storage.getIndexStats` resolves to.
     */
    async getStats() {
        if (!this.initialized) {
            throw new Error('Pipeline not initialized. Call initialize() first.');
        }
        return await this.storage.getIndexStats(this.options.pineconeNamespace);
    }
    /**
     * Delete vectors for a specific file (useful for incremental updates).
     *
     * @param {string} filePath - Value matched against the `file_path` metadata field.
     * @returns {Promise<*>} Whatever `storage.deleteVectorsByMetadata` resolves to.
     * @throws {Error} "Vector deletion failed: ..." with the original error as `cause`.
     */
    async deleteFileVectors(filePath) {
        if (!this.initialized) {
            throw new Error('Pipeline not initialized. Call initialize() first.');
        }
        try {
            return await this.storage.deleteVectorsByMetadata({ file_path: filePath }, this.options.pineconeNamespace);
        }
        catch (error) {
            const errorMessage = describeError(error);
            logger.error(`Error deleting vectors for file ${filePath}: ${errorMessage}`);
            throw new Error(`Vector deletion failed: ${errorMessage}`, { cause: error });
        }
    }
    // Private helper methods
    /**
     * Build the file descriptor handed to the chunker.
     *
     * @param {string} filePath - File to stat.
     * @param {string} [relativePath] - Defaults to the file's basename.
     * @returns {{file_path: string, relative_path: string, language: string, size: number, extension: string}}
     */
    createFileInfo(filePath, relativePath) {
        // NOTE(review): synchronous stat inside an async pipeline; kept because
        // this helper's return value is used synchronously.
        const stats = fs.statSync(filePath);
        const extension = path.extname(filePath).toLowerCase();
        return {
            file_path: filePath,
            relative_path: relativePath || path.basename(filePath),
            // Simple language detection based on extension
            language: LANGUAGE_BY_EXTENSION[extension] || 'text',
            size: stats.size,
            extension: extension.substring(1) // Remove the dot
        };
    }
    /**
     * Pick one of the configured chunking strategies from line count and
     * estimated complexity.
     *
     * @param {string} content - File content.
     * @param {object} fileInfo - Currently unused.
     * @returns {string} One of the values in `options.chunkingStrategy`.
     */
    determineChunkingStrategy(content, fileInfo) {
        const lines = content.split('\n').length;
        const complexity = this.estimateComplexity(content);
        // Simple heuristics for strategy selection
        if (complexity === 'low' && lines < 200) {
            return this.options.chunkingStrategy.clean_modules;
        }
        else if (complexity === 'high' || lines > 1000) {
            return this.options.chunkingStrategy.monolithic_files;
        }
        else {
            return this.options.chunkingStrategy.complex_modules;
        }
    }
    /**
     * Rough complexity bucket based on keyword, bracket and line counts.
     *
     * score = 2 * keyword matches + 0.1 * opening brackets + 0.01 * lines;
     * below 10 is 'low', below 50 is 'medium', anything else is 'high'.
     *
     * @param {string} content - File content.
     * @returns {'low'|'medium'|'high'}
     */
    estimateComplexity(content) {
        const lines = content.split('\n').length;
        const functions = (content.match(/function|def |class |const |let |var /g) || []).length;
        const nesting = (content.match(/\{|\[|\(/g) || []).length;
        const complexityScore = (functions * 2) + (nesting * 0.1) + (lines * 0.01);
        if (complexityScore < 10)
            return 'low';
        if (complexityScore < 50)
            return 'medium';
        return 'high';
    }
    /**
     * Process all files in a directory recursively.
     *
     * Per-file failures are collected in `errors` and do not stop the run.
     *
     * @param {string} dirPath - Root directory to scan.
     * @returns {Promise<{success: boolean, filesProcessed: number, chunksCreated: number, vectorsStored: number, errors: string[], duration: number}>}
     *   `success` is true when at least one discovered file was processed
     *   without error; it is false when no files were found.
     */
    async processDirectory(dirPath) {
        if (!this.initialized) {
            throw new Error('Pipeline not initialized. Call initialize() first.');
        }
        const startTime = Date.now();
        const result = {
            success: false,
            filesProcessed: 0,
            chunksCreated: 0,
            vectorsStored: 0,
            errors: [],
            duration: 0
        };
        try {
            // Find all code files
            const files = await this.findCodeFiles(dirPath);
            for (const filePath of files) {
                try {
                    // Pass the path relative to the scanned root so files that
                    // share a basename in different folders stay distinguishable.
                    const chunks = await this.processFile(filePath, path.relative(dirPath, filePath));
                    result.filesProcessed++;
                    result.chunksCreated += chunks.length;
                    // processFile only stores chunks that carry an embedding.
                    result.vectorsStored += chunks.filter(chunk => chunk && chunk.embedding).length;
                }
                catch (error) {
                    const errorMsg = `Failed to process ${filePath}: ${describeError(error)}`;
                    result.errors.push(errorMsg);
                    logger.warn(errorMsg, error instanceof Error ? error : undefined);
                }
            }
            result.success = result.errors.length < files.length;
            result.duration = Date.now() - startTime;
            return result;
        }
        catch (error) {
            const errorMsg = `Directory processing failed: ${describeError(error)}`;
            logger.error(errorMsg);
            result.errors.push(errorMsg);
            result.duration = Date.now() - startTime;
            return result;
        }
    }
    /**
     * Find all code files in a directory recursively.
     *
     * A file is returned when its extension (case-insensitive) is in
     * `includeExtensions`, its name does not end with any `excludeExtensions`
     * entry, and its size does not exceed `maxFileSize` (when that is a
     * number). A directory is skipped when its path relative to `dirPath`
     * equals, or ends with, one of the `excludePaths` entries on whole path
     * segments. Entries that cannot be stat'ed are skipped with a warning.
     *
     * @param {string} dirPath - Root directory to scan.
     * @returns {Promise<string[]>} Paths built by joining `dirPath` with the
     *   discovered relative locations.
     */
    async findCodeFiles(dirPath) {
        const files = [];
        // Normalise patterns to '/'-joined segments; empty patterns are dropped
        // because they would otherwise match everything.
        const excludePatterns = (this.options.excludePaths || [])
            .map(pattern => String(pattern).split(/[\\/]+/).filter(Boolean).join('/'))
            .filter(pattern => pattern.length > 0);
        const includeExtensions = (this.options.includeExtensions || ['.ts', '.js', '.py'])
            .map(ext => ext.toLowerCase());
        const excludeExtensions = (this.options.excludeExtensions || [])
            .map(ext => ext.toLowerCase());
        const maxFileSize = this.options.maxFileSize;
        // Match against the path relative to the scan root so that ancestors of
        // the root (e.g. a root living under ".../build-tools/") never count.
        const isExcludedDirectory = (fullPath) => {
            const relative = path.relative(dirPath, fullPath).split(path.sep).join('/');
            return excludePatterns.some(pattern => relative === pattern || relative.endsWith(`/${pattern}`));
        };
        // Real paths already walked; protects against symlink cycles because
        // stat() follows links.
        const visited = new Set();
        const walk = async (currentPath) => {
            const realPath = await fs.promises.realpath(currentPath);
            if (visited.has(realPath)) {
                return;
            }
            visited.add(realPath);
            const items = await fs.promises.readdir(currentPath);
            for (const item of items) {
                const fullPath = path.join(currentPath, item);
                let stat;
                try {
                    stat = await fs.promises.stat(fullPath);
                }
                catch (error) {
                    // e.g. a dangling symlink: skip it instead of aborting the scan.
                    logger.warn(`Skipping unreadable entry ${fullPath}: ${describeError(error)}`);
                    continue;
                }
                if (stat.isDirectory()) {
                    if (!isExcludedDirectory(fullPath)) {
                        await walk(fullPath);
                    }
                    continue;
                }
                if (!stat.isFile()) {
                    continue;
                }
                const lowerName = item.toLowerCase();
                if (!includeExtensions.includes(path.extname(lowerName))) {
                    continue;
                }
                // Compound suffixes such as '.min.js' cannot be detected through
                // path.extname, so compare against the end of the file name.
                if (excludeExtensions.some(ext => lowerName.endsWith(ext))) {
                    continue;
                }
                if (typeof maxFileSize === 'number' && stat.size > maxFileSize) {
                    logger.debug(`Skipping ${fullPath}: ${stat.size} bytes exceeds maxFileSize (${maxFileSize})`);
                    continue;
                }
                files.push(fullPath);
            }
        };
        await walk(dirPath);
        return files;
    }
}
exports.VectorizationPipeline = VectorizationPipeline;
;