// Package: codecrucible-synth
// Version: (not specified in this listing)
// Description: Production-Ready AI Development Platform with Multi-Voice Synthesis, Smithery MCP Integration, Enterprise Security, and Zero-Timeout Reliability
// File size: 712 lines (710 loc) • 24.6 kB
// Language: JavaScript
/**
* Vector-Based RAG System for CodeCrucible Synth
* Production-ready implementation with local-first architecture, LanceDB storage,
* and real-time incremental indexing optimized for code repositories
*/
import { EventEmitter } from 'events';
import { promises as fs } from 'fs';
import * as path from 'path';
import chokidar from 'chokidar';
import { Logger } from '../logger.js';
// Main RAG System
/**
 * Coordinates document indexing, embedding generation, file watching and
 * retrieval on top of a pluggable vector store and embedding model.
 *
 * Events emitted: `initialized`, `query:completed`, `query:failed`,
 * `document:indexed`, `document:updated`, `document:failed`.
 */
export class VectorRAGSystem extends EventEmitter {
  logger;
  config;
  vectorStore;
  embeddingModel;
  codeChunker;
  modelClient;
  fileWatcher;
  /** Document embeddings keyed by `${document.id}:${content hash}`. */
  embeddingCache = new Map();
  /** Paths waiting to be (re)indexed; a Set so repeated events collapse. */
  indexingQueue = new Set();
  /** True while a batch from `indexingQueue` is being processed. */
  isIndexing = false;
  performanceMetrics;

  /**
   * @param {object} config - Reads `vectorStore`, `embedding`, `chunking`,
   *   `indexing` and `retrieval` sections.
   * @param {object} modelClient - Client used for reranking (`synthesize`) and
   *   handed to the Ollama embedding provider.
   */
  constructor(config, modelClient) {
    super();
    this.logger = new Logger('VectorRAGSystem');
    this.config = config;
    this.modelClient = modelClient;
    this.performanceMetrics = new RAGMetrics();
    // Initialize components based on config
    this.initializeComponents();
  }

  /**
   * Initialize the RAG system: open the vector store, start watching files
   * (when `config.indexing.enabled`) and index every watch path once.
   *
   * @returns {Promise<void>}
   * @throws Re-throws any error raised by the store or the watcher setup.
   */
  async initialize() {
    this.logger.info('Initializing Vector RAG System...');
    try {
      // Initialize vector store
      await this.vectorStore.initialize();
      // Start file watching if enabled
      if (this.config.indexing.enabled) {
        await this.startFileWatching();
      }
      // Perform initial indexing
      await this.performInitialIndexing();
      this.logger.info('Vector RAG System initialized successfully');
      this.emit('initialized');
    } catch (error) {
      this.logger.error('Failed to initialize RAG system:', error);
      throw error;
    }
  }

  /**
   * Query the RAG system.
   *
   * @param {object} ragQuery - Reads `query`, `queryType` (`semantic`,
   *   `hybrid`, `exact`; anything else falls back to semantic), `filters`,
   *   `maxResults` and `rerank`.
   * @returns {Promise<object>} `{ documents, totalFound, queryTime,
   *   retrievalMethod, reranked }`. `reranked` is true whenever reranking was
   *   requested and enabled, even if the reranker fell back to the original
   *   order.
   * @throws Re-throws retrieval errors after emitting `query:failed`.
   */
  async query(ragQuery) {
    const startTime = Date.now();
    this.logger.info(`Processing RAG query: ${ragQuery.query.substring(0, 100)}...`);
    try {
      let results;
      let retrievalMethod;
      // Choose retrieval strategy. The query embedding is only computed for
      // strategies that use it; exact matching never reads it.
      switch (ragQuery.queryType) {
        case 'hybrid': {
          const queryEmbedding = await this.embeddingModel.embed(ragQuery.query);
          results = await this.hybridSearch(ragQuery.query, queryEmbedding, ragQuery);
          retrievalMethod = 'hybrid_vector_keyword';
          break;
        }
        case 'exact':
          results = await this.exactSearch(ragQuery.query, ragQuery);
          retrievalMethod = 'exact_match';
          break;
        default: {
          const queryEmbedding = await this.embeddingModel.embed(ragQuery.query);
          results = await this.semanticSearch(queryEmbedding, ragQuery);
          retrievalMethod =
            ragQuery.queryType === 'semantic' ? 'semantic_vector' : 'default_semantic';
        }
      }
      // Re-rank results if enabled
      let reranked = false;
      if (ragQuery.rerank && this.config.retrieval.rerankingEnabled) {
        results = await this.rerankResults(ragQuery.query, results);
        reranked = true;
      }
      // Apply result limit
      const maxResults = ragQuery.maxResults || this.config.retrieval.defaultMaxResults;
      results = results.slice(0, maxResults);
      const queryTime = Date.now() - startTime;
      this.performanceMetrics.recordQuery(queryTime, results.length, retrievalMethod);
      const ragResult = {
        documents: results,
        totalFound: results.length,
        queryTime,
        retrievalMethod,
        reranked,
      };
      this.emit('query:completed', { query: ragQuery, result: ragResult });
      return ragResult;
    } catch (error) {
      this.logger.error('RAG query failed:', error);
      this.emit('query:failed', { query: ragQuery, error });
      throw error;
    }
  }

  /**
   * Index a single document. Failures are logged and reported through the
   * `document:failed` event instead of being thrown.
   *
   * @param {string} filePath - File to read, chunk, embed and store.
   * @returns {Promise<void>}
   */
  async indexDocument(filePath) {
    try {
      const document = await this.createVectorDocument(filePath);
      if (!document) {
        return;
      }
      // Generate embeddings for document and chunks
      await this.generateEmbeddings(document);
      // Store in vector database
      await this.vectorStore.addDocuments([document]);
      this.logger.debug(`Indexed document: ${filePath}`);
      this.emit('document:indexed', { filePath, document });
    } catch (error) {
      this.logger.error(`Failed to index document ${filePath}:`, error);
      this.emit('document:failed', { filePath, error });
    }
  }

  /**
   * Update an existing document. If the file can no longer be read the stored
   * copy (when present) is removed; if the chunker reports no change the
   * update is skipped. Failures are logged and emitted as `document:failed`,
   * matching `indexDocument`.
   *
   * @param {string} filePath - File whose stored representation is refreshed.
   * @returns {Promise<void>}
   */
  async updateDocument(filePath) {
    try {
      const existingDoc = await this.vectorStore.getDocument(filePath);
      const newDocument = await this.createVectorDocument(filePath);
      if (!newDocument) {
        if (existingDoc) {
          await this.vectorStore.deleteDocument(filePath);
          this.logger.debug(`Removed deleted document: ${filePath}`);
        }
        return;
      }
      // Check if document actually changed
      if (
        existingDoc &&
        !this.codeChunker.shouldReindex(existingDoc.metadata, newDocument.metadata)
      ) {
        this.logger.debug(`Document unchanged, skipping: ${filePath}`);
        return;
      }
      await this.generateEmbeddings(newDocument);
      await this.vectorStore.updateDocument(newDocument);
      this.logger.debug(`Updated document: ${filePath}`);
      this.emit('document:updated', { filePath, document: newDocument });
    } catch (error) {
      this.logger.error(`Failed to update document ${filePath}:`, error);
      this.emit('document:failed', { filePath, error });
    }
  }

  /**
   * Get system statistics.
   *
   * @returns {Promise<object>} Store stats, query metrics, indexing state and
   *   the active configuration.
   */
  async getStats() {
    const storeStats = await this.vectorStore.getStats();
    return {
      vectorStore: storeStats,
      performance: this.performanceMetrics.getStats(),
      indexing: {
        queueSize: this.indexingQueue.size,
        isIndexing: this.isIndexing,
        watchedPaths: this.config.indexing.watchPaths.length,
        cacheSize: this.embeddingCache.size,
      },
      config: this.config,
    };
  }

  /**
   * Private Methods
   */

  /**
   * Instantiate the vector store, embedding model and chunker selected by the
   * configuration. Unknown providers fall back to the in-memory store and the
   * local embedding model.
   */
  initializeComponents() {
    // Initialize vector store
    switch (this.config.vectorStore.provider) {
      case 'lancedb':
        this.vectorStore = new LanceDBVectorStore(this.config.vectorStore);
        break;
      case 'hnswsqlite':
        this.vectorStore = new HNSWSQLiteVectorStore(this.config.vectorStore);
        break;
      default:
        this.vectorStore = new MemoryVectorStore(this.config.vectorStore);
    }
    // Initialize embedding model
    switch (this.config.embedding.provider) {
      case 'transformers-js':
        this.embeddingModel = new TransformersJSEmbedding(this.config.embedding);
        break;
      case 'ollama':
        this.embeddingModel = new OllamaEmbedding(this.config.embedding, this.modelClient);
        break;
      default:
        this.embeddingModel = new LocalEmbedding(this.config.embedding);
    }
    // Initialize code chunker
    this.codeChunker = new ASTBasedCodeChunker(this.config.chunking);
  }

  /**
   * (Re)create the chokidar watcher over `config.indexing.watchPaths`.
   *
   * Added/changed paths are recorded in `indexingQueue` immediately and only
   * the queue *processing* is debounced. Debouncing the enqueue itself with a
   * single shared timer would drop file A whenever file B changed inside the
   * debounce window.
   *
   * @returns {Promise<void>}
   */
  async startFileWatching() {
    if (this.fileWatcher) {
      await this.fileWatcher.close();
    }
    this.fileWatcher = chokidar.watch(this.config.indexing.watchPaths, {
      ignored: this.config.indexing.excludePatterns,
      persistent: true,
      ignoreInitial: true,
    });
    const debouncedProcess = this.debounce(
      () => this.processIndexingQueue(),
      this.config.indexing.debounceMs
    );
    const enqueue = (filePath) => {
      this.indexingQueue.add(filePath);
      debouncedProcess();
    };
    this.fileWatcher
      .on('add', enqueue)
      .on('change', enqueue)
      .on('unlink', async (filePath) => {
        // A removed file no longer needs a pending re-index.
        this.indexingQueue.delete(filePath);
        try {
          await this.vectorStore.deleteDocument(filePath);
        } catch (error) {
          // Handled here so a store failure cannot become an unhandled rejection.
          this.logger.error(`Failed to remove document ${filePath}:`, error);
        }
      });
    this.logger.info(`Watching ${this.config.indexing.watchPaths.length} paths for changes`);
  }

  /**
   * Index every configured watch path, one after another.
   *
   * @returns {Promise<void>}
   */
  async performInitialIndexing() {
    this.logger.info('Starting initial indexing...');
    for (const watchPath of this.config.indexing.watchPaths) {
      await this.indexDirectory(watchPath);
    }
    this.logger.info('Initial indexing completed');
  }

  /**
   * Recursively index all indexable files under a directory. Read errors are
   * logged and do not abort the caller.
   *
   * NOTE(review): `config.indexing.excludePatterns` is only applied to the
   * watcher, not to this traversal — confirm whether that is intended.
   *
   * @param {string} dirPath - Directory to walk.
   * @returns {Promise<void>}
   */
  async indexDirectory(dirPath) {
    try {
      const entries = await fs.readdir(dirPath, { withFileTypes: true });
      for (const entry of entries) {
        const fullPath = path.join(dirPath, entry.name);
        if (entry.isDirectory()) {
          await this.indexDirectory(fullPath);
        } else if (this.shouldIndexFile(fullPath)) {
          await this.indexDocument(fullPath);
        }
      }
    } catch (error) {
      this.logger.error(`Failed to index directory ${dirPath}:`, error);
    }
  }

  /**
   * @param {string} filePath
   * @returns {boolean} True when the (case-insensitive) extension is one of
   *   the known code or documentation extensions.
   */
  shouldIndexFile(filePath) {
    const ext = path.extname(filePath).toLowerCase();
    const codeExtensions = ['.ts', '.js', '.py', '.java', '.cpp', '.c', '.h', '.rs', '.go', '.php'];
    const docExtensions = ['.md', '.txt', '.rst', '.adoc'];
    return codeExtensions.includes(ext) || docExtensions.includes(ext);
  }

  /**
   * Read a file and build the document record (content, metadata, chunks).
   *
   * @param {string} filePath
   * @returns {Promise<object|null>} The document, or null when reading or
   *   chunking fails (the error is logged).
   */
  async createVectorDocument(filePath) {
    try {
      const stats = await fs.stat(filePath);
      const content = await fs.readFile(filePath, 'utf-8');
      const language = this.detectLanguage(filePath);
      const metadata = {
        filePath,
        language,
        fileType: path.extname(filePath),
        lastModified: stats.mtime,
        size: stats.size,
        hash: this.calculateHash(content),
        semanticType: this.detectSemanticType(filePath),
        extractedSymbols: this.codeChunker.extractSymbols(content, language),
      };
      const document = {
        id: filePath,
        content,
        metadata,
      };
      // Generate chunks
      document.chunks = await this.codeChunker.chunkDocument(document);
      return document;
    } catch (error) {
      this.logger.error(`Failed to create document for ${filePath}:`, error);
      return null;
    }
  }

  /**
   * Attach embeddings to a document and to each of its chunks. The
   * document-level embedding is served from `embeddingCache` when an entry for
   * the same id and content hash exists, and stored there when
   * `config.embedding.cacheEmbeddings` is set.
   *
   * @param {object} document - Mutated in place (`embedding`, `chunks[].embedding`).
   * @returns {Promise<void>}
   */
  async generateEmbeddings(document) {
    // Check cache first
    const cacheKey = `${document.id}:${document.metadata.hash}`;
    if (this.embeddingCache.has(cacheKey)) {
      document.embedding = this.embeddingCache.get(cacheKey);
    } else {
      document.embedding = await this.embeddingModel.embed(document.content);
      if (this.config.embedding.cacheEmbeddings) {
        this.embeddingCache.set(cacheKey, document.embedding);
      }
    }
    // Generate embeddings for chunks
    if (document.chunks) {
      const chunkTexts = document.chunks.map((chunk) => chunk.content);
      const chunkEmbeddings = await this.embeddingModel.embedBatch(chunkTexts);
      document.chunks.forEach((chunk, index) => {
        chunk.embedding = chunkEmbeddings[index];
      });
    }
  }

  /**
   * Vector similarity search limited to the query's (or the default) result count.
   *
   * @param {number[]} queryEmbedding
   * @param {object} ragQuery - Reads `filters` and `maxResults`.
   * @returns {Promise<object[]>}
   */
  async semanticSearch(queryEmbedding, ragQuery) {
    return await this.vectorStore.search(
      queryEmbedding,
      ragQuery.filters,
      ragQuery.maxResults || this.config.retrieval.defaultMaxResults
    );
  }

  /**
   * Combined keyword + vector search. The result limit is forwarded as a
   * fourth argument so stores can honour it; stores that only declare three
   * parameters simply ignore it.
   *
   * @param {string} query
   * @param {number[]} queryEmbedding
   * @param {object} ragQuery - Reads `filters` and `maxResults`.
   * @returns {Promise<object[]>}
   */
  async hybridSearch(query, queryEmbedding, ragQuery) {
    return await this.vectorStore.hybridSearch(
      query,
      queryEmbedding,
      ragQuery.filters,
      ragQuery.maxResults || this.config.retrieval.defaultMaxResults
    );
  }

  /**
   * Exact text matching. Not implemented yet: always resolves to an empty list.
   *
   * @param {string} query
   * @param {object} ragQuery
   * @returns {Promise<object[]>}
   */
  async exactSearch(query, ragQuery) {
    // Implementation would search for exact text matches
    return [];
  }

  /**
   * Ask the model client to order results by relevance. Any failure is logged
   * and the original order is returned.
   *
   * @param {string} query
   * @param {object[]} results - Items exposing `document.content`.
   * @returns {Promise<object[]>}
   */
  async rerankResults(query, results) {
    // Zero or one result cannot be reordered; skip the model round-trip.
    if (results.length < 2) {
      return results;
    }
    const snippets = results
      .map((r, i) => `${i + 1}. ${r.document.content.substring(0, 200)}...`)
      .join('\n');
    // Assembled line by line so the prompt text does not depend on source indentation.
    const rerankPrompt = [
      '',
      `Query: ${query}`,
      'Rank the following code snippets by relevance to the query (1 = most relevant):',
      snippets,
      'Return only the numbers in order of relevance.',
      '',
    ].join('\n');
    try {
      const response = await this.modelClient.synthesize({
        prompt: rerankPrompt,
        maxTokens: 100,
      });
      const rankings = this.parseRankings(response.content);
      return this.applyRankings(results, rankings);
    } catch (error) {
      this.logger.warn('Reranking failed, returning original results:', error);
      return results;
    }
  }

  /**
   * Extract every run of digits from the model response as a zero-based index.
   *
   * @param {string} response
   * @returns {number[]} Indices (1-based numbers minus one); empty when the
   *   response contains no digits.
   */
  parseRankings(response) {
    const numbers = response.match(/\d+/g);
    return numbers ? numbers.map((n) => Number.parseInt(n, 10) - 1) : [];
  }

  /**
   * Reorder results according to `rankings`. Out-of-range and repeated indices
   * are ignored (each result appears exactly once), and results the ranking
   * did not mention keep their relative order at the end.
   *
   * @param {object[]} results
   * @param {number[]} rankings - Zero-based indices into `results`.
   * @returns {object[]}
   */
  applyRankings(results, rankings) {
    if (rankings.length === 0) {
      return results;
    }
    const placed = new Set();
    const reranked = [];
    for (const rank of rankings) {
      const isValid = Number.isInteger(rank) && rank >= 0 && rank < results.length;
      if (isValid && !placed.has(rank)) {
        placed.add(rank);
        reranked.push(results[rank]);
      }
    }
    // Add any remaining results
    results.forEach((result, index) => {
      if (!placed.has(index)) {
        reranked.push(result);
      }
    });
    return reranked;
  }

  /**
   * Queue a path for (re)indexing and kick the queue processor.
   *
   * @param {string} filePath
   */
  queueForIndexing(filePath) {
    this.indexingQueue.add(filePath);
    // Intentionally not awaited: processIndexingQueue handles its own errors.
    void this.processIndexingQueue();
  }

  /**
   * Process one batch (at most `config.indexing.batchSize` paths) from the
   * queue. Only the paths in the batch are removed from the queue; whatever
   * remains, or arrives meanwhile, is picked up by a follow-up run scheduled
   * 100 ms later. A missing or non-positive batch size processes the whole
   * queue at once.
   *
   * @returns {Promise<void>}
   */
  async processIndexingQueue() {
    if (this.isIndexing || this.indexingQueue.size === 0) {
      return;
    }
    this.isIndexing = true;
    const configured = this.config.indexing.batchSize;
    const batchSize =
      Number.isInteger(configured) && configured > 0 ? configured : this.indexingQueue.size;
    const batch = [...this.indexingQueue].slice(0, batchSize);
    for (const filePath of batch) {
      this.indexingQueue.delete(filePath);
    }
    try {
      await Promise.all(batch.map((filePath) => this.updateDocument(filePath)));
    } catch (error) {
      this.logger.error('Batch indexing failed:', error);
    } finally {
      this.isIndexing = false;
      // Process any items left over or added while this batch was running
      if (this.indexingQueue.size > 0) {
        setTimeout(() => this.processIndexingQueue(), 100);
      }
    }
  }

  /**
   * @param {string} filePath
   * @returns {string} Language name for the file extension, or `'unknown'`.
   */
  detectLanguage(filePath) {
    const ext = path.extname(filePath).toLowerCase();
    const languageMap = {
      '.ts': 'typescript',
      '.js': 'javascript',
      '.py': 'python',
      '.java': 'java',
      '.cpp': 'cpp',
      '.c': 'c',
      '.h': 'c',
      '.rs': 'rust',
      '.go': 'go',
      '.php': 'php',
      '.md': 'markdown',
      '.txt': 'text',
    };
    return languageMap[ext] || 'unknown';
  }

  /**
   * Classify a file by substrings of its lower-cased base name.
   *
   * @param {string} filePath
   * @returns {'test'|'configuration'|'documentation'|'code'}
   */
  detectSemanticType(filePath) {
    const fileName = path.basename(filePath).toLowerCase();
    if (fileName.includes('test') || fileName.includes('spec')) {
      return 'test';
    }
    if (fileName.includes('config') || fileName.includes('setting')) {
      return 'configuration';
    }
    if (fileName.endsWith('.md') || fileName.endsWith('.txt')) {
      return 'documentation';
    }
    return 'code';
  }

  /**
   * Non-cryptographic 32-bit rolling hash of the content, rendered in base 36.
   *
   * @param {string} content
   * @returns {string}
   */
  calculateHash(content) {
    // Simple hash function - in production, use crypto.createHash
    let hash = 0;
    for (let i = 0; i < content.length; i++) {
      const char = content.charCodeAt(i);
      hash = (hash << 5) - hash + char;
      hash = hash & hash; // Convert to 32-bit integer
    }
    return hash.toString(36);
  }

  /**
   * Wrap `func` so it runs `delay` ms after the most recent call; earlier
   * pending calls are cancelled. `func` is invoked with this instance as `this`.
   *
   * @param {Function} func
   * @param {number} delay - Milliseconds.
   * @returns {Function}
   */
  debounce(func, delay) {
    let timeoutId;
    return (...args) => {
      clearTimeout(timeoutId);
      timeoutId = setTimeout(() => func.apply(this, args), delay);
    };
  }

  /**
   * Public API methods
   */

  /**
   * Stop watching files, close the vector store and drop cached embeddings.
   *
   * @returns {Promise<void>}
   */
  async shutdown() {
    this.logger.info('Shutting down RAG system...');
    if (this.fileWatcher) {
      await this.fileWatcher.close();
      // Forget the closed watcher so a later startFileWatching() does not close it twice.
      this.fileWatcher = undefined;
    }
    await this.vectorStore.close();
    this.embeddingCache.clear();
    this.logger.info('RAG system shutdown completed');
  }

  /**
   * Ask the vector store to compact its index.
   *
   * @returns {Promise<void>}
   */
  async compactIndex() {
    await this.vectorStore.compact();
    this.logger.info('Vector index compacted');
  }

  /**
   * Drop all cached document embeddings.
   *
   * @returns {Promise<void>}
   */
  async clearCache() {
    this.embeddingCache.clear();
    this.logger.info('Embedding cache cleared');
  }
}
/**
 * Running totals for query performance: number of queries, cumulative latency,
 * cumulative result count and a per-retrieval-method counter.
 */
class RAGMetrics {
  queries = 0;
  totalQueryTime = 0;
  totalResults = 0;
  methodCounts = new Map();

  /**
   * Record one finished query.
   *
   * @param {number} queryTime - Elapsed time of the query.
   * @param {number} resultCount - Number of results returned.
   * @param {string} method - Retrieval method label.
   */
  recordQuery(queryTime, resultCount, method) {
    this.queries += 1;
    this.totalQueryTime += queryTime;
    this.totalResults += resultCount;
    const previous = this.methodCounts.get(method) || 0;
    this.methodCounts.set(method, previous + 1);
  }

  /**
   * @returns {object} Totals and per-query averages; averages are 0 before the
   *   first query. `cacheHitRate` is a fixed placeholder.
   */
  getStats() {
    const hasQueries = this.queries > 0;
    const perQuery = (total) => (hasQueries ? total / this.queries : 0);
    const methodBreakdown = Object.fromEntries(this.methodCounts);
    return {
      totalQueries: this.queries,
      averageQueryTime: perQuery(this.totalQueryTime),
      averageResultsPerQuery: perQuery(this.totalResults),
      cacheHitRate: 0, // Would be calculated based on cache metrics
      methodBreakdown,
    };
  }
}
// Placeholder implementations - these would be separate files in production
/**
 * Placeholder vector store for the `lancedb` provider. Every mutating method
 * is a no-op; lookups resolve to empty values.
 */
class LanceDBVectorStore {
  config;

  /** @param {object} config - Stored as-is; not read by the placeholder. */
  constructor(config) {
    this.config = config;
  }

  /** No-op placeholder. */
  async initialize() {
    // Not implemented yet
  }

  /** No-op placeholder; `documents` is ignored. */
  async addDocuments(documents) {
    // Not implemented yet
  }

  /** No-op placeholder; `document` is ignored. */
  async updateDocument(document) {
    // Not implemented yet
  }

  /** No-op placeholder; `id` is ignored. */
  async deleteDocument(id) {
    // Not implemented yet
  }

  /** @returns {Promise<Array>} Always an empty list. */
  async search(query, filters, maxResults) {
    const noMatches = [];
    return noMatches;
  }

  /** @returns {Promise<Array>} Always an empty list. */
  async hybridSearch(query, vector, filters) {
    const noMatches = [];
    return noMatches;
  }

  /** @returns {Promise<null>} Always null. */
  async getDocument(id) {
    return null;
  }

  /** @returns {Promise<object>} Always an empty object. */
  async getStats() {
    const emptyStats = {};
    return emptyStats;
  }

  /** No-op placeholder. */
  async compact() {
    // Not implemented yet
  }

  /** No-op placeholder. */
  async close() {
    // Not implemented yet
  }
}
/**
 * Placeholder vector store for the `hnswsqlite` provider. Every mutating
 * method is a no-op; lookups resolve to empty values.
 */
class HNSWSQLiteVectorStore {
  config;

  /** @param {object} config - Stored as-is; not read by the placeholder. */
  constructor(config) {
    this.config = config;
  }

  /** No-op placeholder. */
  async initialize() {
    // Not implemented yet
  }

  /** No-op placeholder; `documents` is ignored. */
  async addDocuments(documents) {
    // Not implemented yet
  }

  /** No-op placeholder; `document` is ignored. */
  async updateDocument(document) {
    // Not implemented yet
  }

  /** No-op placeholder; `id` is ignored. */
  async deleteDocument(id) {
    // Not implemented yet
  }

  /** @returns {Promise<Array>} Always an empty list. */
  async search(query, filters, maxResults) {
    const noMatches = [];
    return noMatches;
  }

  /** @returns {Promise<Array>} Always an empty list. */
  async hybridSearch(query, vector, filters) {
    const noMatches = [];
    return noMatches;
  }

  /** @returns {Promise<null>} Always null. */
  async getDocument(id) {
    return null;
  }

  /** @returns {Promise<object>} Always an empty object. */
  async getStats() {
    const emptyStats = {};
    return emptyStats;
  }

  /** No-op placeholder. */
  async compact() {
    // Not implemented yet
  }

  /** No-op placeholder. */
  async close() {
    // Not implemented yet
  }
}
/**
 * In-memory vector store backed by a Map keyed on document id. Search is a
 * linear scan ranking documents by cosine similarity of their document-level
 * embedding.
 */
class MemoryVectorStore {
  config;
  documents = new Map();

  /** @param {object} config - Stored as-is; not read by this store. */
  constructor(config) {
    this.config = config;
  }

  /** Nothing to set up for the in-memory store. */
  async initialize() {}

  /**
   * Insert documents, replacing any existing entry with the same id.
   *
   * @param {object[]} documents - Each must expose `id`.
   */
  async addDocuments(documents) {
    for (const doc of documents) {
      this.documents.set(doc.id, doc);
    }
  }

  /** Insert or replace a single document by its `id`. */
  async updateDocument(document) {
    this.documents.set(document.id, document);
  }

  /** Remove the document with the given id, if present. */
  async deleteDocument(id) {
    this.documents.delete(id);
  }

  /**
   * Rank stored documents that have an `embedding` against `query`.
   *
   * @param {number[]} query - Query embedding.
   * @param {*} filters - Currently ignored by this store.
   * @param {number} [maxResults] - Result cap; falsy values mean 10.
   * @returns {Promise<Array<{document: object, score: number}>>} Best first.
   */
  async search(query, filters, maxResults) {
    const scored = [];
    for (const doc of this.documents.values()) {
      if (doc.embedding) {
        scored.push({
          document: doc,
          score: this.cosineSimilarity(query, doc.embedding),
        });
      }
    }
    scored.sort((a, b) => b.score - a.score);
    return scored.slice(0, maxResults || 10);
  }

  /**
   * Hybrid search; this store only uses the vector part.
   *
   * @param {string} query - Keyword query (unused here).
   * @param {number[]} vector - Query embedding.
   * @param {*} filters - Currently ignored by this store.
   * @param {number} [maxResults] - Optional result cap forwarded to `search`;
   *   omitted, it behaves as before (cap of 10).
   * @returns {Promise<Array<{document: object, score: number}>>}
   */
  async hybridSearch(query, vector, filters, maxResults) {
    return this.search(vector, filters, maxResults);
  }

  /** @returns {Promise<object|null>} The stored document or null. */
  async getDocument(id) {
    return this.documents.get(id) || null;
  }

  /**
   * @returns {Promise<object>} Only `totalDocuments` and `lastUpdated` carry
   *   real values; the remaining figures are fixed at 0.
   */
  async getStats() {
    return {
      totalDocuments: this.documents.size,
      totalChunks: 0,
      indexSize: 0,
      memoryUsage: 0,
      lastUpdated: new Date(),
      averageDocumentSize: 0,
    };
  }

  /** Nothing to compact for the in-memory store. */
  async compact() {}

  /** Nothing to release for the in-memory store. */
  async close() {}

  /**
   * Cosine similarity over the first `a.length` components.
   *
   * @param {number[]} a
   * @param {number[]} b
   * @returns {number} The similarity, or 0 when it is undefined (zero-norm or
   *   empty vector, or `b` shorter than `a`). Returning 0 instead of NaN keeps
   *   the sort comparator in `search` well-defined.
   */
  cosineSimilarity(a, b) {
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    const similarity = dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
    return Number.isFinite(similarity) ? similarity : 0;
  }
}
/**
 * Placeholder embedding provider labelled `transformers-js`. It does not run a
 * model: each embedding is a vector of `dimensions` random numbers.
 */
class TransformersJSEmbedding {
  config;
  name = 'transformers-js';
  dimensions = 384;
  maxTokens = 512;

  /** @param {object} config - Stored as-is; not read by the placeholder. */
  constructor(config) {
    this.config = config;
  }

  /**
   * @param {string} text - Ignored by the placeholder.
   * @returns {Promise<number[]>} `dimensions` values from `Math.random()`.
   */
  async embed(text) {
    // Placeholder - would use @xenova/transformers
    return Array.from({ length: this.dimensions }, () => Math.random());
  }

  /**
   * @param {string[]} texts
   * @returns {Promise<number[][]>} One embedding per input, in input order.
   */
  async embedBatch(texts) {
    const pending = texts.map((text) => this.embed(text));
    return Promise.all(pending);
  }
}
/**
 * Placeholder embedding provider labelled `ollama`. It keeps a reference to
 * the model client but does not call it: each embedding is a vector of
 * `dimensions` random numbers.
 */
class OllamaEmbedding {
  config;
  modelClient;
  name = 'ollama';
  dimensions = 4096;
  maxTokens = 2048;

  /**
   * @param {object} config - Stored as-is; not read by the placeholder.
   * @param {object} modelClient - Stored as-is; not used by the placeholder.
   */
  constructor(config, modelClient) {
    this.config = config;
    this.modelClient = modelClient;
  }

  /**
   * @param {string} text - Ignored by the placeholder.
   * @returns {Promise<number[]>} `dimensions` values from `Math.random()`.
   */
  async embed(text) {
    // Placeholder - would use Ollama embedding API
    return Array.from({ length: this.dimensions }, () => Math.random());
  }

  /**
   * @param {string[]} texts
   * @returns {Promise<number[][]>} One embedding per input, in input order.
   */
  async embedBatch(texts) {
    const pending = texts.map((text) => this.embed(text));
    return Promise.all(pending);
  }
}
/**
 * Placeholder embedding provider labelled `local`. It does not run a model:
 * each embedding is a vector of `dimensions` random numbers.
 */
class LocalEmbedding {
  config;
  name = 'local';
  dimensions = 768;
  maxTokens = 512;

  /** @param {object} config - Stored as-is; not read by the placeholder. */
  constructor(config) {
    this.config = config;
  }

  /**
   * @param {string} text - Ignored by the placeholder.
   * @returns {Promise<number[]>} `dimensions` values from `Math.random()`.
   */
  async embed(text) {
    // Placeholder for local embedding model
    return Array.from({ length: this.dimensions }, () => Math.random());
  }

  /**
   * @param {string[]} texts
   * @returns {Promise<number[][]>} One embedding per input, in input order.
   */
  async embedBatch(texts) {
    const pending = texts.map((text) => this.embed(text));
    return Promise.all(pending);
  }
}
/**
 * Splits documents into overlapping line-based chunks and extracts symbols
 * with simple regular expressions. Despite the name, no AST is built here.
 */
class ASTBasedCodeChunker {
  config;

  /**
   * @param {object} [config] - Reads `maxChunkSize` and `overlapSize`, both
   *   measured in lines.
   */
  constructor(config) {
    this.config = config;
  }

  /**
   * Split a document into chunks of at most `maxChunkSize` lines (default 500)
   * where consecutive chunks share `overlapSize` lines (default 50).
   *
   * The overlap is clamped to `[0, chunkSize - 1]` so the window always
   * advances by at least one line; an overlap equal to or larger than the
   * chunk size would otherwise never terminate. An explicit `overlapSize: 0`
   * is honoured. Chunking stops once a chunk reaches the last line, so no
   * trailing chunk made up solely of already-covered overlap lines is emitted.
   *
   * @param {object} document - Reads `id` and `content`.
   * @returns {Promise<object[]>} Chunks whose `startOffset` / `endOffset` are
   *   zero-based line indices (end exclusive).
   */
  async chunkDocument(document) {
    const chunks = [];
    const lines = document.content.split('\n');
    // Simple line-based chunking for now
    const configuredSize = this.config?.maxChunkSize;
    const chunkSize = configuredSize > 0 ? configuredSize : 500;
    const configuredOverlap = this.config?.overlapSize ?? 50;
    const overlap = Math.min(Math.max(configuredOverlap, 0), chunkSize - 1);
    const step = Math.max(1, chunkSize - overlap);
    for (let start = 0; start < lines.length; start += step) {
      const chunkLines = lines.slice(start, start + chunkSize);
      chunks.push({
        id: `${document.id}:chunk:${start}`,
        content: chunkLines.join('\n'),
        embedding: [], // Will be filled later
        startOffset: start,
        endOffset: start + chunkLines.length,
        chunkType: 'block',
        parentDocument: document.id,
        semanticWeight: 1.0,
      });
      if (start + chunkSize >= lines.length) {
        // This chunk already reaches the end of the document.
        break;
      }
    }
    return chunks;
  }

  /**
   * Scan each line for function, class, interface and variable declarations.
   *
   * @param {string} content - Source text.
   * @param {string} language - Currently unused; the same patterns are applied
   *   to every language.
   * @returns {object[]} One entry per pattern hit with `name`, `type`, 1-based
   *   `startLine` / `endLine` (always equal) and the trimmed line as `signature`.
   */
  extractSymbols(content, language) {
    const symbols = [];
    // Simple regex-based symbol extraction
    const patterns = {
      function: /function\s+(\w+)\s*\(/,
      class: /class\s+(\w+)/,
      interface: /interface\s+(\w+)/,
      variable: /(?:const|let|var)\s+(\w+)/,
    };
    const patternEntries = Object.entries(patterns);
    content.split('\n').forEach((line, index) => {
      for (const [type, pattern] of patternEntries) {
        const match = line.match(pattern);
        if (match) {
          symbols.push({
            name: match[1],
            type,
            startLine: index + 1,
            endLine: index + 1,
            signature: line.trim(),
          });
        }
      }
    });
    return symbols;
  }

  /**
   * Decide whether a document must be re-indexed.
   *
   * @param {object} oldMetadata - Reads `hash` and `lastModified`.
   * @param {object} newMetadata - Reads `hash` and `lastModified`.
   * @returns {boolean} True when the hash or the modification time differs.
   *   `lastModified` may be a Date or any value `new Date()` accepts; a missing
   *   or unparsable timestamp is treated as "changed".
   */
  shouldReindex(oldMetadata, newMetadata) {
    const toMillis = (value) =>
      value instanceof Date ? value.getTime() : new Date(value).getTime();
    return (
      oldMetadata.hash !== newMetadata.hash ||
      toMillis(oldMetadata.lastModified) !== toMillis(newMetadata.lastModified)
    );
  }
}
//# sourceMappingURL=vector-rag-system.js.map