codecrucible-synth
Version:
Production-Ready AI Development Platform with Multi-Voice Synthesis, Smithery MCP Integration, Enterprise Security, and Zero-Timeout Reliability
1,025 lines (879 loc) • 29.1 kB
text/typescript
/**
* Vector-Based RAG System for CodeCrucible Synth
* Production-ready implementation with local-first architecture, LanceDB storage,
* and real-time incremental indexing optimized for code repositories
*/
import { EventEmitter } from 'events';
import { promises as fs } from 'fs';
import * as path from 'path';
import { Worker } from 'worker_threads';
import chokidar from 'chokidar';
import { Logger } from '../logger.js';
import { UnifiedModelClient } from '../../refactor/unified-model-client.js';
// Core RAG Interfaces
export interface VectorDocument {
id: string;
content: string;
embedding?: number[];
metadata: DocumentMetadata;
chunks?: DocumentChunk[];
}
export interface DocumentMetadata {
filePath: string;
language: string;
fileType: string;
lastModified: Date;
size: number;
hash: string;
repository?: string;
branch?: string;
author?: string;
semanticType: 'code' | 'documentation' | 'configuration' | 'test';
extractedSymbols?: ExtractedSymbol[];
}
export interface ExtractedSymbol {
name: string;
type: 'function' | 'class' | 'interface' | 'variable' | 'constant';
startLine: number;
endLine: number;
signature?: string;
docstring?: string;
}
export interface DocumentChunk {
id: string;
content: string;
embedding: number[];
startOffset: number;
endOffset: number;
chunkType: 'function' | 'class' | 'block' | 'comment' | 'documentation';
parentDocument: string;
semanticWeight: number;
}
export interface RAGQuery {
query: string;
queryType: 'semantic' | 'exact' | 'hybrid';
filters?: QueryFilter[];
maxResults?: number;
threshold?: number;
contextWindow?: number;
includeMetadata?: boolean;
rerank?: boolean;
}
export interface QueryFilter {
field: string;
operator: 'equals' | 'contains' | 'startsWith' | 'in' | 'gt' | 'lt';
value: string | string[] | number;
}
export interface RAGResult {
documents: ScoredDocument[];
totalFound: number;
queryTime: number;
retrievalMethod: string;
reranked: boolean;
debugInfo?: {
vectorSearchTime: number;
rerankTime: number;
candidatesConsidered: number;
};
}
export interface ScoredDocument {
document: VectorDocument;
score: number;
relevanceExplanation?: string;
highlightedContent?: string;
matchedChunks?: DocumentChunk[];
}
export interface EmbeddingModel {
name: string;
dimensions: number;
maxTokens: number;
embed(text: string): Promise<number[]>;
embedBatch(texts: string[]): Promise<number[][]>;
}
export interface VectorStore {
initialize(): Promise<void>;
addDocuments(documents: VectorDocument[]): Promise<void>;
updateDocument(document: VectorDocument): Promise<void>;
deleteDocument(id: string): Promise<void>;
search(query: number[], filters?: QueryFilter[], maxResults?: number): Promise<ScoredDocument[]>;
hybridSearch(query: string, vector: number[], filters?: QueryFilter[]): Promise<ScoredDocument[]>;
getDocument(id: string): Promise<VectorDocument | null>;
getStats(): Promise<VectorStoreStats>;
compact(): Promise<void>;
close(): Promise<void>;
}
export interface VectorStoreStats {
totalDocuments: number;
totalChunks: number;
indexSize: number;
memoryUsage: number;
lastUpdated: Date;
averageDocumentSize: number;
}
export interface CodeChunker {
chunkDocument(document: VectorDocument): Promise<DocumentChunk[]>;
extractSymbols(content: string, language: string): ExtractedSymbol[];
shouldReindex(oldMetadata: DocumentMetadata, newMetadata: DocumentMetadata): boolean;
}
export interface RAGConfig {
vectorStore: {
provider: 'lancedb' | 'hnswsqlite' | 'memory';
storagePath: string;
dimensions: number;
indexType: 'hnsw' | 'ivf' | 'flat';
maxMemoryUsage: number;
};
embedding: {
model: string;
provider: 'transformers-js' | 'ollama' | 'local';
batchSize: number;
cacheEmbeddings: boolean;
};
chunking: {
strategy: 'semantic' | 'fixed' | 'adaptive' | 'ast-based';
maxChunkSize: number;
overlapSize: number;
respectCodeBoundaries: boolean;
};
indexing: {
enabled: boolean;
watchPaths: string[];
debounceMs: number;
batchSize: number;
excludePatterns: string[];
};
retrieval: {
defaultMaxResults: number;
hybridAlpha: number; // Weight between vector and keyword search
rerankingEnabled: boolean;
contextExpansion: boolean;
};
}
// Main RAG System
export class VectorRAGSystem extends EventEmitter {
private logger: Logger;
private config: RAGConfig;
private vectorStore!: VectorStore;
private embeddingModel!: EmbeddingModel;
private codeChunker!: CodeChunker;
private modelClient: UnifiedModelClient;
private fileWatcher?: chokidar.FSWatcher;
private embeddingCache: Map<string, number[]> = new Map();
private indexingQueue: Set<string> = new Set();
private isIndexing: boolean = false;
private performanceMetrics: RAGMetrics;
constructor(config: RAGConfig, modelClient: UnifiedModelClient) {
super();
this.logger = new Logger('VectorRAGSystem');
this.config = config;
this.modelClient = modelClient;
this.performanceMetrics = new RAGMetrics();
// Initialize components based on config
this.initializeComponents();
}
/**
* Initialize the RAG system
*/
async initialize(): Promise<void> {
this.logger.info('Initializing Vector RAG System...');
try {
// Initialize vector store
await this.vectorStore.initialize();
// Start file watching if enabled
if (this.config.indexing.enabled) {
await this.startFileWatching();
}
// Perform initial indexing
await this.performInitialIndexing();
this.logger.info('Vector RAG System initialized successfully');
this.emit('initialized');
} catch (error) {
this.logger.error('Failed to initialize RAG system:', error);
throw error;
}
}
/**
* Query the RAG system
*/
async query(ragQuery: RAGQuery): Promise<RAGResult> {
const startTime = Date.now();
this.logger.info(`Processing RAG query: ${ragQuery.query.substring(0, 100)}...`);
try {
// Generate query embedding
const queryEmbedding = await this.embeddingModel.embed(ragQuery.query);
let results: ScoredDocument[];
let retrievalMethod: string;
// Choose retrieval strategy
switch (ragQuery.queryType) {
case 'semantic':
results = await this.semanticSearch(queryEmbedding, ragQuery);
retrievalMethod = 'semantic_vector';
break;
case 'hybrid':
results = await this.hybridSearch(ragQuery.query, queryEmbedding, ragQuery);
retrievalMethod = 'hybrid_vector_keyword';
break;
case 'exact':
results = await this.exactSearch(ragQuery.query, ragQuery);
retrievalMethod = 'exact_match';
break;
default:
results = await this.semanticSearch(queryEmbedding, ragQuery);
retrievalMethod = 'default_semantic';
}
// Re-rank results if enabled
let reranked = false;
if (ragQuery.rerank && this.config.retrieval.rerankingEnabled) {
results = await this.rerankResults(ragQuery.query, results);
reranked = true;
}
// Apply result limit
const maxResults = ragQuery.maxResults || this.config.retrieval.defaultMaxResults;
results = results.slice(0, maxResults);
const queryTime = Date.now() - startTime;
this.performanceMetrics.recordQuery(queryTime, results.length, retrievalMethod);
const ragResult: RAGResult = {
documents: results,
totalFound: results.length,
queryTime,
retrievalMethod,
reranked,
};
this.emit('query:completed', { query: ragQuery, result: ragResult });
return ragResult;
} catch (error) {
this.logger.error('RAG query failed:', error);
this.emit('query:failed', { query: ragQuery, error });
throw error;
}
}
/**
* Index a single document
*/
async indexDocument(filePath: string): Promise<void> {
try {
const document = await this.createVectorDocument(filePath);
if (!document) return;
// Generate embeddings for document and chunks
await this.generateEmbeddings(document);
// Store in vector database
await this.vectorStore.addDocuments([document]);
this.logger.debug(`Indexed document: ${filePath}`);
this.emit('document:indexed', { filePath, document });
} catch (error) {
this.logger.error(`Failed to index document ${filePath}:`, error);
this.emit('document:failed', { filePath, error });
}
}
/**
* Update an existing document
*/
async updateDocument(filePath: string): Promise<void> {
try {
const existingDoc = await this.vectorStore.getDocument(filePath);
const newDocument = await this.createVectorDocument(filePath);
if (!newDocument) {
if (existingDoc) {
await this.vectorStore.deleteDocument(filePath);
this.logger.debug(`Removed deleted document: ${filePath}`);
}
return;
}
// Check if document actually changed
if (
existingDoc &&
!this.codeChunker.shouldReindex(existingDoc.metadata, newDocument.metadata)
) {
this.logger.debug(`Document unchanged, skipping: ${filePath}`);
return;
}
await this.generateEmbeddings(newDocument);
await this.vectorStore.updateDocument(newDocument);
this.logger.debug(`Updated document: ${filePath}`);
this.emit('document:updated', { filePath, document: newDocument });
} catch (error) {
this.logger.error(`Failed to update document ${filePath}:`, error);
}
}
/**
* Get system statistics
*/
async getStats(): Promise<RAGSystemStats> {
const storeStats = await this.vectorStore.getStats();
return {
vectorStore: storeStats,
performance: this.performanceMetrics.getStats(),
indexing: {
queueSize: this.indexingQueue.size,
isIndexing: this.isIndexing,
watchedPaths: this.config.indexing.watchPaths.length,
cacheSize: this.embeddingCache.size,
},
config: this.config,
};
}
/**
* Private Methods
*/
private initializeComponents(): void {
// Initialize vector store
switch (this.config.vectorStore.provider) {
case 'lancedb':
this.vectorStore = new LanceDBVectorStore(this.config.vectorStore);
break;
case 'hnswsqlite':
this.vectorStore = new HNSWSQLiteVectorStore(this.config.vectorStore);
break;
default:
this.vectorStore = new MemoryVectorStore(this.config.vectorStore);
}
// Initialize embedding model
switch (this.config.embedding.provider) {
case 'transformers-js':
this.embeddingModel = new TransformersJSEmbedding(this.config.embedding);
break;
case 'ollama':
this.embeddingModel = new OllamaEmbedding(this.config.embedding, this.modelClient);
break;
default:
this.embeddingModel = new LocalEmbedding(this.config.embedding);
}
// Initialize code chunker
this.codeChunker = new ASTBasedCodeChunker(this.config.chunking);
}
private async startFileWatching(): Promise<void> {
if (this.fileWatcher) {
await this.fileWatcher.close();
}
this.fileWatcher = chokidar.watch(this.config.indexing.watchPaths, {
ignored: this.config.indexing.excludePatterns,
persistent: true,
ignoreInitial: true,
});
const debouncedIndex = this.debounce(
(filePath: string) => this.queueForIndexing(filePath),
this.config.indexing.debounceMs
);
this.fileWatcher
.on('add', debouncedIndex)
.on('change', debouncedIndex)
.on('unlink', filePath => this.vectorStore.deleteDocument(filePath));
this.logger.info(`Watching ${this.config.indexing.watchPaths.length} paths for changes`);
}
private async performInitialIndexing(): Promise<void> {
this.logger.info('Starting initial indexing...');
for (const watchPath of this.config.indexing.watchPaths) {
await this.indexDirectory(watchPath);
}
this.logger.info('Initial indexing completed');
}
private async indexDirectory(dirPath: string): Promise<void> {
try {
const entries = await fs.readdir(dirPath, { withFileTypes: true });
for (const entry of entries) {
const fullPath = path.join(dirPath, entry.name);
if (entry.isDirectory()) {
await this.indexDirectory(fullPath);
} else if (this.shouldIndexFile(fullPath)) {
await this.indexDocument(fullPath);
}
}
} catch (error) {
this.logger.error(`Failed to index directory ${dirPath}:`, error);
}
}
private shouldIndexFile(filePath: string): boolean {
const ext = path.extname(filePath).toLowerCase();
const codeExtensions = ['.ts', '.js', '.py', '.java', '.cpp', '.c', '.h', '.rs', '.go', '.php'];
const docExtensions = ['.md', '.txt', '.rst', '.adoc'];
return codeExtensions.includes(ext) || docExtensions.includes(ext);
}
private async createVectorDocument(filePath: string): Promise<VectorDocument | null> {
try {
const stats = await fs.stat(filePath);
const content = await fs.readFile(filePath, 'utf-8');
const hash = this.calculateHash(content);
const metadata: DocumentMetadata = {
filePath,
language: this.detectLanguage(filePath),
fileType: path.extname(filePath),
lastModified: stats.mtime,
size: stats.size,
hash,
semanticType: this.detectSemanticType(filePath),
extractedSymbols: this.codeChunker.extractSymbols(content, this.detectLanguage(filePath)),
};
const document: VectorDocument = {
id: filePath,
content,
metadata,
};
// Generate chunks
document.chunks = await this.codeChunker.chunkDocument(document);
return document;
} catch (error) {
this.logger.error(`Failed to create document for ${filePath}:`, error);
return null;
}
}
private async generateEmbeddings(document: VectorDocument): Promise<void> {
// Check cache first
const cacheKey = `${document.id}:${document.metadata.hash}`;
if (this.embeddingCache.has(cacheKey)) {
document.embedding = this.embeddingCache.get(cacheKey);
} else {
document.embedding = await this.embeddingModel.embed(document.content);
if (this.config.embedding.cacheEmbeddings) {
this.embeddingCache.set(cacheKey, document.embedding);
}
}
// Generate embeddings for chunks
if (document.chunks) {
const chunkTexts = document.chunks.map(chunk => chunk.content);
const chunkEmbeddings = await this.embeddingModel.embedBatch(chunkTexts);
document.chunks.forEach((chunk, index) => {
chunk.embedding = chunkEmbeddings[index];
});
}
}
private async semanticSearch(
queryEmbedding: number[],
ragQuery: RAGQuery
): Promise<ScoredDocument[]> {
return await this.vectorStore.search(
queryEmbedding,
ragQuery.filters,
ragQuery.maxResults || this.config.retrieval.defaultMaxResults
);
}
private async hybridSearch(
query: string,
queryEmbedding: number[],
ragQuery: RAGQuery
): Promise<ScoredDocument[]> {
return await this.vectorStore.hybridSearch(query, queryEmbedding, ragQuery.filters);
}
private async exactSearch(query: string, ragQuery: RAGQuery): Promise<ScoredDocument[]> {
// Implement exact text matching
const results: ScoredDocument[] = [];
// Implementation would search for exact text matches
return results;
}
private async rerankResults(query: string, results: ScoredDocument[]): Promise<ScoredDocument[]> {
// Use LLM to rerank results based on relevance
const rerankPrompt = `
Query: ${query}
Rank the following code snippets by relevance to the query (1 = most relevant):
${results.map((r, i) => `${i + 1}. ${r.document.content.substring(0, 200)}...`).join('\n')}
Return only the numbers in order of relevance.
`;
try {
const response = await this.modelClient.synthesize({
prompt: rerankPrompt,
maxTokens: 100,
});
const rankings = this.parseRankings(response.content);
return this.applyRankings(results, rankings);
} catch (error) {
this.logger.warn('Reranking failed, returning original results:', error);
return results;
}
}
private parseRankings(response: string): number[] {
const numbers = response.match(/\d+/g);
return numbers ? numbers.map(n => parseInt(n) - 1) : [];
}
private applyRankings(results: ScoredDocument[], rankings: number[]): ScoredDocument[] {
if (rankings.length === 0) return results;
const reranked: ScoredDocument[] = [];
for (const rank of rankings) {
if (rank >= 0 && rank < results.length) {
reranked.push(results[rank]);
}
}
// Add any remaining results
for (let i = 0; i < results.length; i++) {
if (!rankings.includes(i)) {
reranked.push(results[i]);
}
}
return reranked;
}
private queueForIndexing(filePath: string): void {
this.indexingQueue.add(filePath);
this.processIndexingQueue();
}
private async processIndexingQueue(): Promise<void> {
if (this.isIndexing || this.indexingQueue.size === 0) return;
this.isIndexing = true;
const batch = Array.from(this.indexingQueue).slice(0, this.config.indexing.batchSize);
this.indexingQueue.clear();
try {
await Promise.all(batch.map(filePath => this.updateDocument(filePath)));
} catch (error) {
this.logger.error('Batch indexing failed:', error);
} finally {
this.isIndexing = false;
// Process any new items that were added
if (this.indexingQueue.size > 0) {
setTimeout(() => this.processIndexingQueue(), 100);
}
}
}
private detectLanguage(filePath: string): string {
const ext = path.extname(filePath).toLowerCase();
const languageMap: Record<string, string> = {
'.ts': 'typescript',
'.js': 'javascript',
'.py': 'python',
'.java': 'java',
'.cpp': 'cpp',
'.c': 'c',
'.h': 'c',
'.rs': 'rust',
'.go': 'go',
'.php': 'php',
'.md': 'markdown',
'.txt': 'text',
};
return languageMap[ext] || 'unknown';
}
private detectSemanticType(
filePath: string
): 'code' | 'documentation' | 'configuration' | 'test' {
const fileName = path.basename(filePath).toLowerCase();
if (fileName.includes('test') || fileName.includes('spec')) return 'test';
if (fileName.includes('config') || fileName.includes('setting')) return 'configuration';
if (fileName.endsWith('.md') || fileName.endsWith('.txt')) return 'documentation';
return 'code';
}
private calculateHash(content: string): string {
// Simple hash function - in production, use crypto.createHash
let hash = 0;
for (let i = 0; i < content.length; i++) {
const char = content.charCodeAt(i);
hash = (hash << 5) - hash + char;
hash = hash & hash; // Convert to 32-bit integer
}
return hash.toString(36);
}
private debounce<T extends (...args: any[]) => void>(
func: T,
delay: number
): (...args: Parameters<T>) => void {
let timeoutId: NodeJS.Timeout;
return (...args: any[]) => {
clearTimeout(timeoutId);
timeoutId = setTimeout(() => func.apply(this, args), delay);
};
}
/**
* Public API methods
*/
async shutdown(): Promise<void> {
this.logger.info('Shutting down RAG system...');
if (this.fileWatcher) {
await this.fileWatcher.close();
}
await this.vectorStore.close();
this.embeddingCache.clear();
this.logger.info('RAG system shutdown completed');
}
async compactIndex(): Promise<void> {
await this.vectorStore.compact();
this.logger.info('Vector index compacted');
}
async clearCache(): Promise<void> {
this.embeddingCache.clear();
this.logger.info('Embedding cache cleared');
}
}
// Supporting Classes and Interfaces
export interface RAGSystemStats {
vectorStore: VectorStoreStats;
performance: PerformanceStats;
indexing: {
queueSize: number;
isIndexing: boolean;
watchedPaths: number;
cacheSize: number;
};
config: RAGConfig;
}
export interface PerformanceStats {
totalQueries: number;
averageQueryTime: number;
averageResultsPerQuery: number;
cacheHitRate: number;
methodBreakdown: Record<string, number>;
}
class RAGMetrics {
private queries: number = 0;
private totalQueryTime: number = 0;
private totalResults: number = 0;
private methodCounts: Map<string, number> = new Map();
recordQuery(queryTime: number, resultCount: number, method: string): void {
this.queries++;
this.totalQueryTime += queryTime;
this.totalResults += resultCount;
this.methodCounts.set(method, (this.methodCounts.get(method) || 0) + 1);
}
getStats(): PerformanceStats {
return {
totalQueries: this.queries,
averageQueryTime: this.queries > 0 ? this.totalQueryTime / this.queries : 0,
averageResultsPerQuery: this.queries > 0 ? this.totalResults / this.queries : 0,
cacheHitRate: 0, // Would be calculated based on cache metrics
methodBreakdown: Object.fromEntries(this.methodCounts),
};
}
}
// Placeholder implementations - these would be separate files in production
class LanceDBVectorStore implements VectorStore {
constructor(private config: any) {}
async initialize(): Promise<void> {
/* Implementation */
}
async addDocuments(documents: VectorDocument[]): Promise<void> {
/* Implementation */
}
async updateDocument(document: VectorDocument): Promise<void> {
/* Implementation */
}
async deleteDocument(id: string): Promise<void> {
/* Implementation */
}
async search(
query: number[],
filters?: QueryFilter[],
maxResults?: number
): Promise<ScoredDocument[]> {
return [];
}
async hybridSearch(
query: string,
vector: number[],
filters?: QueryFilter[]
): Promise<ScoredDocument[]> {
return [];
}
async getDocument(id: string): Promise<VectorDocument | null> {
return null;
}
async getStats(): Promise<VectorStoreStats> {
return {} as VectorStoreStats;
}
async compact(): Promise<void> {
/* Implementation */
}
async close(): Promise<void> {
/* Implementation */
}
}
class HNSWSQLiteVectorStore implements VectorStore {
constructor(private config: any) {}
async initialize(): Promise<void> {
/* Implementation */
}
async addDocuments(documents: VectorDocument[]): Promise<void> {
/* Implementation */
}
async updateDocument(document: VectorDocument): Promise<void> {
/* Implementation */
}
async deleteDocument(id: string): Promise<void> {
/* Implementation */
}
async search(
query: number[],
filters?: QueryFilter[],
maxResults?: number
): Promise<ScoredDocument[]> {
return [];
}
async hybridSearch(
query: string,
vector: number[],
filters?: QueryFilter[]
): Promise<ScoredDocument[]> {
return [];
}
async getDocument(id: string): Promise<VectorDocument | null> {
return null;
}
async getStats(): Promise<VectorStoreStats> {
return {} as VectorStoreStats;
}
async compact(): Promise<void> {
/* Implementation */
}
async close(): Promise<void> {
/* Implementation */
}
}
class MemoryVectorStore implements VectorStore {
private documents: Map<string, VectorDocument> = new Map();
constructor(private config: any) {}
async initialize(): Promise<void> {}
async addDocuments(documents: VectorDocument[]): Promise<void> {
for (const doc of documents) {
this.documents.set(doc.id, doc);
}
}
async updateDocument(document: VectorDocument): Promise<void> {
this.documents.set(document.id, document);
}
async deleteDocument(id: string): Promise<void> {
this.documents.delete(id);
}
async search(
query: number[],
filters?: QueryFilter[],
maxResults?: number
): Promise<ScoredDocument[]> {
const results: ScoredDocument[] = [];
for (const doc of this.documents.values()) {
if (doc.embedding) {
const similarity = this.cosineSimilarity(query, doc.embedding);
results.push({
document: doc,
score: similarity,
});
}
}
return results.sort((a, b) => b.score - a.score).slice(0, maxResults || 10);
}
async hybridSearch(
query: string,
vector: number[],
filters?: QueryFilter[]
): Promise<ScoredDocument[]> {
return this.search(vector, filters);
}
async getDocument(id: string): Promise<VectorDocument | null> {
return this.documents.get(id) || null;
}
async getStats(): Promise<VectorStoreStats> {
return {
totalDocuments: this.documents.size,
totalChunks: 0,
indexSize: 0,
memoryUsage: 0,
lastUpdated: new Date(),
averageDocumentSize: 0,
};
}
async compact(): Promise<void> {}
async close(): Promise<void> {}
private cosineSimilarity(a: number[], b: number[]): number {
let dotProduct = 0;
let normA = 0;
let normB = 0;
for (let i = 0; i < a.length; i++) {
dotProduct += a[i] * b[i];
normA += a[i] * a[i];
normB += b[i] * b[i];
}
return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}
}
class TransformersJSEmbedding implements EmbeddingModel {
name = 'transformers-js';
dimensions = 384;
maxTokens = 512;
constructor(private config: any) {}
async embed(text: string): Promise<number[]> {
// Placeholder - would use @xenova/transformers
return new Array(this.dimensions).fill(0).map(() => Math.random());
}
async embedBatch(texts: string[]): Promise<number[][]> {
return Promise.all(texts.map(text => this.embed(text)));
}
}
class OllamaEmbedding implements EmbeddingModel {
name = 'ollama';
dimensions = 4096;
maxTokens = 2048;
constructor(
private config: any,
private modelClient: UnifiedModelClient
) {}
async embed(text: string): Promise<number[]> {
// Placeholder - would use Ollama embedding API
return new Array(this.dimensions).fill(0).map(() => Math.random());
}
async embedBatch(texts: string[]): Promise<number[][]> {
return Promise.all(texts.map(text => this.embed(text)));
}
}
class LocalEmbedding implements EmbeddingModel {
name = 'local';
dimensions = 768;
maxTokens = 512;
constructor(private config: any) {}
async embed(text: string): Promise<number[]> {
// Placeholder for local embedding model
return new Array(this.dimensions).fill(0).map(() => Math.random());
}
async embedBatch(texts: string[]): Promise<number[][]> {
return Promise.all(texts.map(text => this.embed(text)));
}
}
class ASTBasedCodeChunker implements CodeChunker {
constructor(private config: any) {}
async chunkDocument(document: VectorDocument): Promise<DocumentChunk[]> {
const chunks: DocumentChunk[] = [];
const content = document.content;
const lines = content.split('\n');
// Simple line-based chunking for now
const chunkSize = this.config.maxChunkSize || 500;
const overlap = this.config.overlapSize || 50;
for (let i = 0; i < lines.length; i += chunkSize - overlap) {
const chunkLines = lines.slice(i, i + chunkSize);
const chunkContent = chunkLines.join('\n');
chunks.push({
id: `${document.id}:chunk:${i}`,
content: chunkContent,
embedding: [], // Will be filled later
startOffset: i,
endOffset: i + chunkLines.length,
chunkType: 'block',
parentDocument: document.id,
semanticWeight: 1.0,
});
}
return chunks;
}
extractSymbols(content: string, language: string): ExtractedSymbol[] {
const symbols: ExtractedSymbol[] = [];
const lines = content.split('\n');
// Simple regex-based symbol extraction
const patterns = {
function: /function\s+(\w+)\s*\(/,
class: /class\s+(\w+)/,
interface: /interface\s+(\w+)/,
variable: /(?:const|let|var)\s+(\w+)/,
};
lines.forEach((line, index) => {
for (const [type, pattern] of Object.entries(patterns)) {
const match = line.match(pattern);
if (match) {
symbols.push({
name: match[1],
type: type as any,
startLine: index + 1,
endLine: index + 1,
signature: line.trim(),
});
}
}
});
return symbols;
}
shouldReindex(oldMetadata: DocumentMetadata, newMetadata: DocumentMetadata): boolean {
return (
oldMetadata.hash !== newMetadata.hash ||
oldMetadata.lastModified.getTime() !== newMetadata.lastModified.getTime()
);
}
}