UNPKG

@restnfeel/agentc-starter-kit

Version:

한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템

689 lines (599 loc) 19.7 kB
import {
  RAGConfig,
  Document,
  SearchResult,
  VectorStore,
  EmbeddingModel,
} from "./types";
import {
  DocumentProcessingPipeline,
  ProcessingPipelineConfig,
} from "./processors/pipeline";
import { VectorStoreFactory, VectorStoreType } from "./vectorstore";
import { EmbeddingFactory, EmbeddingConfig } from "./embeddings";
import {
  SupabaseStorageManager,
  SupabaseStorageConfig,
  DocumentUploadResult,
} from "./storage";
import { HybridRetriever, HybridSearchConfig } from "./retrieval";
import { RSSLoader, NaverBlogRSSLoader, RSSFeedManager } from "./loaders/rss";

/**
 * Facade that wires together the embedding model, vector store, document
 * processing pipeline, optional Supabase storage, hybrid retriever and RSS
 * feed manager, and exposes ingestion / search / maintenance operations.
 */
export class RAGEngine {
  private vectorStore!: VectorStore;
  private embeddingModel!: EmbeddingModel;
  private processingPipeline!: DocumentProcessingPipeline;
  private storageManager?: SupabaseStorageManager;
  private hybridRetriever?: HybridRetriever;
  private rssManager: RSSFeedManager;
  private config: RAGConfig;

  /**
   * Maps the path a document was ingested under (via `addDocument`) to the
   * ID the pipeline assigned, so storage deletions can target the right
   * vector-store entry instead of guessing an ID from the path.
   */
  private readonly documentIdsByPath = new Map<string, string>();

  /**
   * @param config Engine configuration; components are built eagerly from it.
   */
  constructor(config: RAGConfig) {
    this.config = config;
    this.rssManager = new RSSFeedManager();
    this.initializeComponents();
  }

  /** Builds the embedding model, vector store, pipeline, storage and retriever. */
  private initializeComponents(): void {
    // Embedding model: OpenAI is the only provider wired up for now.
    const embeddingConfig: EmbeddingConfig = {
      type: "openai",
      apiKey: this.config.llmConfig.apiKey,
      modelName: this.config.embeddingModel,
    };
    this.embeddingModel = EmbeddingFactory.create(embeddingConfig);

    // Vector store: currently always the "memory" implementation.
    const vectorStoreConfig = {
      storePath: this.config.vectorStorePath,
    };
    this.vectorStore = VectorStoreFactory.create(
      "memory" as VectorStoreType,
      this.embeddingModel,
      vectorStoreConfig
    );

    // Processing pipeline.
    const pipelineConfig: ProcessingPipelineConfig = {
      textSplitter: {
        chunkSize: this.config.chunkSize,
        chunkOverlap: this.config.chunkOverlap,
      },
      enableMetadataExtraction: true,
      enableTextCleaning: true,
    };
    this.processingPipeline = new DocumentProcessingPipeline(pipelineConfig);

    // Supabase storage is optional; only create the manager when both the
    // URL and the anon key are present. Optional chaining guards untyped
    // callers that omit `supabaseConfig` entirely.
    const supabase = this.config.supabaseConfig;
    if (supabase?.url && supabase.anonKey) {
      this.storageManager = new SupabaseStorageManager({
        url: supabase.url,
        anonKey: supabase.anonKey,
        bucket: supabase.bucket,
      });
    }

    // Hybrid retriever with the default weighting.
    this.hybridRetriever = new HybridRetriever(
      this.vectorStore,
      this.embeddingModel,
      {
        vectorWeight: 0.7,
        keywordWeight: 0.3,
        maxResults: 10,
        minScore: 0.1,
        enableReranking: true,
      }
    );
  }

  /**
   * Feeds every chunk of the given documents into the hybrid retriever's
   * index. No-op when no hybrid retriever exists or a document has no chunks.
   */
  private indexDocumentChunks(documents: Document[]): void {
    if (!this.hybridRetriever) {
      return;
    }
    for (const document of documents) {
      if (!document.chunks) {
        continue;
      }
      for (const chunk of document.chunks) {
        this.hybridRetriever.updateDocumentIndex(chunk.id, chunk.content);
      }
    }
  }

  /**
   * Runs RSS items through the processing pipeline, stores them in the
   * vector store and indexes their chunks for hybrid search.
   *
   * @param documents Items returned by an RSS loader.
   * @returns The processed documents, in input order.
   */
  private async ingestRSSDocuments(documents: Document[]): Promise<Document[]> {
    const processedDocs: Document[] = [];
    for (const doc of documents) {
      // The pipeline consumes raw bytes, so re-encode the item text.
      const buffer = Buffer.from(doc.content, "utf-8");
      const processedDoc = await this.processingPipeline.processDocument(
        doc.source || `rss_${doc.id}`,
        buffer
      );
      // RSS metadata wins over pipeline-extracted metadata on key clashes.
      processedDoc.metadata = { ...processedDoc.metadata, ...doc.metadata };
      processedDocs.push(processedDoc);
    }

    await this.vectorStore.addDocuments(processedDocs);
    this.indexDocumentChunks(processedDocs);
    return processedDocs;
  }

  /** Returns the vector store's `getCount()` result, or 0 if it has none. */
  private getVectorStoreChunkCount(): number {
    return "getCount" in this.vectorStore &&
      typeof this.vectorStore.getCount === "function"
      ? this.vectorStore.getCount()
      : 0;
  }

  /**
   * Performs asynchronous start-up: calls the vector store's `initialize`
   * when it has one, and initializes the storage bucket when storage is
   * configured.
   */
  async initialize(): Promise<void> {
    if (
      "initialize" in this.vectorStore &&
      typeof this.vectorStore.initialize === "function"
    ) {
      await this.vectorStore.initialize();
    }

    if (this.storageManager) {
      await this.storageManager.initializeBucket();
    }
  }

  /**
   * Processes a single file and adds it to the vector store and hybrid index.
   *
   * @param filePath Path (or logical name) of the document.
   * @param content Raw document bytes.
   * @returns The ID of the processed document.
   * @throws Error wrapping any processing or storage failure.
   */
  async addDocument(filePath: string, content: Buffer): Promise<string> {
    try {
      const document = await this.processingPipeline.processDocument(
        filePath,
        content
      );

      await this.vectorStore.addDocuments([document]);
      this.indexDocumentChunks([document]);

      // Remember which ID this path produced so that a later
      // deleteDocumentFromStorage(path) removes the right entry.
      this.documentIdsByPath.set(filePath, document.id);

      return document.id;
    } catch (error) {
      throw new Error(`Failed to add document: ${error}`);
    }
  }

  /**
   * Processes a batch of files and adds them to the vector store and hybrid
   * index.
   *
   * @param files Path/content pairs to ingest.
   * @returns IDs of the processed documents.
   * @throws Error wrapping any processing or storage failure.
   */
  async addDocuments(
    files: Array<{ path: string; content: Buffer }>
  ): Promise<string[]> {
    try {
      const documents = await this.processingPipeline.processBatch(files);

      await this.vectorStore.addDocuments(documents);
      this.indexDocumentChunks(documents);

      return documents.map((doc) => doc.id);
    } catch (error) {
      throw new Error(`Failed to add documents: ${error}`);
    }
  }

  /**
   * Searches with the hybrid retriever when available, otherwise with the
   * vector store alone.
   *
   * @param query Search text.
   * @param k Maximum number of results (default 5).
   * @throws Error wrapping any retrieval failure.
   */
  async search(query: string, k: number = 5): Promise<SearchResult[]> {
    try {
      if (this.hybridRetriever) {
        return await this.hybridRetriever.search(query, k);
      }
      return await this.vectorStore.search(query, k);
    } catch (error) {
      throw new Error(`Search failed: ${error}`);
    }
  }

  /**
   * Searches the vector store directly, bypassing the hybrid retriever.
   *
   * @throws Error wrapping any retrieval failure.
   */
  async vectorSearch(query: string, k: number = 5): Promise<SearchResult[]> {
    try {
      return await this.vectorStore.search(query, k);
    } catch (error) {
      throw new Error(`Vector search failed: ${error}`);
    }
  }

  /**
   * Searches with the hybrid retriever only.
   *
   * @throws Error if the hybrid retriever is missing or the search fails.
   */
  async hybridSearch(query: string, k: number = 5): Promise<SearchResult[]> {
    if (!this.hybridRetriever) {
      throw new Error("Hybrid retriever is not initialized");
    }

    try {
      return await this.hybridRetriever.search(query, k);
    } catch (error) {
      throw new Error(`Hybrid search failed: ${error}`);
    }
  }

  /**
   * Deletes a document from the vector store and forgets any ingestion path
   * recorded for it.
   *
   * @throws Error wrapping any vector-store failure.
   */
  async deleteDocument(documentId: string): Promise<void> {
    try {
      await this.vectorStore.delete(documentId);

      // Drop stale path -> ID entries pointing at the deleted document.
      for (const [path, id] of this.documentIdsByPath) {
        if (id === documentId) {
          this.documentIdsByPath.delete(path);
        }
      }
    } catch (error) {
      throw new Error(`Failed to delete document: ${error}`);
    }
  }

  /**
   * Re-processes a document and replaces the stored version that has the
   * given ID.
   *
   * @param documentId ID of the existing document to overwrite.
   * @param filePath Path (or logical name) of the new content.
   * @param content New raw document bytes.
   * @throws Error wrapping any processing or storage failure.
   */
  async updateDocument(
    documentId: string,
    filePath: string,
    content: Buffer
  ): Promise<void> {
    try {
      const document = await this.processingPipeline.processDocument(
        filePath,
        content
      );

      // Keep the caller-supplied identity rather than the freshly generated one.
      document.id = documentId;

      await this.vectorStore.update(document);

      // Mirror the add paths: keyword search must see the new chunk text too.
      this.indexDocumentChunks([document]);
    } catch (error) {
      throw new Error(`Failed to update document: ${error}`);
    }
  }

  /** Returns the file extensions reported by the processing pipeline. */
  getSupportedExtensions(): string[] {
    return this.processingPipeline.getSupportedExtensions();
  }

  /**
   * Returns basic engine statistics. `documentCount` is a placeholder (always
   * 0) and `vectorStoreType` is hard-coded to "memory".
   */
  async getStats(): Promise<{
    documentCount: number;
    chunkCount: number;
    embeddingModel: string;
    vectorStoreType: string;
  }> {
    const chunkCount = this.getVectorStoreChunkCount();

    return {
      documentCount: 0, // TODO: Track document count separately
      chunkCount,
      embeddingModel: this.config.embeddingModel,
      vectorStoreType: "memory", // TODO: Make this dynamic
    };
  }

  /**
   * Clears the vector store when it exposes `clear`, and with it the hybrid
   * keyword index and recorded path -> ID mappings, so searches cannot return
   * chunks that no longer exist in the store.
   */
  async clear(): Promise<void> {
    if (
      "clear" in this.vectorStore &&
      typeof this.vectorStore.clear === "function"
    ) {
      await this.vectorStore.clear();
      this.hybridRetriever?.clearIndex();
      this.documentIdsByPath.clear();
    }
  }

  // Supabase Storage specific methods

  /**
   * Uploads a file to Supabase storage without ingesting it.
   *
   * @throws Error if storage is not configured.
   */
  async uploadDocumentToStorage(
    file: Buffer | File,
    fileName: string,
    metadata?: Record<string, any>
  ): Promise<DocumentUploadResult> {
    if (!this.storageManager) {
      throw new Error("Supabase Storage is not configured");
    }

    return this.storageManager.uploadDocument(file, fileName, metadata);
  }

  /**
   * Downloads a stored document and ingests it.
   *
   * @returns The ID of the ingested document.
   * @throws Error if storage is not configured or the download fails.
   */
  async addDocumentFromStorage(filePath: string): Promise<string> {
    if (!this.storageManager) {
      throw new Error("Supabase Storage is not configured");
    }

    try {
      const content = await this.storageManager.downloadDocument(filePath);
      return this.addDocument(filePath, content);
    } catch (error) {
      throw new Error(`Failed to add document from storage: ${error}`);
    }
  }

  /**
   * Uploads a file to storage and then ingests it under the uploaded path.
   *
   * @returns The ingested document ID together with the upload result.
   * @throws Error if storage is not configured or any step fails.
   */
  async uploadAndAddDocument(
    file: Buffer | File,
    fileName: string,
    metadata?: Record<string, any>
  ): Promise<{ documentId: string; uploadResult: DocumentUploadResult }> {
    if (!this.storageManager) {
      throw new Error("Supabase Storage is not configured");
    }

    try {
      const uploadResult = await this.storageManager.uploadDocument(
        file,
        fileName,
        metadata
      );

      // Narrow with Buffer.isBuffer rather than `instanceof File`: the global
      // `File` constructor does not exist on every Node runtime, and touching
      // it there throws a ReferenceError even when a Buffer was passed.
      const buffer = Buffer.isBuffer(file)
        ? file
        : Buffer.from(await file.arrayBuffer());
      const documentId = await this.addDocument(uploadResult.path, buffer);

      return { documentId, uploadResult };
    } catch (error) {
      throw new Error(`Failed to upload and add document: ${error}`);
    }
  }

  /**
   * Deletes a document from storage and then from the vector store.
   *
   * @throws Error if storage is not configured or a deletion fails.
   */
  async deleteDocumentFromStorage(documentPath: string): Promise<void> {
    if (!this.storageManager) {
      throw new Error("Supabase Storage is not configured");
    }

    try {
      await this.storageManager.deleteDocument(documentPath);

      const documentId = await this.getDocumentIdFromPath(documentPath);
      if (documentId) {
        await this.deleteDocument(documentId);
      }
    } catch (error) {
      throw new Error(`Failed to delete document from storage: ${error}`);
    }
  }

  /**
   * Lists the documents held in storage.
   *
   * @throws Error if storage is not configured.
   */
  async listStorageDocuments(): Promise<
    Array<{
      name: string;
      id: string;
      updated_at: string;
      size: number;
      metadata: Record<string, any>;
    }>
  > {
    if (!this.storageManager) {
      throw new Error("Supabase Storage is not configured");
    }

    return this.storageManager.listDocuments();
  }

  /**
   * Returns storage statistics from the storage manager.
   *
   * @throws Error if storage is not configured.
   */
  async getStorageStats(): Promise<{
    totalFiles: number;
    totalSize: number;
    bucketName: string;
  }> {
    if (!this.storageManager) {
      throw new Error("Supabase Storage is not configured");
    }

    return this.storageManager.getStorageStats();
  }

  /**
   * Resolves a storage path to a document ID.
   *
   * Prefers the ID recorded when the path was ingested through
   * `addDocument`. For paths this instance never ingested it falls back to
   * the legacy derivation (base64 of the path with non-alphanumerics
   * stripped), which may not correspond to a stored document.
   */
  private async getDocumentIdFromPath(path: string): Promise<string | null> {
    const recordedId = this.documentIdsByPath.get(path);
    if (recordedId !== undefined) {
      return recordedId;
    }

    return Buffer.from(path)
      .toString("base64")
      .replace(/[^a-zA-Z0-9]/g, "");
  }

  /** Whether a Supabase storage manager was created from the config. */
  isStorageConfigured(): boolean {
    return !!this.storageManager;
  }

  // Search configuration and statistics methods

  /**
   * Applies a partial configuration to the hybrid retriever.
   *
   * @throws Error if the hybrid retriever is not initialized.
   */
  updateHybridSearchConfig(config: Partial<HybridSearchConfig>): void {
    if (!this.hybridRetriever) {
      throw new Error("Hybrid retriever is not initialized");
    }

    this.hybridRetriever.updateConfig(config);
  }

  /**
   * Returns vector-store statistics and, when a hybrid retriever exists, its
   * search statistics.
   */
  getSearchStats(): {
    hybridRetriever?: {
      indexSize: number;
      config: HybridSearchConfig;
    };
    vectorStore: {
      chunkCount: number;
      embeddingModel: string;
    };
  } {
    // NOTE(review): kept loosely typed because the exact return type of
    // HybridRetriever.getSearchStats() is not visible from this file.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const stats: any = {
      vectorStore: {
        chunkCount: this.getVectorStoreChunkCount(),
        embeddingModel: this.config.embeddingModel,
      },
    };

    if (this.hybridRetriever) {
      stats.hybridRetriever = this.hybridRetriever.getSearchStats();
    }

    return stats;
  }

  /** Clears the hybrid retriever's index, if a retriever exists. */
  clearSearchIndex(): void {
    if (this.hybridRetriever) {
      this.hybridRetriever.clearIndex();
    }
  }

  /**
   * Placeholder: currently only logs the requested synonyms. Registering them
   * would require HybridRetriever to expose its query processor.
   */
  addSearchSynonyms(word: string, synonyms: string[]): void {
    if (this.hybridRetriever) {
      console.log(`Adding synonyms for "${word}": ${synonyms.join(", ")}`);
    }
  }

  /**
   * Searches using an explicitly chosen method and optional weighting.
   *
   * When either weight is supplied, the hybrid retriever's configuration is
   * updated before searching; unspecified values fall back to 0.7 (vector),
   * 0.3 (keyword) and 0.1 (minScore). An explicit `0` is honoured.
   *
   * `"vector"` uses `vectorSearch`, `"hybrid"` uses `hybridSearch`, and any
   * other method (including `"keyword"`) falls through to `search`.
   *
   * @param query Search text.
   * @param options Method, result count and optional weights.
   */
  async searchWithOptions(
    query: string,
    options: {
      method?: "hybrid" | "vector" | "keyword";
      k?: number;
      minScore?: number;
      vectorWeight?: number;
      keywordWeight?: number;
    } = {}
  ): Promise<SearchResult[]> {
    const { method = "hybrid", k = 5 } = options;
    const { vectorWeight, keywordWeight, minScore } = options;

    // Compare against undefined and default with `??` so that a caller can
    // pass 0 (e.g. vectorWeight: 0 for keyword-only scoring) without it
    // being mistaken for "not provided".
    const weightsProvided =
      vectorWeight !== undefined || keywordWeight !== undefined;
    if (this.hybridRetriever && weightsProvided) {
      this.hybridRetriever.updateConfig({
        vectorWeight: vectorWeight ?? 0.7,
        keywordWeight: keywordWeight ?? 0.3,
        minScore: minScore ?? 0.1,
      });
    }

    switch (method) {
      case "vector":
        return this.vectorSearch(query, k);
      case "hybrid":
        return this.hybridSearch(query, k);
      default:
        return this.search(query, k);
    }
  }

  // RSS-related methods

  /**
   * Registers a Naver blog feed and ingests its current items.
   *
   * @param blogId Naver blog identifier.
   * @param feedName Optional feed name; defaults to `naver_blog_<blogId>`.
   * @param config Optional loader settings.
   * @returns IDs of ingested documents, the feed name and the item count.
   * @throws Error if the feed yields no items or ingestion fails.
   */
  async addNaverBlogRSS(
    blogId: string,
    feedName?: string,
    config?: { maxItems?: number; includeContent?: boolean }
  ): Promise<{ documentIds: string[]; feedName: string; itemCount: number }> {
    try {
      const loader = new NaverBlogRSSLoader(blogId, config);
      const name = feedName || `naver_blog_${blogId}`;

      this.rssManager.addFeed(name, loader);

      const documents = await loader.loadBlog();
      if (documents.length === 0) {
        throw new Error(`No documents found in Naver blog RSS: ${blogId}`);
      }

      const processedDocs = await this.ingestRSSDocuments(documents);

      return {
        documentIds: processedDocs.map((doc) => doc.id),
        feedName: name,
        itemCount: documents.length,
      };
    } catch (error) {
      throw new Error(`Failed to add Naver blog RSS: ${error}`);
    }
  }

  /**
   * Registers a generic RSS feed and ingests its current items.
   *
   * @param url Feed URL.
   * @param feedName Optional feed name; defaults to `rss_feed_<timestamp>`.
   * @param config Optional loader settings.
   * @returns IDs of ingested documents, the feed name and the item count.
   * @throws Error if the feed yields no items or ingestion fails.
   */
  async addRSSFeed(
    url: string,
    feedName?: string,
    config?: { maxItems?: number; includeContent?: boolean }
  ): Promise<{ documentIds: string[]; feedName: string; itemCount: number }> {
    try {
      const loader = new RSSLoader(config);
      const name = feedName || `rss_feed_${Date.now()}`;

      this.rssManager.addFeed(name, loader);

      const documents = await loader.loadFromURL(url);
      if (documents.length === 0) {
        throw new Error(`No documents found in RSS feed: ${url}`);
      }

      const processedDocs = await this.ingestRSSDocuments(documents);

      return {
        documentIds: processedDocs.map((doc) => doc.id),
        feedName: name,
        itemCount: documents.length,
      };
    } catch (error) {
      throw new Error(`Failed to add RSS feed: ${error}`);
    }
  }

  /**
   * Re-loads a registered feed and ingests its items again. Only Naver blog
   * feeds can be refreshed; generic feeds do not retain their URL. Items are
   * added, not replaced, so repeated refreshes may create duplicates.
   *
   * @returns IDs of ingested documents and the item count.
   * @throws Error if the feed is unknown, is generic, or ingestion fails.
   */
  async refreshRSSFeed(
    feedName: string
  ): Promise<{ documentIds: string[]; itemCount: number }> {
    try {
      const loader = this.rssManager.getFeed(feedName);
      if (!loader) {
        throw new Error(`RSS feed not found: ${feedName}`);
      }

      let documents: Document[] = [];

      if (loader instanceof NaverBlogRSSLoader) {
        documents = await loader.loadBlog();
      } else {
        // For generic RSS loader, we would need the URL
        // This is a limitation of the current design
        throw new Error(
          "Cannot refresh generic RSS feed without URL. Please use addRSSFeed with the URL again."
        );
      }

      if (documents.length === 0) {
        return { documentIds: [], itemCount: 0 };
      }

      const processedDocs = await this.ingestRSSDocuments(documents);

      return {
        documentIds: processedDocs.map((doc) => doc.id),
        itemCount: documents.length,
      };
    } catch (error) {
      throw new Error(`Failed to refresh RSS feed: ${error}`);
    }
  }

  /**
   * Refreshes every registered feed. A failing feed is logged and reported
   * with an empty result rather than aborting the whole run.
   */
  async refreshAllRSSFeeds(): Promise<
    Map<string, { documentIds: string[]; itemCount: number }>
  > {
    const results = new Map<
      string,
      { documentIds: string[]; itemCount: number }
    >();
    const feedNames = this.rssManager.getFeedNames();

    for (const feedName of feedNames) {
      try {
        const result = await this.refreshRSSFeed(feedName);
        results.set(feedName, result);
      } catch (error) {
        console.error(`Failed to refresh RSS feed ${feedName}:`, error);
        results.set(feedName, { documentIds: [], itemCount: 0 });
      }
    }

    return results;
  }

  /** Unregisters a feed; returns the RSS manager's result. */
  removeRSSFeed(feedName: string): boolean {
    return this.rssManager.removeFeed(feedName);
  }

  /** Names of all registered feeds. */
  getRSSFeedNames(): string[] {
    return this.rssManager.getFeedNames();
  }

  /**
   * Describes a registered feed: whether it exists, whether it is a Naver or
   * generic feed, and for Naver feeds the blog ID and RSS URL.
   */
  getRSSFeedInfo(feedName: string): {
    exists: boolean;
    type: "naver" | "generic" | null;
    blogId?: string;
    rssUrl?: string;
  } {
    const loader = this.rssManager.getFeed(feedName);

    if (!loader) {
      return { exists: false, type: null };
    }

    if (loader instanceof NaverBlogRSSLoader) {
      return {
        exists: true,
        type: "naver",
        blogId: loader.getBlogId(),
        rssUrl: loader.getRSSUrl(),
      };
    }

    return {
      exists: true,
      type: "generic",
    };
  }

  /** Utility: extracts the blog ID from a Naver blog URL, or null. */
  static extractNaverBlogId(url: string): string | null {
    return NaverBlogRSSLoader.extractBlogId(url);
  }
}