@restnfeel/agentc-starter-kit
Version:
한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템
689 lines (599 loc) • 19.7 kB
text/typescript
import {
RAGConfig,
Document,
SearchResult,
VectorStore,
EmbeddingModel,
} from "./types";
import {
DocumentProcessingPipeline,
ProcessingPipelineConfig,
} from "./processors/pipeline";
import { VectorStoreFactory, VectorStoreType } from "./vectorstore";
import { EmbeddingFactory, EmbeddingConfig } from "./embeddings";
import {
SupabaseStorageManager,
SupabaseStorageConfig,
DocumentUploadResult,
} from "./storage";
import { HybridRetriever, HybridSearchConfig } from "./retrieval";
import { RSSLoader, NaverBlogRSSLoader, RSSFeedManager } from "./loaders/rss";
/**
 * Facade that wires together the embedding model, vector store, document
 * processing pipeline, optional Supabase storage, hybrid retriever and RSS
 * feed manager, and exposes document ingestion / search operations on top of
 * them.
 *
 * All collaborators are created synchronously in the constructor; call
 * {@link RAGEngine.initialize} afterwards to run their asynchronous set-up.
 */
export class RAGEngine {
  /** Vector store backend requested from the factory (the only one wired up so far). */
  private static readonly VECTOR_STORE_KIND = "memory";

  /** Hybrid-search settings used at construction time and as fallbacks in `searchWithOptions`. */
  private static readonly DEFAULT_HYBRID_CONFIG = {
    vectorWeight: 0.7,
    keywordWeight: 0.3,
    maxResults: 10,
    minScore: 0.1,
    enableReranking: true,
  };

  private vectorStore!: VectorStore;
  private embeddingModel!: EmbeddingModel;
  private processingPipeline!: DocumentProcessingPipeline;
  private storageManager?: SupabaseStorageManager;
  private hybridRetriever?: HybridRetriever;
  private readonly rssManager: RSSFeedManager;
  private readonly config: RAGConfig;
  /** URL of every generic (non-Naver) feed, keyed by feed name, so the feed can be refreshed later. */
  private readonly rssFeedUrls = new Map<string, string>();

  /**
   * @param config Engine configuration; read for the embedding model, LLM API
   *   key, vector store path, chunking parameters and Supabase credentials.
   */
  constructor(config: RAGConfig) {
    this.config = config;
    this.rssManager = new RSSFeedManager();
    this.initializeComponents();
  }

  /**
   * Builds a new `Error` whose message is `"<context>: <error>"` and which
   * carries the original error as `cause`, so the root failure is not lost.
   */
  private static wrapError(context: string, error: unknown): Error {
    return Object.assign(new Error(`${context}: ${error}`), { cause: error });
  }

  /** Creates every collaborator from `this.config`. Performs no I/O itself. */
  private initializeComponents(): void {
    // Initialize embedding model
    const embeddingConfig: EmbeddingConfig = {
      type: "openai", // Default to OpenAI for now
      apiKey: this.config.llmConfig.apiKey,
      modelName: this.config.embeddingModel,
    };
    this.embeddingModel = EmbeddingFactory.create(embeddingConfig);

    // Initialize vector store
    const vectorStoreConfig = {
      storePath: this.config.vectorStorePath,
    };
    this.vectorStore = VectorStoreFactory.create(
      RAGEngine.VECTOR_STORE_KIND as VectorStoreType,
      this.embeddingModel,
      vectorStoreConfig
    );

    // Initialize processing pipeline
    const pipelineConfig: ProcessingPipelineConfig = {
      textSplitter: {
        chunkSize: this.config.chunkSize,
        chunkOverlap: this.config.chunkOverlap,
      },
      enableMetadataExtraction: true,
      enableTextCleaning: true,
    };
    this.processingPipeline = new DocumentProcessingPipeline(pipelineConfig);

    // Initialize Supabase Storage only if credentials are present. The
    // optional chaining keeps construction from throwing when the whole
    // `supabaseConfig` section is missing at runtime.
    const supabase = this.config.supabaseConfig;
    if (supabase?.url && supabase.anonKey) {
      this.storageManager = new SupabaseStorageManager({
        url: supabase.url,
        anonKey: supabase.anonKey,
        bucket: supabase.bucket,
      });
    }

    // Initialize Hybrid Retriever. A copy of the defaults is passed so the
    // retriever can never mutate the shared static object.
    this.hybridRetriever = new HybridRetriever(
      this.vectorStore,
      this.embeddingModel,
      { ...RAGEngine.DEFAULT_HYBRID_CONFIG }
    );
  }

  /**
   * Feeds every chunk of the given documents into the hybrid retriever's
   * index. Does nothing when no hybrid retriever exists; documents without
   * chunks are skipped.
   */
  private indexChunks(
    documents: ReadonlyArray<{
      chunks?: ReadonlyArray<{ id: string; content: string }>;
    }>
  ): void {
    if (!this.hybridRetriever) {
      return;
    }
    for (const document of documents) {
      for (const chunk of document.chunks ?? []) {
        this.hybridRetriever.updateDocumentIndex(chunk.id, chunk.content);
      }
    }
  }

  /**
   * Returns the configured storage manager.
   *
   * @throws Error when Supabase Storage was not configured.
   */
  private requireStorage(): SupabaseStorageManager {
    if (!this.storageManager) {
      throw new Error("Supabase Storage is not configured");
    }
    return this.storageManager;
  }

  /**
   * Reads the chunk count from the vector store when it exposes a `getCount`
   * function, otherwise 0. The return type is left to inference because
   * `getCount` is feature-detected rather than guaranteed by `VectorStore`.
   */
  private readChunkCount() {
    return "getCount" in this.vectorStore &&
      typeof this.vectorStore.getCount === "function"
      ? this.vectorStore.getCount()
      : 0;
  }

  /**
   * Runs asynchronous set-up: the vector store's `initialize` (when it has
   * one) and the storage bucket initialisation (when storage is configured).
   */
  async initialize(): Promise<void> {
    // Initialize vector store if it has an initialize method
    if (
      "initialize" in this.vectorStore &&
      typeof this.vectorStore.initialize === "function"
    ) {
      await this.vectorStore.initialize();
    }
    // Initialize Supabase Storage bucket if configured
    if (this.storageManager) {
      await this.storageManager.initializeBucket();
    }
  }

  /**
   * Processes one file, stores it in the vector store and indexes its chunks
   * for hybrid search.
   *
   * @param filePath Path handed to the processing pipeline.
   * @param content Raw file bytes.
   * @returns The id of the processed document.
   * @throws Error prefixed with "Failed to add document" on any failure.
   */
  async addDocument(filePath: string, content: Buffer): Promise<string> {
    try {
      const document = await this.processingPipeline.processDocument(
        filePath,
        content
      );
      await this.vectorStore.addDocuments([document]);
      this.indexChunks([document]);
      return document.id;
    } catch (error) {
      throw RAGEngine.wrapError("Failed to add document", error);
    }
  }

  /**
   * Batch variant of {@link RAGEngine.addDocument}.
   *
   * @returns The ids of the processed documents, in pipeline output order.
   * @throws Error prefixed with "Failed to add documents" on any failure.
   */
  async addDocuments(
    files: Array<{ path: string; content: Buffer }>
  ): Promise<string[]> {
    try {
      const documents = await this.processingPipeline.processBatch(files);
      await this.vectorStore.addDocuments(documents);
      this.indexChunks(documents);
      return documents.map((doc) => doc.id);
    } catch (error) {
      throw RAGEngine.wrapError("Failed to add documents", error);
    }
  }

  /**
   * Searches with the hybrid retriever when one exists, otherwise falls back
   * to the plain vector store search.
   *
   * @param query Search text.
   * @param k Maximum number of results requested (default 5).
   * @throws Error prefixed with "Search failed" on any failure.
   */
  async search(query: string, k: number = 5): Promise<SearchResult[]> {
    try {
      if (this.hybridRetriever) {
        return await this.hybridRetriever.search(query, k);
      }
      return await this.vectorStore.search(query, k);
    } catch (error) {
      throw RAGEngine.wrapError("Search failed", error);
    }
  }

  /**
   * Searches the vector store directly, bypassing the hybrid retriever.
   *
   * @throws Error prefixed with "Vector search failed" on any failure.
   */
  async vectorSearch(query: string, k: number = 5): Promise<SearchResult[]> {
    try {
      return await this.vectorStore.search(query, k);
    } catch (error) {
      throw RAGEngine.wrapError("Vector search failed", error);
    }
  }

  /**
   * Searches through the hybrid retriever only.
   *
   * @throws Error "Hybrid retriever is not initialized" when there is no
   *   retriever, or an error prefixed with "Hybrid search failed".
   */
  async hybridSearch(query: string, k: number = 5): Promise<SearchResult[]> {
    if (!this.hybridRetriever) {
      throw new Error("Hybrid retriever is not initialized");
    }
    try {
      return await this.hybridRetriever.search(query, k);
    } catch (error) {
      throw RAGEngine.wrapError("Hybrid search failed", error);
    }
  }

  /**
   * Deletes a document from the vector store.
   *
   * NOTE(review): the hybrid retriever's keyword index is not touched here
   * because no per-document removal API is visible on `HybridRetriever`;
   * entries for the deleted chunks remain until `clearSearchIndex()`/`clear()`.
   *
   * @throws Error prefixed with "Failed to delete document" on any failure.
   */
  async deleteDocument(documentId: string): Promise<void> {
    try {
      await this.vectorStore.delete(documentId);
    } catch (error) {
      throw RAGEngine.wrapError("Failed to delete document", error);
    }
  }

  /**
   * Re-processes a file and replaces the stored document with the given id.
   * The fresh chunks are also pushed into the hybrid retriever index so
   * keyword search sees the updated content.
   *
   * @param documentId Id of the existing document; forced onto the
   *   re-processed document before it is written.
   * @throws Error prefixed with "Failed to update document" on any failure.
   */
  async updateDocument(
    documentId: string,
    filePath: string,
    content: Buffer
  ): Promise<void> {
    try {
      const document = await this.processingPipeline.processDocument(
        filePath,
        content
      );
      // Set the document ID to match the existing one
      document.id = documentId;
      await this.vectorStore.update(document);
      this.indexChunks([document]);
    } catch (error) {
      throw RAGEngine.wrapError("Failed to update document", error);
    }
  }

  /** Returns the file extensions reported by the processing pipeline. */
  getSupportedExtensions(): string[] {
    return this.processingPipeline.getSupportedExtensions();
  }

  /**
   * Reports basic engine statistics. `chunkCount` is awaited so that a vector
   * store with an asynchronous `getCount` still yields a number.
   */
  async getStats(): Promise<{
    documentCount: number;
    chunkCount: number;
    embeddingModel: string;
    vectorStoreType: string;
  }> {
    const chunkCount = await this.readChunkCount();
    return {
      documentCount: 0, // TODO: Track document count separately
      chunkCount,
      embeddingModel: this.config.embeddingModel,
      vectorStoreType: RAGEngine.VECTOR_STORE_KIND, // TODO: Make this dynamic
    };
  }

  /**
   * Empties the vector store (when it exposes `clear`) and the hybrid
   * retriever's index, so keyword search cannot return removed chunks.
   */
  async clear(): Promise<void> {
    if (
      "clear" in this.vectorStore &&
      typeof this.vectorStore.clear === "function"
    ) {
      await this.vectorStore.clear();
    }
    this.hybridRetriever?.clearIndex();
  }

  // Supabase Storage specific methods

  /**
   * Uploads a file to Supabase Storage without adding it to the RAG index.
   *
   * @throws Error "Supabase Storage is not configured" when storage is absent.
   */
  async uploadDocumentToStorage(
    file: Buffer | File,
    fileName: string,
    metadata?: Record<string, any>
  ): Promise<DocumentUploadResult> {
    return this.requireStorage().uploadDocument(file, fileName, metadata);
  }

  /**
   * Downloads a stored file and ingests it via {@link RAGEngine.addDocument}.
   *
   * @returns The id of the ingested document.
   * @throws Error "Supabase Storage is not configured", or an error prefixed
   *   with "Failed to add document from storage".
   */
  async addDocumentFromStorage(filePath: string): Promise<string> {
    const storage = this.requireStorage();
    try {
      const content = await storage.downloadDocument(filePath);
      // `await` is required: without it a rejection from addDocument would
      // escape this try/catch and lose the storage-specific context.
      return await this.addDocument(filePath, content);
    } catch (error) {
      throw RAGEngine.wrapError("Failed to add document from storage", error);
    }
  }

  /**
   * Uploads a file to storage and then ingests the same bytes under the
   * uploaded path.
   *
   * @throws Error "Supabase Storage is not configured", or an error prefixed
   *   with "Failed to upload and add document".
   */
  async uploadAndAddDocument(
    file: Buffer | File,
    fileName: string,
    metadata?: Record<string, any>
  ): Promise<{ documentId: string; uploadResult: DocumentUploadResult }> {
    const storage = this.requireStorage();
    try {
      // Upload to storage first
      const uploadResult = await storage.uploadDocument(
        file,
        fileName,
        metadata
      );
      // Buffer.isBuffer avoids referencing the `File` global, which does not
      // exist in every runtime.
      const buffer = Buffer.isBuffer(file)
        ? file
        : Buffer.from(await file.arrayBuffer());
      const documentId = await this.addDocument(uploadResult.path, buffer);
      return { documentId, uploadResult };
    } catch (error) {
      throw RAGEngine.wrapError("Failed to upload and add document", error);
    }
  }

  /**
   * Deletes a file from storage and then the matching document from the
   * vector store (when an id can be derived from the path).
   *
   * @throws Error "Supabase Storage is not configured", or an error prefixed
   *   with "Failed to delete document from storage".
   */
  async deleteDocumentFromStorage(documentPath: string): Promise<void> {
    const storage = this.requireStorage();
    try {
      await storage.deleteDocument(documentPath);
      const documentId = await this.getDocumentIdFromPath(documentPath);
      if (documentId) {
        await this.deleteDocument(documentId);
      }
    } catch (error) {
      throw RAGEngine.wrapError("Failed to delete document from storage", error);
    }
  }

  /**
   * Lists the documents held in storage.
   *
   * @throws Error "Supabase Storage is not configured" when storage is absent.
   */
  async listStorageDocuments(): Promise<
    Array<{
      name: string;
      id: string;
      updated_at: string;
      size: number;
      metadata: Record<string, any>;
    }>
  > {
    return this.requireStorage().listDocuments();
  }

  /**
   * Returns the storage manager's statistics.
   *
   * @throws Error "Supabase Storage is not configured" when storage is absent.
   */
  async getStorageStats(): Promise<{
    totalFiles: number;
    totalSize: number;
    bucketName: string;
  }> {
    return this.requireStorage().getStorageStats();
  }

  /**
   * Derives a document id from a storage path: base64 of the path with every
   * non-alphanumeric character removed.
   *
   * This is a simplified implementation. In a real scenario, you'd maintain a
   * mapping between storage paths and document IDs.
   * NOTE(review): whether this matches the ids produced by the processing
   * pipeline cannot be confirmed from this file.
   */
  private async getDocumentIdFromPath(path: string): Promise<string | null> {
    return Buffer.from(path)
      .toString("base64")
      .replace(/[^a-zA-Z0-9]/g, "");
  }

  /** Whether a Supabase storage manager was created. */
  isStorageConfigured(): boolean {
    return !!this.storageManager;
  }

  // Search configuration and statistics methods

  /**
   * Forwards a partial configuration to the hybrid retriever.
   *
   * @throws Error "Hybrid retriever is not initialized" when there is none.
   */
  updateHybridSearchConfig(config: Partial<HybridSearchConfig>): void {
    if (!this.hybridRetriever) {
      throw new Error("Hybrid retriever is not initialized");
    }
    this.hybridRetriever.updateConfig(config);
  }

  /**
   * Collects search statistics: the vector store chunk count plus, when a
   * hybrid retriever exists, whatever its `getSearchStats()` reports.
   */
  getSearchStats(): {
    hybridRetriever?: {
      indexSize: number;
      config: HybridSearchConfig;
    };
    vectorStore: {
      chunkCount: number;
      embeddingModel: string;
    };
  } {
    const stats: {
      hybridRetriever?: { indexSize: number; config: HybridSearchConfig };
      vectorStore: { chunkCount: number; embeddingModel: string };
    } = {
      vectorStore: {
        chunkCount: this.readChunkCount(),
        embeddingModel: this.config.embeddingModel,
      },
    };
    if (this.hybridRetriever) {
      stats.hybridRetriever = this.hybridRetriever.getSearchStats();
    }
    return stats;
  }

  /** Clears the hybrid retriever's index, if a retriever exists. */
  clearSearchIndex(): void {
    this.hybridRetriever?.clearIndex();
  }

  /**
   * Placeholder: currently only logs the request. Registering synonyms would
   * require the hybrid retriever to expose its query processor.
   */
  addSearchSynonyms(word: string, synonyms: string[]): void {
    if (this.hybridRetriever) {
      // Access the query processor through the hybrid retriever
      // Note: This would require exposing the query processor in HybridRetriever
      console.log(`Adding synonyms for "${word}": ${synonyms.join(", ")}`);
    }
  }

  /**
   * Searches with per-call options.
   *
   * When `vectorWeight` or `keywordWeight` is given, the retriever config is
   * updated with both weights and `minScore`, using the engine defaults
   * (0.7 / 0.3 / 0.1) for whichever of the three is omitted. When only
   * `minScore` is given, only `minScore` is updated. Explicit zeros are
   * honoured. The update is applied to the shared retriever, so it stays in
   * effect for later searches.
   *
   * `method` "vector" uses {@link RAGEngine.vectorSearch}, "hybrid" (default)
   * uses {@link RAGEngine.hybridSearch}; "keyword" is accepted but currently
   * routed through {@link RAGEngine.search}.
   */
  async searchWithOptions(
    query: string,
    options: {
      method?: "hybrid" | "vector" | "keyword";
      k?: number;
      minScore?: number;
      vectorWeight?: number;
      keywordWeight?: number;
    } = {}
  ): Promise<SearchResult[]> {
    const {
      method = "hybrid",
      k = 5,
      minScore,
      vectorWeight,
      keywordWeight,
    } = options;

    if (this.hybridRetriever) {
      const defaults = RAGEngine.DEFAULT_HYBRID_CONFIG;
      if (vectorWeight !== undefined || keywordWeight !== undefined) {
        // `??` (not `||`) so that an explicit 0 is kept instead of being
        // replaced by the default.
        this.hybridRetriever.updateConfig({
          vectorWeight: vectorWeight ?? defaults.vectorWeight,
          keywordWeight: keywordWeight ?? defaults.keywordWeight,
          minScore: minScore ?? defaults.minScore,
        });
      } else if (minScore !== undefined) {
        this.hybridRetriever.updateConfig({ minScore });
      }
    }

    switch (method) {
      case "vector":
        return this.vectorSearch(query, k);
      case "hybrid":
        return this.hybridSearch(query, k);
      default:
        return this.search(query, k);
    }
  }

  // RSS-related methods

  /**
   * Runs loader-produced feed items through the processing pipeline, merges
   * the feed metadata over the processed metadata, stores the results in the
   * vector store and indexes their chunks.
   *
   * @returns The ids of the processed documents, in input order.
   */
  private async ingestFeedDocuments(
    documents: readonly Document[]
  ): Promise<string[]> {
    const processedDocs = [];
    for (const doc of documents) {
      // Convert Document to a format that pipeline can process
      const buffer = Buffer.from(doc.content, "utf-8");
      const processedDoc = await this.processingPipeline.processDocument(
        doc.source || `rss_${doc.id}`,
        buffer
      );
      // Merge RSS metadata with processed metadata
      processedDoc.metadata = { ...processedDoc.metadata, ...doc.metadata };
      processedDocs.push(processedDoc);
    }
    await this.vectorStore.addDocuments(processedDocs);
    this.indexChunks(processedDocs);
    return processedDocs.map((doc) => doc.id);
  }

  /**
   * Registers a Naver blog feed and ingests its current items.
   *
   * @param blogId Naver blog id.
   * @param feedName Name to register under; defaults to `naver_blog_<blogId>`.
   * @param config Loader options.
   * @throws Error prefixed with "Failed to add Naver blog RSS", including
   *   when the feed yields no documents.
   */
  async addNaverBlogRSS(
    blogId: string,
    feedName?: string,
    config?: { maxItems?: number; includeContent?: boolean }
  ): Promise<{ documentIds: string[]; feedName: string; itemCount: number }> {
    try {
      const loader = new NaverBlogRSSLoader(blogId, config);
      const name = feedName || `naver_blog_${blogId}`;
      // Add to RSS manager
      this.rssManager.addFeed(name, loader);
      // The name now refers to a Naver feed; drop any URL remembered for a
      // generic feed that previously used the same name.
      this.rssFeedUrls.delete(name);

      const documents = await loader.loadBlog();
      if (documents.length === 0) {
        throw new Error(`No documents found in Naver blog RSS: ${blogId}`);
      }
      const documentIds = await this.ingestFeedDocuments(documents);
      return { documentIds, feedName: name, itemCount: documents.length };
    } catch (error) {
      throw RAGEngine.wrapError("Failed to add Naver blog RSS", error);
    }
  }

  /**
   * Registers a generic RSS feed and ingests its current items. The URL is
   * remembered so the feed can be refreshed by name later.
   *
   * @param url Feed URL.
   * @param feedName Name to register under; defaults to `rss_feed_<timestamp>`.
   * @param config Loader options.
   * @throws Error prefixed with "Failed to add RSS feed", including when the
   *   feed yields no documents.
   */
  async addRSSFeed(
    url: string,
    feedName?: string,
    config?: { maxItems?: number; includeContent?: boolean }
  ): Promise<{ documentIds: string[]; feedName: string; itemCount: number }> {
    try {
      const loader = new RSSLoader(config);
      const name = feedName || `rss_feed_${Date.now()}`;
      // Add to RSS manager
      this.rssManager.addFeed(name, loader);
      this.rssFeedUrls.set(name, url);

      const documents = await loader.loadFromURL(url);
      if (documents.length === 0) {
        throw new Error(`No documents found in RSS feed: ${url}`);
      }
      const documentIds = await this.ingestFeedDocuments(documents);
      return { documentIds, feedName: name, itemCount: documents.length };
    } catch (error) {
      throw RAGEngine.wrapError("Failed to add RSS feed", error);
    }
  }

  /**
   * Re-loads a registered feed and ingests its items again.
   *
   * Naver feeds are reloaded through the loader itself; generic feeds are
   * reloaded from the URL remembered by {@link RAGEngine.addRSSFeed}.
   * Items are added, not upserted, so this might create duplicates.
   *
   * @throws Error prefixed with "Failed to refresh RSS feed" when the feed is
   *   unknown, a generic feed has no remembered URL, or ingestion fails.
   */
  async refreshRSSFeed(
    feedName: string
  ): Promise<{ documentIds: string[]; itemCount: number }> {
    try {
      const loader = this.rssManager.getFeed(feedName);
      if (!loader) {
        throw new Error(`RSS feed not found: ${feedName}`);
      }

      const url = this.rssFeedUrls.get(feedName);
      let documents: Document[];
      if (loader instanceof NaverBlogRSSLoader) {
        documents = await loader.loadBlog();
      } else if (url !== undefined && loader instanceof RSSLoader) {
        documents = await loader.loadFromURL(url);
      } else {
        throw new Error(
          "Cannot refresh generic RSS feed without URL. Please use addRSSFeed with the URL again."
        );
      }

      if (documents.length === 0) {
        return { documentIds: [], itemCount: 0 };
      }
      const documentIds = await this.ingestFeedDocuments(documents);
      return { documentIds, itemCount: documents.length };
    } catch (error) {
      throw RAGEngine.wrapError("Failed to refresh RSS feed", error);
    }
  }

  /**
   * Refreshes every registered feed one after another. A feed that fails is
   * logged and reported with an empty result instead of aborting the run.
   */
  async refreshAllRSSFeeds(): Promise<
    Map<string, { documentIds: string[]; itemCount: number }>
  > {
    const results = new Map<
      string,
      { documentIds: string[]; itemCount: number }
    >();
    for (const feedName of this.rssManager.getFeedNames()) {
      try {
        results.set(feedName, await this.refreshRSSFeed(feedName));
      } catch (error) {
        console.error(`Failed to refresh RSS feed ${feedName}:`, error);
        results.set(feedName, { documentIds: [], itemCount: 0 });
      }
    }
    return results;
  }

  /**
   * Unregisters a feed and forgets its remembered URL.
   *
   * @returns Whatever the feed manager's `removeFeed` reports.
   */
  removeRSSFeed(feedName: string): boolean {
    const removed = this.rssManager.removeFeed(feedName);
    this.rssFeedUrls.delete(feedName);
    return removed;
  }

  /** Returns the names of all registered feeds. */
  getRSSFeedNames(): string[] {
    return this.rssManager.getFeedNames();
  }

  /**
   * Describes a registered feed. Naver feeds report their blog id and RSS
   * URL; generic feeds report the URL remembered at registration, if any.
   */
  getRSSFeedInfo(feedName: string): {
    exists: boolean;
    type: "naver" | "generic" | null;
    blogId?: string;
    rssUrl?: string;
  } {
    const loader = this.rssManager.getFeed(feedName);
    if (!loader) {
      return { exists: false, type: null };
    }
    if (loader instanceof NaverBlogRSSLoader) {
      return {
        exists: true,
        type: "naver",
        blogId: loader.getBlogId(),
        rssUrl: loader.getRSSUrl(),
      };
    }
    const rssUrl = this.rssFeedUrls.get(feedName);
    return rssUrl === undefined
      ? { exists: true, type: "generic" }
      : { exists: true, type: "generic", rssUrl };
  }

  // Utility method to extract blog ID from Naver blog URL
  static extractNaverBlogId(url: string): string | null {
    return NaverBlogRSSLoader.extractBlogId(url);
  }
}