UNPKG

@restnfeel/agentc-starter-kit

Version:

한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템

159 lines (131 loc) 4.49 kB
import { Document, DocumentChunk } from "../types";
import { DocumentLoaderFactory } from "../loaders";
import {
  RecursiveTextSplitter,
  TextSplitterConfig,
} from "../splitters/recursive";

/** Options controlling which stages of {@link DocumentProcessingPipeline} run. */
export interface ProcessingPipelineConfig {
  /** Configuration handed to the `RecursiveTextSplitter` constructor. */
  textSplitter: TextSplitterConfig;
  /** Fill in language/description/categories metadata. Defaults to `true`. */
  enableMetadataExtraction?: boolean;
  /** Normalize whitespace in the document content before splitting. Defaults to `true`. */
  enableTextCleaning?: boolean;
}

/** Maximum number of characters copied from the first paragraph into the description. */
const DESCRIPTION_MAX_LENGTH = 200;

/** A paragraph must be longer than this (after trimming) to be used as the description. */
const DESCRIPTION_MIN_PARAGRAPH_LENGTH = 50;

/** Share of Hangul syllables (relative to total length) above which text is tagged "ko". */
const KOREAN_RATIO_THRESHOLD = 0.1;

/**
 * Keyword table for the keyword-based categorizer.
 *
 * - String entries are lower-case and matched as substrings of the lower-cased text.
 * - RegExp entries are tested against the ORIGINAL text. This is used for the
 *   acronym "AI": a lower-cased substring match on "ai" would hit words such as
 *   "said" or "main", so it is matched as a case-sensitive whole word instead.
 *   The patterns deliberately have no `g` flag so that `test()` stays stateless.
 */
const CATEGORY_KEYWORDS: Readonly<
  Record<string, ReadonlyArray<string | RegExp>>
> = {
  technology: [
    "기술",
    "소프트웨어",
    "개발",
    /\bAI\b/,
    "머신러닝",
    "technology",
    "software",
    "development",
  ],
  business: [
    "비즈니스",
    "사업",
    "경영",
    "마케팅",
    "business",
    "marketing",
    "strategy",
  ],
  education: ["교육", "학습", "연구", "education", "learning", "research"],
  legal: ["법률", "계약", "규정", "legal", "contract", "regulation"],
};

/**
 * Loads a document, optionally enriches its metadata and cleans its text,
 * then splits it into chunks.
 */
export class DocumentProcessingPipeline {
  private readonly loaderFactory: DocumentLoaderFactory;
  private readonly textSplitter: RecursiveTextSplitter;
  private readonly config: ProcessingPipelineConfig;

  /**
   * @param config Pipeline options. Both optional flags default to `true`.
   */
  constructor(config: ProcessingPipelineConfig) {
    this.config = {
      ...config,
      // Use `??` rather than relying on spread order: an explicitly passed
      // `undefined` must fall back to the default, not disable the stage.
      enableMetadataExtraction: config.enableMetadataExtraction ?? true,
      enableTextCleaning: config.enableTextCleaning ?? true,
    };

    this.loaderFactory = new DocumentLoaderFactory();
    this.textSplitter = new RecursiveTextSplitter(config.textSplitter);
  }

  /**
   * Runs the full pipeline on a single file.
   *
   * @param filePath Path of the file; forwarded to the loader factory and used in error messages.
   * @param content Raw file bytes, forwarded to the loader factory.
   * @returns The loaded document with `chunks` populated.
   * @throws Error wrapping any failure of a pipeline stage, prefixed with the file path.
   */
  async processDocument(filePath: string, content: Buffer): Promise<Document> {
    try {
      // Step 1: Load and parse document
      const document = await this.loaderFactory.loadDocument(filePath, content);

      // Step 2: Extract additional metadata if enabled.
      // This runs BEFORE cleaning, so it sees the raw loaded content.
      if (this.config.enableMetadataExtraction) {
        await this.extractMetadata(document);
      }

      // Step 3: Clean text if enabled
      if (this.config.enableTextCleaning) {
        this.cleanDocumentText(document);
      }

      // Step 4: Split document into chunks
      const chunks = await this.textSplitter.splitDocument(document);
      document.chunks = chunks;

      return document;
    } catch (error) {
      throw new Error(`Failed to process document ${filePath}: ${error}`);
    }
  }

  /**
   * Processes files one after another, skipping those that fail.
   *
   * Failures are not thrown: they are collected and reported once through
   * `console.warn`, so the returned array may be shorter than `files`.
   *
   * @param files Files to process, each with its path and raw content.
   * @returns Documents for the files that were processed successfully, in input order.
   */
  async processBatch(
    files: Array<{ path: string; content: Buffer }>
  ): Promise<Document[]> {
    const results: Document[] = [];
    const errors: Array<{ file: string; error: string }> = [];

    for (const file of files) {
      try {
        const document = await this.processDocument(file.path, file.content);
        results.push(document);
      } catch (error) {
        errors.push({ file: file.path, error: String(error) });
      }
    }

    if (errors.length > 0) {
      console.warn("Some files failed to process:", errors);
    }

    return results;
  }

  /** @returns The extensions reported by the underlying loader factory. */
  getSupportedExtensions(): string[] {
    return this.loaderFactory.getSupportedExtensions();
  }

  /**
   * Fills `language`, `description` and `categories` on `document.metadata`
   * from the document content. Mutates the document in place.
   */
  private async extractMetadata(document: Document): Promise<void> {
    // Extract language (simple heuristic)
    document.metadata.language = this.detectLanguage(document.content);

    // Extract additional metadata based on content
    document.metadata.description = this.extractDescription(document.content);

    // Auto-categorize based on content
    document.metadata.categories = this.extractCategories(document.content);
  }

  /**
   * Normalizes whitespace in `document.content` in place while keeping line
   * and paragraph structure.
   *
   * Newlines are handled separately from other whitespace: collapsing every
   * whitespace run (newlines included) into a single space would erase all
   * paragraph breaks before they could be normalized.
   */
  private cleanDocumentText(document: Document): void {
    document.content = document.content
      // Unify CRLF / lone CR line endings.
      .replace(/\r\n?/g, "\n")
      // Collapse runs of non-newline whitespace into one space.
      .replace(/[^\S\n]+/g, " ")
      // Drop the space left on either side of a line break.
      .replace(/ ?\n ?/g, "\n")
      // Three or more consecutive newlines become a single paragraph break.
      .replace(/\n{3,}/g, "\n\n")
      .trim();
  }

  /**
   * Tags text as Korean when Hangul syllables make up more than 10% of its
   * characters; everything else, including empty text, is tagged English.
   *
   * @returns `"ko"` or `"en"`.
   */
  private detectLanguage(text: string): string {
    const koreanCount = text.match(/[가-힣]/g)?.length ?? 0;

    return koreanCount > text.length * KOREAN_RATIO_THRESHOLD ? "ko" : "en";
  }

  /**
   * Builds a short description from the first paragraph (separated by blank
   * lines) whose trimmed length exceeds 50 characters.
   *
   * @returns Up to 200 characters of that paragraph followed by `"..."`, or
   *   `"No description available"` when no paragraph qualifies.
   */
  private extractDescription(text: string): string {
    const firstParagraph = text
      .split("\n\n")
      .find((p) => p.trim().length > DESCRIPTION_MIN_PARAGRAPH_LENGTH);

    // Decide on the fallback BEFORE concatenating: `undefined + "..."` is the
    // truthy string "undefined...", which would defeat an `||` fallback.
    if (firstParagraph === undefined) {
      return "No description available";
    }

    return firstParagraph.substring(0, DESCRIPTION_MAX_LENGTH) + "...";
  }

  /**
   * Assigns every category for which at least one keyword occurs in the text.
   *
   * @returns Matching category names in keyword-table order; empty if none match.
   */
  private extractCategories(text: string): string[] {
    const lowerText = text.toLowerCase();

    return Object.entries(CATEGORY_KEYWORDS)
      .filter(([, words]) =>
        words.some((word) =>
          typeof word === "string" ? lowerText.includes(word) : word.test(text)
        )
      )
      .map(([category]) => category);
  }
}