@restnfeel/agentc-starter-kit
Version:
한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템
159 lines (131 loc) • 4.49 kB
text/typescript
import { Document, DocumentChunk } from "../types";
import { DocumentLoaderFactory } from "../loaders";
import {
RecursiveTextSplitter,
TextSplitterConfig,
} from "../splitters/recursive";
/**
 * Options accepted by the DocumentProcessingPipeline constructor.
 */
export interface ProcessingPipelineConfig {
/** Configuration handed to the RecursiveTextSplitter that produces the chunks. */
textSplitter: TextSplitterConfig;
/**
 * When enabled, language, description and categories are derived from the
 * document content and written to its metadata. The pipeline constructor
 * defaults this to `true`.
 */
enableMetadataExtraction?: boolean;
/**
 * When enabled, whitespace in the document content is normalized before the
 * document is split. The pipeline constructor defaults this to `true`.
 */
enableTextCleaning?: boolean;
}
/**
 * Turns a raw file into a chunked `Document`.
 *
 * The pipeline loads the file through a `DocumentLoaderFactory`, optionally
 * derives metadata (language, description, categories) from the raw content,
 * optionally normalizes whitespace, and finally splits the content with a
 * `RecursiveTextSplitter`.
 */
export class DocumentProcessingPipeline {
  /**
   * Keyword table used for categorization. Insertion order is the order in
   * which categories are reported. All-caps ASCII entries (e.g. "AI") are
   * treated as acronyms and matched as whole words; everything else is
   * matched as a case-insensitive substring.
   */
  private static readonly CATEGORY_KEYWORDS: Readonly<
    Record<string, readonly string[]>
  > = {
    technology: [
      "기술",
      "소프트웨어",
      "개발",
      "AI",
      "머신러닝",
      "technology",
      "software",
      "development",
    ],
    business: [
      "비즈니스",
      "사업",
      "경영",
      "마케팅",
      "business",
      "marketing",
      "strategy",
    ],
    education: ["교육", "학습", "연구", "education", "learning", "research"],
    legal: ["법률", "계약", "규정", "legal", "contract", "regulation"],
  };

  private loaderFactory: DocumentLoaderFactory;
  private textSplitter: RecursiveTextSplitter;
  private config: ProcessingPipelineConfig;

  /**
   * @param config Pipeline options. Both optional stages default to enabled;
   *   an explicit `undefined` is treated the same as an omitted option.
   */
  constructor(config: ProcessingPipelineConfig) {
    this.config = {
      ...config,
      // `??` rather than spread-over-defaults so that `{ flag: undefined }`
      // does not silently switch a stage off.
      enableMetadataExtraction: config.enableMetadataExtraction ?? true,
      enableTextCleaning: config.enableTextCleaning ?? true,
    };
    this.loaderFactory = new DocumentLoaderFactory();
    this.textSplitter = new RecursiveTextSplitter(config.textSplitter);
  }

  /**
   * Loads, enriches, cleans and chunks a single file.
   *
   * @param filePath Path of the file; passed to the loader and used in error messages.
   * @param content Raw file bytes.
   * @returns The loaded document with its `chunks` populated.
   * @throws Error wrapping any failure from loading, enrichment or splitting,
   *   prefixed with the file path.
   */
  async processDocument(filePath: string, content: Buffer): Promise<Document> {
    try {
      // Step 1: Load and parse document
      const document = await this.loaderFactory.loadDocument(filePath, content);

      // Step 2: Extract additional metadata if enabled (runs on the raw,
      // not-yet-cleaned content)
      if (this.config.enableMetadataExtraction) {
        await this.extractMetadata(document);
      }

      // Step 3: Clean text if enabled
      if (this.config.enableTextCleaning) {
        this.cleanDocumentText(document);
      }

      // Step 4: Split document into chunks
      const chunks = await this.textSplitter.splitDocument(document);
      document.chunks = chunks;
      return document;
    } catch (error) {
      throw new Error(`Failed to process document ${filePath}: ${error}`);
    }
  }

  /**
   * Processes files one after another, in input order.
   *
   * A file that fails does not abort the batch: its error is collected and,
   * if any failures occurred, they are reported once via `console.warn`.
   *
   * @param files Files to process.
   * @returns The documents that were processed successfully, in input order.
   */
  async processBatch(
    files: Array<{ path: string; content: Buffer }>
  ): Promise<Document[]> {
    const results: Document[] = [];
    const errors: Array<{ file: string; error: string }> = [];

    for (const file of files) {
      try {
        const document = await this.processDocument(file.path, file.content);
        results.push(document);
      } catch (error) {
        errors.push({ file: file.path, error: String(error) });
      }
    }

    if (errors.length > 0) {
      console.warn("Some files failed to process:", errors);
    }
    return results;
  }

  /** Returns the file extensions reported by the loader factory. */
  getSupportedExtensions(): string[] {
    return this.loaderFactory.getSupportedExtensions();
  }

  /** Writes language, description and categories into `document.metadata`. */
  private async extractMetadata(document: Document): Promise<void> {
    // Extract language (simple heuristic)
    document.metadata.language = this.detectLanguage(document.content);
    // Extract additional metadata based on content
    document.metadata.description = this.extractDescription(document.content);
    // Auto-categorize based on content
    document.metadata.categories = this.extractCategories(document.content);
  }

  /**
   * Normalizes whitespace in `document.content` in place.
   *
   * Runs of spaces, tabs and other non-newline whitespace collapse to a
   * single space, spaces adjacent to a line break are dropped, and runs of
   * three or more line breaks collapse to one blank line. Line breaks
   * themselves are kept so paragraph boundaries survive for the splitter.
   */
  private cleanDocumentText(document: Document): void {
    document.content = document.content
      // Collapse horizontal whitespace only; `\s+` here would also swallow
      // the newlines and make paragraph detection impossible. `\r` is
      // covered by this class and is removed by the next step.
      .replace(/[^\S\n]+/g, " ")
      // At most one space can remain on either side of a newline.
      .replace(/ ?\n ?/g, "\n")
      // Keep a single blank line between paragraphs.
      .replace(/\n{3,}/g, "\n\n")
      .trim();
  }

  /**
   * Guesses the document language.
   *
   * @returns `"ko"` when Hangul syllables make up more than 10% of
   *   `text.length`, otherwise `"en"` (also for empty text).
   */
  private detectLanguage(text: string): string {
    // Simple Korean detection
    const koreanMatches = text.match(/[가-힣]/g);
    if (koreanMatches && koreanMatches.length > text.length * 0.1) {
      return "ko";
    }
    return "en"; // Default to English
  }

  /**
   * Builds a short description from the first paragraph longer than 50
   * characters (after trimming).
   *
   * @returns The paragraph, cut to 200 characters with a trailing `"..."`
   *   when it was longer, or `"No description available"` when no paragraph
   *   qualifies.
   */
  private extractDescription(text: string): string {
    // Split on blank lines; `\s*` tolerates CRLF line endings and
    // whitespace-only lines, since this runs before text cleaning.
    const firstParagraph = text
      .split(/\n\s*\n/)
      .map((paragraph) => paragraph.trim())
      .find((paragraph) => paragraph.length > 50);

    if (firstParagraph === undefined) {
      return "No description available";
    }
    return firstParagraph.length > 200
      ? firstParagraph.substring(0, 200) + "..."
      : firstParagraph;
  }

  /**
   * Assigns categories by keyword lookup.
   *
   * @returns Every category with at least one matching keyword, in the
   *   order technology, business, education, legal.
   */
  private extractCategories(text: string): string[] {
    const lowerText = text.toLowerCase();
    const categories: string[] = [];

    for (const [category, words] of Object.entries(
      DocumentProcessingPipeline.CATEGORY_KEYWORDS
    )) {
      if (words.some((word) => this.matchesKeyword(text, lowerText, word))) {
        categories.push(category);
      }
    }
    return categories;
  }

  /** Tests one keyword against the text (see CATEGORY_KEYWORDS for the rules). */
  private matchesKeyword(
    text: string,
    lowerText: string,
    keyword: string
  ): boolean {
    if (/^[A-Z]{2,}$/.test(keyword)) {
      // Acronym: compare against the original casing and require word
      // boundaries, otherwise "AI" would either never match (lower-cased
      // text) or match inside words such as "said" and "main".
      return new RegExp(`\\b${keyword}\\b`).test(text);
    }
    return lowerText.includes(keyword.toLowerCase());
  }
}