@restnfeel/agentc-starter-kit
Version:
한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템
123 lines (120 loc) • 4.47 kB
JavaScript
import { DocumentLoaderFactory } from '../loaders/index.js';
import { RecursiveTextSplitter } from '../splitters/recursive.js';
/**
 * Pipeline that loads a document, optionally enriches its metadata and
 * cleans its text, and finally splits it into chunks.
 *
 * Steps, in order: load -> extract metadata -> clean text -> split.
 * Metadata is therefore derived from the text as loaded, before cleaning.
 */
class DocumentProcessingPipeline {
  /**
   * @param {object} [config] - Pipeline options, merged over the defaults.
   *   `enableMetadataExtraction` and `enableTextCleaning` default to `true`;
   *   `textSplitter` is forwarded as-is to `RecursiveTextSplitter`.
   */
  constructor(config = {}) {
    // Accept an explicit null as well as an omitted argument.
    const options = config === null ? {} : config;
    this.config = {
      enableMetadataExtraction: true,
      enableTextCleaning: true,
      ...options,
    };
    this.loaderFactory = new DocumentLoaderFactory();
    this.textSplitter = new RecursiveTextSplitter(options.textSplitter);
  }

  /**
   * Runs the full pipeline for one document.
   *
   * @param {string} filePath - Path handed to the loader factory; also used in error messages.
   * @param {*} content - Raw content handed to the loader factory.
   * @returns {Promise<object>} The loaded document with a `chunks` property attached.
   * @throws {Error} Wrapping any failure; the original error is available as `cause`.
   */
  async processDocument(filePath, content) {
    try {
      // Step 1: Load and parse document
      const document = await this.loaderFactory.loadDocument(filePath, content);
      // Step 2: Extract additional metadata if enabled
      if (this.config.enableMetadataExtraction) {
        await this.extractMetadata(document);
      }
      // Step 3: Clean text if enabled
      if (this.config.enableTextCleaning) {
        this.cleanDocumentText(document);
      }
      // Step 4: Split document into chunks
      document.chunks = await this.textSplitter.splitDocument(document);
      return document;
    } catch (error) {
      // Keep the original error (and its stack) reachable through `cause`.
      throw new Error(`Failed to process document ${filePath}: ${error}`, {
        cause: error,
      });
    }
  }

  /**
   * Processes files one after another. A failing file does not abort the
   * batch: its error is collected, reported once through `console.warn`,
   * and the file is left out of the result.
   *
   * @param {Array<{path: string, content: *}>} files - Files to process.
   * @returns {Promise<object[]>} Successfully processed documents, in input order.
   */
  async processBatch(files) {
    const results = [];
    const errors = [];
    for (const file of files) {
      try {
        const document = await this.processDocument(file.path, file.content);
        results.push(document);
      } catch (error) {
        errors.push({ file: file.path, error: String(error) });
      }
    }
    if (errors.length > 0) {
      console.warn("Some files failed to process:", errors);
    }
    return results;
  }

  /**
   * @returns {*} Whatever the loader factory reports as supported extensions.
   */
  getSupportedExtensions() {
    return this.loaderFactory.getSupportedExtensions();
  }

  /**
   * Fills `language`, `description` and `categories` on `document.metadata`
   * from `document.content`. Mutates the document in place.
   *
   * @param {object} document - Document with a string `content`.
   * @returns {Promise<void>}
   */
  async extractMetadata(document) {
    // A loader that returns no metadata object should not crash extraction.
    if (!document.metadata) {
      document.metadata = {};
    }
    const text = document.content;
    // Extract language (simple heuristic)
    document.metadata.language = this.detectLanguage(text);
    // Extract additional metadata based on content
    document.metadata.description = this.extractDescription(text);
    // Auto-categorize based on content
    document.metadata.categories = this.extractCategories(text);
  }

  /**
   * Normalizes whitespace in `document.content` in place: runs of spaces and
   * tabs become one space, whitespace around line breaks is dropped, and
   * runs of blank lines become a single blank line. Line breaks themselves
   * are kept so paragraph boundaries survive for the splitter.
   *
   * @param {object} document - Document with a string `content`.
   */
  cleanDocumentText(document) {
    document.content = document.content
      // Unify line endings first so the steps below only deal with "\n".
      .replace(/\r\n?/g, "\n")
      // Collapse horizontal whitespace only; "\s" would also eat newlines.
      .replace(/[^\S\n]+/g, " ")
      // Drop the (at most one) space left on either side of a newline.
      .replace(/ ?\n ?/g, "\n")
      // Any run of blank lines becomes exactly one paragraph break.
      .replace(/\n{3,}/g, "\n\n")
      .trim();
  }

  /**
   * Very rough language guess: "ko" when more than 10% of the characters
   * are Hangul syllables, otherwise "en".
   *
   * @param {string} text
   * @returns {"ko"|"en"}
   */
  detectLanguage(text) {
    // Simple Korean detection
    const koreanRegex = /[가-힣]/g;
    const koreanMatches = text.match(koreanRegex);
    if (koreanMatches && koreanMatches.length > text.length * 0.1) {
      return "ko";
    }
    return "en"; // Default to English
  }

  /**
   * Uses the first paragraph (blank-line separated) longer than 50 trimmed
   * characters as the description, cut to 200 characters and followed by
   * "...".
   *
   * @param {string} text
   * @returns {string} The description, or "No description available" when
   *   no paragraph qualifies.
   */
  extractDescription(text) {
    const firstParagraph = text
      .split("\n\n")
      .find((paragraph) => paragraph.trim().length > 50);
    // Check explicitly: concatenating "..." onto undefined would produce the
    // truthy string "undefined..." and hide the fallback.
    if (firstParagraph === undefined) {
      return "No description available";
    }
    return `${firstParagraph.substring(0, 200)}...`;
  }

  /**
   * Keyword-based categorization. Matching is case-insensitive.
   *
   * @param {string} text
   * @returns {string[]} Names of every category with at least one keyword hit.
   */
  extractCategories(text) {
    // Simple keyword-based categorization
    const keywords = {
      technology: [
        "기술",
        "소프트웨어",
        "개발",
        "AI",
        "머신러닝",
        "technology",
        "software",
        "development",
      ],
      business: [
        "비즈니스",
        "사업",
        "경영",
        "마케팅",
        "business",
        "marketing",
        "strategy",
      ],
      education: ["교육", "학습", "연구", "education", "learning", "research"],
      legal: ["법률", "계약", "규정", "legal", "contract", "regulation"],
    };
    const lowerText = text.toLowerCase();
    const matchesKeyword = (word) => {
      // The haystack is lowercased, so the needle must be too ("AI" -> "ai").
      const needle = word.toLowerCase();
      // Short Latin keywords are matched as whole words; as plain substrings
      // they would hit inside unrelated words such as "said" or "email".
      // The needle is alphanumeric here, so no regex escaping is required.
      if (/^[a-z0-9]{1,3}$/.test(needle)) {
        return new RegExp(`\\b${needle}\\b`).test(lowerText);
      }
      return lowerText.includes(needle);
    };
    const categories = [];
    for (const [category, words] of Object.entries(keywords)) {
      if (words.some(matchesKeyword)) {
        categories.push(category);
      }
    }
    return categories;
  }
}
export { DocumentProcessingPipeline };
//# sourceMappingURL=pipeline.js.map