UNPKG

@restnfeel/agentc-starter-kit

Version:

한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템

174 lines (171 loc) 6.87 kB
"use client";
import * as fs from 'fs/promises';
import * as path from 'path';
import * as os from 'os';
import { PDFLoader } from '../../node_modules/@langchain/community/dist/document_loaders/fs/pdf.js';

/** Name prefix of the per-load scratch directory created under the OS temp dir. */
const TEMP_DIR_PREFIX = "rag_temp_";

/**
 * Turns a file (path + already-read content) into a document record of the
 * shape `{ id, content, metadata, source }`.
 *
 * PDFs are parsed through LangChain's `PDFLoader`; `txt`, `text` and `md`
 * files are decoded as UTF-8 text.
 */
class DocumentLoaderFactory {
    constructor() {
        // Lower-case extension (without the dot) -> loader kind.
        this.supportedExtensions = new Map([
            ["pdf", "pdf"],
            ["txt", "text"],
            ["text", "text"],
            ["md", "text"],
        ]);
    }

    /**
     * Loads a document, choosing the loader from the file extension.
     *
     * @param {string} filePath - Path (or name) of the file; only its extension
     *   and base name are inspected here.
     * @param {*} content - The file content; written to disk as-is for PDFs and
     *   decoded with `toString("utf-8")` for text files.
     * @returns {Promise<object>} `{ id, content, metadata, source }`.
     * @throws {Error} If the extension is not registered in `supportedExtensions`.
     */
    async loadDocument(filePath, content) {
        const extension = this.getFileExtension(filePath);
        const loaderType = this.supportedExtensions.get(extension);
        if (!loaderType) {
            throw new Error(`Unsupported file type: ${extension}`);
        }
        console.log(`[DocumentLoaderFactory] Loading ${filePath} as ${loaderType}`);
        if (loaderType === "pdf") {
            return await this.loadPDFWithLangChain(filePath, content);
        }
        return await this.loadTextFile(filePath, content);
    }

    /**
     * Parses a PDF by writing `content` to a temporary file and handing that
     * file to LangChain's `PDFLoader`. The temporary file is removed afterwards
     * whether or not parsing succeeds.
     *
     * @param {string} filePath - Original path, used for metadata and logging.
     * @param {*} content - PDF content to write to the temporary file.
     * @returns {Promise<object>} `{ id, content, metadata, source }`.
     */
    async loadPDFWithLangChain(filePath, content) {
        // PDFLoader is given a path on disk, so the content is materialised first.
        const tempFilePath = await this.createTempFile(filePath, content);
        try {
            const pdfLoader = new PDFLoader(tempFilePath, {
                splitPages: false, // Chunking is handled by the caller's pipeline
                parsedItemSeparator: "\n\n",
            });
            const langchainDocs = await pdfLoader.load();
            const document = this.convertToOurFormat(langchainDocs, filePath, content);
            console.log(`[DocumentLoaderFactory] Successfully loaded PDF ${filePath}, content length: ${document.content.length}`);
            return document;
        }
        finally {
            await this.cleanupTempFile(tempFilePath);
        }
    }

    /**
     * Decodes `content` as UTF-8 text and wraps it in a document record.
     *
     * @param {string} filePath - Original path, used for title, type and source.
     * @param {*} content - Value whose `toString("utf-8")` yields the text and
     *   whose `length` is recorded as `fileSize`.
     * @returns {Promise<object>} `{ id, content, metadata, source }`.
     * @throws {Error} "Failed to load text file: ..." for any failure, including
     *   whitespace-only content; the underlying error is attached as `cause`.
     */
    async loadTextFile(filePath, content) {
        try {
            const textContent = content.toString("utf-8");
            if (!textContent.trim()) {
                throw new Error("Text file is empty");
            }
            const metadata = {
                title: this.extractTitleFromPath(filePath),
                author: undefined,
                createdAt: new Date(),
                updatedAt: new Date(),
                fileType: this.getFileExtension(filePath),
                fileSize: content.length,
                language: this.detectLanguage(textContent),
                description: this.extractDescription(textContent),
                source: filePath,
            };
            console.log(`[DocumentLoaderFactory] Successfully loaded text file ${filePath}, content length: ${textContent.length}`);
            return {
                id: this.generateDocumentId(filePath),
                content: textContent,
                metadata,
                source: filePath,
            };
        }
        catch (error) {
            // Message is unchanged for callers that match on it; the original
            // error (and its stack) stays reachable through `cause`.
            const wrapped = new Error(`Failed to load text file: ${error}`);
            wrapped.cause = error;
            throw wrapped;
        }
    }

    /**
     * Merges the documents returned by the LangChain loader into one record.
     *
     * @param {Array<object>} langchainDocs - Loader output; each entry's
     *   `pageContent` (or `content`) is joined with blank lines.
     * @param {string} originalPath - Path of the user's file (not the temp copy).
     * @param {*} content - Original content; its `length` becomes `fileSize`.
     * @returns {object} `{ id, content, metadata, source }`.
     * @throws {Error} If the joined text is empty after trimming.
     */
    convertToOurFormat(langchainDocs, originalPath, content) {
        const combinedContent = langchainDocs
            .map((doc) => doc.pageContent || doc.content)
            .join("\n\n")
            .trim();
        if (!combinedContent) {
            throw new Error("No content extracted from document");
        }
        // Metadata is taken from the first document only.
        const firstDoc = langchainDocs[0];
        const langchainMetadata = (firstDoc && firstDoc.metadata) || {};
        const metadata = {
            // Loader metadata goes first so it only contributes *additional*
            // keys. Spreading it last let loader-supplied keys (for example a
            // `source` — presumably the temp file path, verify against the
            // loader) overwrite the normalised fields computed below.
            ...langchainMetadata,
            title: langchainMetadata.title || this.extractTitleFromPath(originalPath),
            author: langchainMetadata.author,
            createdAt: langchainMetadata.createdAt
                ? new Date(langchainMetadata.createdAt)
                : new Date(),
            updatedAt: langchainMetadata.modifiedAt
                ? new Date(langchainMetadata.modifiedAt)
                : new Date(),
            fileType: this.getFileExtension(originalPath),
            fileSize: content.length,
            language: this.detectLanguage(combinedContent),
            description: this.extractDescription(combinedContent),
            source: originalPath,
        };
        return {
            id: this.generateDocumentId(originalPath),
            content: combinedContent,
            metadata,
            source: originalPath,
        };
    }

    /**
     * Writes `content` to a file named after `originalPath` inside a freshly
     * created, uniquely named scratch directory under the OS temp dir.
     *
     * A unique directory (rather than a timestamped file name) keeps concurrent
     * loads of equally named files from overwriting or deleting each other's
     * temporary copy.
     *
     * @param {string} originalPath - Only its base name is used.
     * @param {*} content - Data passed to `fs.writeFile`.
     * @returns {Promise<string>} Path of the temporary file.
     */
    async createTempFile(originalPath, content) {
        const scratchDir = await fs.mkdtemp(path.join(os.tmpdir(), TEMP_DIR_PREFIX));
        const tempFilePath = path.join(scratchDir, path.basename(originalPath));
        try {
            await fs.writeFile(tempFilePath, content);
        }
        catch (error) {
            // Do not leave an empty scratch directory behind on a failed write.
            await this.cleanupTempFile(tempFilePath);
            throw error;
        }
        return tempFilePath;
    }

    /**
     * Best-effort removal of a temporary file. Failures are logged, not thrown.
     *
     * When the file lives in a scratch directory made by `createTempFile`, the
     * whole directory is removed; any other path is simply unlinked.
     *
     * @param {string} tempFilePath - Path previously returned by `createTempFile`.
     * @returns {Promise<void>}
     */
    async cleanupTempFile(tempFilePath) {
        try {
            const parentDir = path.dirname(tempFilePath);
            // Only ever delete a directory that is recognisably one of ours:
            // directly under the OS temp dir and carrying our prefix.
            const isScratchDir = path.basename(parentDir).startsWith(TEMP_DIR_PREFIX) &&
                path.resolve(path.dirname(parentDir)) === path.resolve(os.tmpdir());
            if (isScratchDir) {
                await fs.rm(parentDir, { recursive: true, force: true });
            }
            else {
                await fs.unlink(tempFilePath);
            }
        }
        catch (error) {
            console.warn(`[DocumentLoaderFactory] Failed to cleanup temp file ${tempFilePath}:`, error);
        }
    }

    /**
     * @param {string} filePath
     * @returns {string} Lower-cased extension without the leading dot
     *   (empty string when there is none).
     */
    getFileExtension(filePath) {
        return path.extname(filePath).toLowerCase().slice(1);
    }

    /**
     * @param {string} filePath
     * @returns {string} Base name of the file with its final extension removed.
     */
    extractTitleFromPath(filePath) {
        // `path.parse` strips only the trailing extension. A plain string
        // replace removed the first occurrence of the extension text, which
        // mangled names such as "x.pdf_notes.pdf".
        return path.parse(filePath).name;
    }

    /**
     * Builds an identifier from the sanitised base name and the current time.
     *
     * @param {string} filePath
     * @returns {string} `doc_<name with non-alphanumerics as "_">_<epoch ms>`.
     */
    generateDocumentId(filePath) {
        const fileName = path.basename(filePath);
        const timestamp = Date.now();
        return `doc_${fileName.replace(/[^a-zA-Z0-9]/g, "_")}_${timestamp}`;
    }

    /**
     * Coarse language guess.
     *
     * @param {string} text
     * @returns {string} "ko" when Hangul syllables make up more than 10% of
     *   `text.length`, otherwise "en".
     */
    detectLanguage(text) {
        const koreanMatches = text.match(/[가-힣]/g);
        if (koreanMatches && koreanMatches.length > text.length * 0.1) {
            return "ko";
        }
        return "en"; // Default to English
    }

    /**
     * Uses the first paragraph (blank-line separated, more than 20 characters
     * after trimming) as a description when it is longer than 50 characters,
     * truncated to 200 characters with a trailing "...".
     *
     * @param {string} text
     * @returns {string} The description, or "Document content" as a fallback.
     */
    extractDescription(text) {
        const paragraphs = text.split("\n\n").filter((p) => p.trim().length > 20);
        const firstParagraph = paragraphs.length > 0 ? paragraphs[0].trim() : undefined;
        if (firstParagraph && firstParagraph.length > 50) {
            const suffix = firstParagraph.length > 200 ? "..." : "";
            return firstParagraph.substring(0, 200) + suffix;
        }
        return "Document content";
    }

    /**
     * @returns {string[]} The registered extensions (without dots).
     */
    getSupportedExtensions() {
        return Array.from(this.supportedExtensions.keys());
    }

    /**
     * @param {string} filePath
     * @returns {boolean} Whether the file's extension has a registered loader.
     */
    isSupported(filePath) {
        const extension = this.getFileExtension(filePath);
        return this.supportedExtensions.has(extension);
    }
}

export { DocumentLoaderFactory };
//# sourceMappingURL=index.js.map