UNPKG

@restnfeel/agentc-starter-kit

Version:

한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템

227 lines (191 loc) 6.48 kB
import { Document } from "../types";
import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
import * as fs from "fs/promises";
import * as path from "path";
import * as os from "os";

/**
 * Converts raw file bytes into the project's `Document` shape.
 *
 * The loader is chosen from the file extension: PDFs are parsed through
 * LangChain's `PDFLoader`, while `txt`, `text` and `md` files are decoded as
 * UTF-8 text.
 */
export class DocumentLoaderFactory {
  /** Prefix of the private temp directories created for PDF parsing. */
  private static readonly tempDirPrefix = "rag_temp_";

  /** Lower-case file extension (without the dot) -> loader kind. */
  private readonly supportedExtensions = new Map<string, string>([
    ["pdf", "pdf"],
    ["txt", "text"],
    ["text", "text"],
    ["md", "text"],
  ]);

  /**
   * Loads a document from an in-memory buffer.
   *
   * @param filePath - Original path or file name; used to pick the loader and
   *   to derive the title, id and `source` of the result.
   * @param content - Raw bytes of the file.
   * @returns The parsed document with extracted text and metadata.
   * @throws Error if the extension is not supported, or if the selected loader
   *   fails or extracts no content.
   */
  async loadDocument(filePath: string, content: Buffer): Promise<Document> {
    const extension = this.getFileExtension(filePath);
    const loaderType = this.supportedExtensions.get(extension);

    if (!loaderType) {
      throw new Error(`Unsupported file type: ${extension}`);
    }

    console.log(`[DocumentLoaderFactory] Loading ${filePath} as ${loaderType}`);

    if (loaderType === "pdf") {
      return await this.loadPDFWithLangChain(filePath, content);
    }
    return await this.loadTextFile(filePath, content);
  }

  /**
   * Parses a PDF buffer with LangChain's `PDFLoader`.
   *
   * `PDFLoader` is constructed from a file path here, so the buffer is first
   * written to a temporary file that is removed again whether parsing
   * succeeds or fails.
   */
  private async loadPDFWithLangChain(
    filePath: string,
    content: Buffer
  ): Promise<Document> {
    const tempFilePath = await this.createTempFile(filePath, content);

    try {
      const pdfLoader = new PDFLoader(tempFilePath, {
        splitPages: false, // We'll handle chunking ourselves
        parsedItemSeparator: "\n\n",
      });

      const langchainDocs = await pdfLoader.load();

      // Convert LangChain documents to our Document format
      const document = this.convertToOurFormat(
        langchainDocs,
        filePath,
        content
      );

      console.log(
        `[DocumentLoaderFactory] Successfully loaded PDF ${filePath}, content length: ${document.content.length}`
      );

      return document;
    } finally {
      await this.cleanupTempFile(tempFilePath);
    }
  }

  /**
   * Decodes a text-like file as UTF-8 and wraps it in a `Document`.
   *
   * @throws Error prefixed with "Failed to load text file:" if the decoded
   *   text is empty or whitespace-only, or if any step fails.
   */
  private async loadTextFile(
    filePath: string,
    content: Buffer
  ): Promise<Document> {
    try {
      const textContent = content.toString("utf-8");

      if (!textContent.trim()) {
        throw new Error("Text file is empty");
      }

      const metadata = {
        title: this.extractTitleFromPath(filePath),
        author: undefined,
        createdAt: new Date(),
        updatedAt: new Date(),
        fileType: this.getFileExtension(filePath),
        fileSize: content.length,
        language: this.detectLanguage(textContent),
        description: this.extractDescription(textContent),
        source: filePath,
      };

      console.log(
        `[DocumentLoaderFactory] Successfully loaded text file ${filePath}, content length: ${textContent.length}`
      );

      return {
        id: this.generateDocumentId(filePath),
        content: textContent,
        metadata,
        source: filePath,
      };
    } catch (error) {
      throw new Error(`Failed to load text file: ${error}`);
    }
  }

  /**
   * Merges the documents returned by LangChain into a single `Document`.
   *
   * @param langchainDocs - Loader output; each entry is read for
   *   `pageContent` (falling back to `content`) and `metadata`.
   * @param originalPath - Path supplied by the caller (not the temp file).
   * @param content - Original bytes, used only for `fileSize`.
   * @throws Error if no text could be extracted.
   */
  private convertToOurFormat(
    // Loader output is only probed structurally; the shape is not declared in this file.
    langchainDocs: any[],
    originalPath: string,
    content: Buffer
  ): Document {
    const combinedContent = langchainDocs
      .map((doc) => doc.pageContent || doc.content)
      .join("\n\n")
      .trim();

    if (!combinedContent) {
      throw new Error("No content extracted from document");
    }

    // Metadata is taken from the first document only.
    const langchainMetadata = langchainDocs[0]?.metadata || {};

    const metadata = {
      // Spread FIRST so loader metadata only contributes additional keys and
      // can never clobber the fields computed below. The loader's own `source`
      // presumably points at the temp file, which is deleted by the time the
      // caller sees the document.
      ...langchainMetadata,
      title:
        langchainMetadata.title || this.extractTitleFromPath(originalPath),
      author: langchainMetadata.author,
      createdAt: this.toDateOrNow(langchainMetadata.createdAt),
      updatedAt: this.toDateOrNow(langchainMetadata.modifiedAt),
      fileType: this.getFileExtension(originalPath),
      fileSize: content.length,
      language: this.detectLanguage(combinedContent),
      description: this.extractDescription(combinedContent),
      source: originalPath,
    };

    return {
      id: this.generateDocumentId(originalPath),
      content: combinedContent,
      metadata,
      source: originalPath,
    };
  }

  /**
   * Parses a date-like metadata value, returning the current time when the
   * value is missing, falsy, of an unexpected type, or not a valid date.
   */
  private toDateOrNow(value: unknown): Date {
    if (
      value &&
      (value instanceof Date ||
        typeof value === "string" ||
        typeof value === "number")
    ) {
      const parsed = new Date(value);
      if (!Number.isNaN(parsed.getTime())) {
        return parsed;
      }
    }
    return new Date();
  }

  /**
   * Writes `content` into a file inside a freshly created private temp
   * directory and returns the file's path.
   *
   * `mkdtemp` yields a unique directory name, so concurrent loads of files
   * with the same name cannot overwrite each other and the path is not
   * predictable from the outside.
   */
  private async createTempFile(
    originalPath: string,
    content: Buffer
  ): Promise<string> {
    const tempDir = await fs.mkdtemp(
      path.join(os.tmpdir(), DocumentLoaderFactory.tempDirPrefix)
    );
    const tempFilePath = path.join(tempDir, path.basename(originalPath));

    try {
      await fs.writeFile(tempFilePath, content);
    } catch (error) {
      // Don't leave the empty directory behind when the write fails.
      await this.cleanupTempFile(tempFilePath);
      throw error;
    }

    return tempFilePath;
  }

  /**
   * Best-effort removal of a temp file created by `createTempFile`.
   * Failures are logged and never thrown, so cleanup cannot mask the result
   * (or the error) of the actual load.
   */
  private async cleanupTempFile(tempFilePath: string): Promise<void> {
    try {
      const tempDir = path.dirname(tempFilePath);
      if (
        path.basename(tempDir).startsWith(DocumentLoaderFactory.tempDirPrefix)
      ) {
        // Remove the whole private directory together with the file.
        await fs.rm(tempDir, { recursive: true, force: true });
      } else {
        // Not one of our directories: only ever delete the file itself.
        await fs.unlink(tempFilePath);
      }
    } catch (error) {
      console.warn(
        `[DocumentLoaderFactory] Failed to cleanup temp file ${tempFilePath}:`,
        error
      );
    }
  }

  /** Returns the lower-cased extension without the leading dot ("" if none). */
  private getFileExtension(filePath: string): string {
    return path.extname(filePath).toLowerCase().slice(1);
  }

  /**
   * Returns the file name without directory and without its final extension.
   *
   * Only the trailing extension is stripped, so "a.pdf.notes.pdf" yields
   * "a.pdf.notes".
   */
  private extractTitleFromPath(filePath: string): string {
    return path.parse(filePath).name;
  }

  /**
   * Builds an id of the form `doc_<sanitised file name>_<ms timestamp>`.
   * Every character outside `[a-zA-Z0-9]` in the file name becomes `_`.
   */
  private generateDocumentId(filePath: string): string {
    const fileName = path.basename(filePath);
    const timestamp = Date.now();
    return `doc_${fileName.replace(/[^a-zA-Z0-9]/g, "_")}_${timestamp}`;
  }

  /**
   * Very rough language guess: "ko" when Hangul syllables make up more than
   * 10% of `text.length`, otherwise "en".
   */
  private detectLanguage(text: string): string {
    const koreanCount = text.match(/[가-힣]/g)?.length ?? 0;
    if (koreanCount > 0 && koreanCount > text.length * 0.1) {
      return "ko";
    }
    return "en"; // Default to English
  }

  /**
   * Uses the first paragraph with more than 20 non-blank characters as the
   * description. It is returned (cut to 200 characters plus "...") only when
   * longer than 50 characters; otherwise a generic placeholder is returned.
   */
  private extractDescription(text: string): string {
    // Paragraphs are separated by blank lines; accept both LF and CRLF.
    const paragraphs = text
      .split(/(?:\r?\n){2,}/)
      .filter((p) => p.trim().length > 20);
    const firstParagraph = paragraphs[0]?.trim();

    if (firstParagraph && firstParagraph.length > 50) {
      return (
        firstParagraph.substring(0, 200) +
        (firstParagraph.length > 200 ? "..." : "")
      );
    }

    return "Document content";
  }

  /** Lists every file extension (without dot) this factory can load. */
  getSupportedExtensions(): string[] {
    return Array.from(this.supportedExtensions.keys());
  }

  /** Returns true when the extension of `filePath` has a registered loader. */
  isSupported(filePath: string): boolean {
    const extension = this.getFileExtension(filePath);
    return this.supportedExtensions.has(extension);
  }
}