@restnfeel/agentc-starter-kit
Version:
한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템
227 lines (191 loc) • 6.48 kB
text/typescript
import { Document } from "../types";
import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
import * as fs from "fs/promises";
import * as path from "path";
import * as os from "os";
/**
 * Turns raw file buffers into the project's `Document` shape.
 *
 * The loader is picked from the file extension: PDFs are parsed through
 * LangChain's `PDFLoader` (which needs a file on disk, so the buffer is
 * written to a private temporary directory first), everything else in the
 * supported list is decoded as UTF-8 text.
 */
export class DocumentLoaderFactory {
  /** Lower-cased extension (without the dot) -> loader kind ("pdf" | "text"). */
  private readonly supportedExtensions = new Map<string, string>([
    ["pdf", "pdf"],
    ["txt", "text"],
    ["text", "text"],
    ["md", "text"],
  ]);

  /**
   * Loads a single document from an in-memory buffer.
   *
   * @param filePath Original path or name of the file; used for extension
   *   detection, title, id and `source`. It is not read from disk.
   * @param content Raw bytes of the file.
   * @returns The parsed document.
   * @throws Error if the extension is not supported, or if the selected
   *   loader fails or extracts no content.
   */
  async loadDocument(filePath: string, content: Buffer): Promise<Document> {
    const extension = this.getFileExtension(filePath);
    const loaderType = this.supportedExtensions.get(extension);

    if (!loaderType) {
      throw new Error(`Unsupported file type: ${extension}`);
    }

    console.log(`[DocumentLoaderFactory] Loading ${filePath} as ${loaderType}`);

    if (loaderType === "pdf") {
      return await this.loadPDFWithLangChain(filePath, content);
    }
    return await this.loadTextFile(filePath, content);
  }

  /**
   * Parses a PDF buffer with LangChain's `PDFLoader`.
   *
   * The buffer is written to a temporary file that is always removed again,
   * whether parsing succeeds or throws.
   */
  private async loadPDFWithLangChain(
    filePath: string,
    content: Buffer
  ): Promise<Document> {
    const tempFilePath = await this.createTempFile(filePath, content);

    try {
      const pdfLoader = new PDFLoader(tempFilePath, {
        splitPages: false, // chunking is handled by the caller, not here
        parsedItemSeparator: "\n\n",
      });
      const langchainDocs = await pdfLoader.load();

      const document = this.convertToOurFormat(langchainDocs, filePath, content);

      console.log(
        `[DocumentLoaderFactory] Successfully loaded PDF ${filePath}, content length: ${document.content.length}`
      );
      return document;
    } finally {
      await this.cleanupTempFile(tempFilePath);
    }
  }

  /**
   * Decodes a buffer as UTF-8 text and wraps it in a `Document`.
   *
   * @throws Error ("Failed to load text file: ...") when the decoded text is
   *   empty or whitespace-only, or when any step of the conversion throws.
   */
  private async loadTextFile(
    filePath: string,
    content: Buffer
  ): Promise<Document> {
    try {
      const decoded = content.toString("utf-8");
      // Buffer#toString keeps a leading UTF-8 byte-order mark; drop it so it
      // does not end up in the content, description or language statistics.
      const textContent =
        decoded.charCodeAt(0) === 0xfeff ? decoded.slice(1) : decoded;

      if (!textContent.trim()) {
        throw new Error("Text file is empty");
      }

      const metadata = {
        title: this.extractTitleFromPath(filePath),
        author: undefined,
        createdAt: new Date(),
        updatedAt: new Date(),
        fileType: this.getFileExtension(filePath),
        fileSize: content.length, // size in bytes of the original buffer
        language: this.detectLanguage(textContent),
        description: this.extractDescription(textContent),
        source: filePath,
      };

      console.log(
        `[DocumentLoaderFactory] Successfully loaded text file ${filePath}, content length: ${textContent.length}`
      );

      return {
        id: this.generateDocumentId(filePath),
        content: textContent,
        metadata,
        source: filePath,
      };
    } catch (error) {
      throw new Error(`Failed to load text file: ${error}`);
    }
  }

  /**
   * Merges the documents returned by a LangChain loader into one `Document`.
   *
   * Page contents are joined with blank lines. Metadata of the first
   * LangChain document is carried over, but the fields computed here always
   * win over loader-supplied keys of the same name.
   *
   * @param langchainDocs Loader output; each entry is expected to expose
   *   `pageContent` (or `content`) and optionally `metadata`. Typed loosely
   *   because the loader's document type is not imported in this file.
   * @param originalPath Path/name the caller supplied for the document.
   * @param content Original raw bytes (used for `fileSize` only).
   * @throws Error if no text could be extracted.
   */
  private convertToOurFormat(
    langchainDocs: any[],
    originalPath: string,
    content: Buffer
  ): Document {
    const combinedContent = langchainDocs
      .map((doc) => doc?.pageContent || doc?.content || "")
      .join("\n\n")
      .trim();

    if (!combinedContent) {
      throw new Error("No content extracted from document");
    }

    const langchainMetadata = langchainDocs[0]?.metadata || {};

    const metadata = {
      // Loader metadata goes first so that it can add extra keys but cannot
      // clobber the fields below. In particular a loader-provided `source`
      // would presumably be the temporary file path, which no longer exists
      // once loading has finished.
      ...langchainMetadata,
      title: langchainMetadata.title || this.extractTitleFromPath(originalPath),
      author: langchainMetadata.author,
      createdAt: this.toDateOrNow(langchainMetadata.createdAt),
      updatedAt: this.toDateOrNow(langchainMetadata.modifiedAt),
      fileType: this.getFileExtension(originalPath),
      fileSize: content.length,
      language: this.detectLanguage(combinedContent),
      description: this.extractDescription(combinedContent),
      source: originalPath,
    };

    return {
      id: this.generateDocumentId(originalPath),
      content: combinedContent,
      metadata,
      source: originalPath,
    };
  }

  /**
   * Converts a loosely typed metadata value into a valid `Date`.
   * Falls back to the current time when the value is missing or unparseable.
   */
  private toDateOrNow(value: unknown): Date {
    if (value) {
      const parsed = new Date(value as string | number | Date);
      if (!Number.isNaN(parsed.getTime())) {
        return parsed;
      }
    }
    return new Date();
  }

  /**
   * Writes the buffer to a file inside a freshly created temporary directory.
   *
   * `fs.mkdtemp` appends random characters to the prefix, so two loads of
   * files with the same name cannot overwrite each other and the path is not
   * predictable from the clock.
   *
   * @returns Absolute path of the temporary file.
   */
  private async createTempFile(
    originalPath: string,
    content: Buffer
  ): Promise<string> {
    const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), "rag_temp_"));
    const tempFilePath = path.join(tempDir, path.basename(originalPath));

    try {
      await fs.writeFile(tempFilePath, content);
    } catch (error) {
      // Do not leave the directory behind when the write itself fails.
      await this.cleanupTempFile(tempFilePath);
      throw error;
    }

    return tempFilePath;
  }

  /**
   * Best-effort removal of a temporary file created by `createTempFile`,
   * together with its private directory. Failures are logged, never thrown.
   */
  private async cleanupTempFile(tempFilePath: string): Promise<void> {
    try {
      const parentDir = path.dirname(tempFilePath);
      if (path.basename(parentDir).startsWith("rag_temp_")) {
        // Directory was created by createTempFile; remove it with its content.
        await fs.rm(parentDir, { recursive: true, force: true });
      } else {
        // Unknown location: only touch the file itself.
        await fs.unlink(tempFilePath);
      }
    } catch (error) {
      console.warn(
        `[DocumentLoaderFactory] Failed to cleanup temp file ${tempFilePath}:`,
        error
      );
    }
  }

  /** Returns the lower-cased extension without the leading dot ("" if none). */
  private getFileExtension(filePath: string): string {
    return path.extname(filePath).toLowerCase().slice(1);
  }

  /**
   * Returns the file name without directory and without its final extension.
   * Only the trailing extension is removed ("a.md.notes.md" -> "a.md.notes").
   */
  private extractTitleFromPath(filePath: string): string {
    return path.parse(filePath).name;
  }

  /**
   * Builds an id of the form `doc_<sanitized file name>_<timestamp>_<random>`.
   *
   * The random suffix keeps ids distinct when files with the same (or, after
   * sanitizing, indistinguishable) names are loaded within one millisecond.
   */
  private generateDocumentId(filePath: string): string {
    const safeName = path.basename(filePath).replace(/[^a-zA-Z0-9]/g, "_");
    const randomSuffix = Math.random()
      .toString(36)
      .slice(2, 10)
      .padEnd(8, "0");
    return `doc_${safeName}_${Date.now()}_${randomSuffix}`;
  }

  /**
   * Very small language heuristic: "ko" when more than 10% of the characters
   * are Hangul syllables, otherwise "en".
   */
  private detectLanguage(text: string): string {
    const koreanMatches = text.match(/[가-힣]/g);
    if (koreanMatches && koreanMatches.length > text.length * 0.1) {
      return "ko";
    }
    return "en"; // default
  }

  /**
   * Uses the first paragraph longer than 50 characters (after skipping
   * paragraphs of 20 characters or fewer) as a description, truncated to 200
   * characters plus "...". Returns "Document content" when the first
   * remaining paragraph is too short or none exists.
   */
  private extractDescription(text: string): string {
    // Paragraphs are separated by blank lines; accept LF and CRLF endings.
    const firstParagraph = text
      .split(/(?:\r?\n){2,}/)
      .filter((p) => p.trim().length > 20)[0]
      ?.trim();

    if (firstParagraph && firstParagraph.length > 50) {
      const suffix = firstParagraph.length > 200 ? "..." : "";
      return firstParagraph.substring(0, 200) + suffix;
    }
    return "Document content";
  }

  /** Lists the supported extensions (lower-case, without the dot). */
  getSupportedExtensions(): string[] {
    return Array.from(this.supportedExtensions.keys());
  }

  /** Reports whether the extension of `filePath` has a registered loader. */
  isSupported(filePath: string): boolean {
    return this.supportedExtensions.has(this.getFileExtension(filePath));
  }
}