@restnfeel/agentc-starter-kit
Version:
한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템
244 lines (206 loc) • 8.07 kB
text/typescript
import { Document } from "../types";
import { BaseDocumentLoader } from "./base";
/**
 * Matches the PDF date-string layout `D:YYYYMMDDHHmmSSOHH'mm'`.
 *
 * Capture groups: 1 = `D:` prefix, 2 = year, 3 = month, 4 = day, 5 = hour,
 * 6 = minute, 7 = second, 8 = time-zone marker (`Z`, `+` or `-`),
 * 9 = offset hours, 10 = offset minutes. Everything after the year is optional.
 */
const PDF_DATE_PATTERN =
  /^\s*(D:)?(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?\s*(?:([Zz+-])(\d{2})?'?(\d{2})?'?)?\s*$/;

/**
 * Returns a printable message for a value caught in a `catch` clause.
 *
 * Catch variables are `unknown` under strict settings and anything can be
 * thrown, so `.message` is only read after confirming the value is an `Error`.
 */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Converts a PDF `CreationDate`-style value into a valid `Date`.
 *
 * PDF date strings (`D:20230115093000+09'00'`) are not understood by the
 * `Date` constructor, so they are parsed by hand. Values that do not follow
 * the PDF layout are handed to `new Date(...)` as a fallback.
 *
 * @param raw - The raw metadata value (string, number, `Date`, or anything else).
 * @returns A valid `Date`, or `undefined` when no valid date can be derived.
 */
function parsePdfDate(raw: unknown): Date | undefined {
  if (raw instanceof Date) {
    return Number.isNaN(raw.getTime()) ? undefined : raw;
  }
  if (typeof raw !== "string" && typeof raw !== "number") {
    return undefined;
  }

  if (typeof raw === "string") {
    const match = PDF_DATE_PATTERN.exec(raw);
    // Without the `D:` prefix, insist on a full YYYYMMDD so that ISO-like
    // strings such as "2023-01" are not misread as "year 2023, offset -01".
    if (match && (match[1] !== undefined || match[4] !== undefined)) {
      const part = (index: number, fallback: number): number => {
        const value = match[index];
        return value === undefined ? fallback : Number(value);
      };

      const year = part(2, 0);
      const month = part(3, 1);
      const day = part(4, 1);
      const hour = part(5, 0);
      const minute = part(6, 0);
      const second = part(7, 0);

      const fieldsInRange =
        month >= 1 &&
        month <= 12 &&
        day >= 1 &&
        day <= 31 &&
        hour <= 23 &&
        minute <= 59 &&
        second <= 59;

      if (fieldsInRange) {
        const marker = match[8];
        // A missing marker or `Z` is treated as UTC.
        const direction = marker === "+" ? 1 : marker === "-" ? -1 : 0;
        const offsetMinutes = direction * (part(9, 0) * 60 + part(10, 0));
        const utcMillis =
          Date.UTC(year, month - 1, day, hour, minute, second) -
          offsetMinutes * 60_000;
        const parsed = new Date(utcMillis);
        if (!Number.isNaN(parsed.getTime())) {
          return parsed;
        }
      }
    }
  }

  const fallback = new Date(raw);
  return Number.isNaN(fallback.getTime()) ? undefined : fallback;
}

/**
 * Document loader for `.pdf` files.
 *
 * Text extraction is attempted with three strategies in order: `pdfjs-dist`,
 * then `pdf-parse`, then a regex scan over the raw bytes. The first strategy
 * that does not throw supplies the text (and, for the two libraries, the
 * metadata).
 */
export class PDFDocumentLoader extends BaseDocumentLoader {
  supportedExtensions = ["pdf"];

  /**
   * Extracts text and metadata from a PDF and wraps them in a `Document`.
   *
   * @param filePath - Path of the PDF; used for the id, source and fallback title.
   * @param content - Raw bytes of the PDF file.
   * @returns The document with cleaned text and merged metadata.
   * @throws Error prefixed with "Failed to load PDF document:" when fewer than
   *   10 non-whitespace-trimmed characters were extracted or any step throws.
   */
  async load(filePath: string, content: Buffer): Promise<Document> {
    try {
      console.log(
        `[PDF Loader] Processing: ${filePath}, size: ${content.length} bytes`
      );

      let extractedText = "";
      // Metadata fields come straight from the PDF info dictionary and are not
      // validated here, hence the loose type.
      let metadata: any = {};

      // Method 1: pdfjs-dist.
      try {
        const result = await this.extractWithPDFJS(content);
        extractedText = result.text;
        metadata = result.metadata;
        console.log(
          `[PDF Loader] Successfully extracted ${extractedText.length} characters with PDF.js`
        );
      } catch (pdfjsError) {
        console.warn(`[PDF Loader] PDF.js failed:`, pdfjsError);

        // Method 2: pdf-parse as fallback.
        try {
          const result = await this.extractWithPDFParse(content);
          extractedText = result.text;
          metadata = result.metadata;
          console.log(
            `[PDF Loader] Successfully extracted ${extractedText.length} characters with pdf-parse`
          );
        } catch (parseError) {
          console.warn(`[PDF Loader] pdf-parse failed:`, parseError);

          // Method 3: regex scan over the raw bytes as a last resort.
          // NOTE(review): extractBasicText returns a placeholder sentence when
          // it finds nothing, so the length check below does not reject that
          // case and the placeholder becomes the document content — confirm
          // this is intended.
          extractedText = this.extractBasicText(content);
          console.log(
            `[PDF Loader] Using basic text extraction, got ${extractedText.length} characters`
          );
        }
      }

      if (!extractedText || extractedText.trim().length < 10) {
        throw new Error("Unable to extract meaningful text from PDF");
      }

      const cleanedText = this.cleanText(extractedText);
      const docMetadata = this.createBaseMetadata(filePath, content.length);

      // Merge extracted metadata over the base metadata.
      docMetadata.title =
        metadata.title || this.extractTitleFromFilename(filePath);
      docMetadata.author = metadata.author;
      docMetadata.description = this.extractDescription(cleanedText);

      // Only override createdAt when the PDF value yields a valid date; PDF
      // date strings ("D:2023...") would otherwise produce an Invalid Date.
      const createdAt = parsePdfDate(metadata.createdAt);
      if (createdAt) {
        docMetadata.createdAt = createdAt;
      }

      return {
        id: this.generateDocumentId(filePath),
        content: cleanedText,
        metadata: docMetadata,
        source: filePath,
      };
    } catch (error) {
      throw new Error(`Failed to load PDF document: ${error}`);
    }
  }

  /**
   * Extracts text and info-dictionary metadata using `pdfjs-dist`.
   *
   * Pages that fail to extract are logged and skipped. The loading task is
   * destroyed once extraction finishes or fails so PDF.js resources are released.
   *
   * @throws Error prefixed with "PDF.js extraction failed:" on any failure
   *   other than a per-page or metadata failure.
   */
  private async extractWithPDFJS(
    content: Buffer
  ): Promise<{ text: string; metadata: any }> {
    try {
      // Imported dynamically so the dependency is only loaded when needed.
      const pdfjsLib = await import("pdfjs-dist");

      const pdfData = new Uint8Array(content);
      const loadingTask = pdfjsLib.getDocument({
        data: pdfData,
        verbosity: 0, // Suppress console output
      });

      try {
        const pdfDocument = await loadingTask.promise;
        const metadata: any = {};

        // Metadata is best-effort: a failure here must not abort text extraction.
        try {
          const pdfMetadata = await pdfDocument.getMetadata();
          const info = (pdfMetadata as any).info;
          metadata.title = info?.Title;
          metadata.author = info?.Author;
          metadata.subject = info?.Subject;
          metadata.creator = info?.Creator;
          metadata.producer = info?.Producer;
          metadata.createdAt = info?.CreationDate;
          metadata.modifiedAt = info?.ModDate;
        } catch (metaError) {
          console.warn(
            "[PDF Loader] Failed to extract PDF metadata:",
            metaError
          );
        }

        const numPages = pdfDocument.numPages;
        console.log(`[PDF Loader] PDF has ${numPages} pages`);

        const pageTexts: string[] = [];
        for (let pageNumber = 1; pageNumber <= numPages; pageNumber++) {
          try {
            const page = await pdfDocument.getPage(pageNumber);
            const textContent = await page.getTextContent();

            const pageText = textContent.items
              .map((item: any) => {
                // Text items may be plain strings or objects carrying the
                // text under `str` or `text`.
                if (typeof item === "string") return item;
                if (item.str) return item.str;
                if (item.text) return item.text;
                return "";
              })
              .join(" ");

            pageTexts.push(pageText);
          } catch (pageError) {
            console.warn(
              `[PDF Loader] Failed to extract page ${pageNumber}:`,
              pageError
            );
          }
        }

        // Pages are separated by a blank line.
        return { text: pageTexts.join("\n\n").trim(), metadata };
      } finally {
        // Release the document/worker resources; a cleanup failure must not
        // mask the extraction result or the original error.
        try {
          await loadingTask.destroy();
        } catch (destroyError) {
          console.warn(
            "[PDF Loader] Failed to release PDF.js resources:",
            destroyError
          );
        }
      }
    } catch (error) {
      throw new Error(`PDF.js extraction failed: ${describeError(error)}`);
    }
  }

  /**
   * Extracts text and info-dictionary metadata using `pdf-parse`.
   *
   * @throws Error prefixed with "pdf-parse extraction failed:" on any failure.
   */
  private async extractWithPDFParse(
    content: Buffer
  ): Promise<{ text: string; metadata: any }> {
    try {
      const pdfParse = await import("pdf-parse");
      // Support both ESM-default and CommonJS-style module shapes.
      const parseFn = (pdfParse.default || pdfParse) as any;

      // NOTE(review): confirm which of these keys pdf-parse actually honors
      // when passed at the top level of the options object.
      const options = {
        max: 0, // No page limit
        normalizeWhitespace: false,
        disableCombineTextItems: false,
        version: "v1.10.100",
      };

      const result = await parseFn(content, options);

      const metadata: any = {};
      if (result.info) {
        metadata.title = result.info.Title;
        metadata.author = result.info.Author;
        metadata.subject = result.info.Subject;
        metadata.creator = result.info.Creator;
        metadata.producer = result.info.Producer;
        metadata.createdAt = result.info.CreationDate;
        metadata.modifiedAt = result.info.ModDate;
      }

      return { text: result.text || "", metadata };
    } catch (error) {
      throw new Error(`pdf-parse extraction failed: ${describeError(error)}`);
    }
  }

  /**
   * Last-resort extraction: scans the raw bytes with regular expressions.
   *
   * The patterns are tried in order and the first one that matches supplies
   * the text (capped at 10000 characters). When a pattern has a capture
   * group, only the captured content is kept, so `BT`/`ET` markers and the
   * enclosing parentheses are not included in the output.
   *
   * @returns The extracted text, or a fixed Korean placeholder sentence when
   *   fewer than 10 characters were found or an error occurred.
   */
  private extractBasicText(content: Buffer): string {
    try {
      // latin1 maps each byte to one character, so the decoded string only
      // contains code points up to U+00FF.
      const text = content.toString("latin1");

      const textPatterns = [
        // Content between BT and ET on a single line
        /BT\s+(.+?)\s+ET/g,
        // Text between parentheses
        /\(([^)]+)\)/g,
        // Runs of at least 10 printable characters
        /[\x20-\x7E\u00A0-\u00FF\u0100-\u017F\u0180-\u024F\uAC00-\uD7AF\u3131-\u318E\u1100-\u11FF]{10,}/g,
      ];

      let extractedText = "";
      for (const pattern of textPatterns) {
        // `match` with a global regex drops capture groups; `matchAll` keeps
        // them so the captured text can be preferred over the whole match.
        const pieces = Array.from(
          text.matchAll(pattern),
          (found) => found[1] ?? found[0]
        );
        if (pieces.length > 0) {
          extractedText = pieces.join(" ").substring(0, 10000);
          break; // Use first successful pattern
        }
      }

      // Collapse whitespace and blank out anything outside printable ASCII
      // and the listed Hangul ranges.
      extractedText = extractedText
        .replace(/\s+/g, " ")
        .replace(/[^\x20-\x7E\uAC00-\uD7AF\u3131-\u318E\u1100-\u11FF]/g, " ")
        .trim();

      if (extractedText.length < 10) {
        return "PDF 문서를 처리했지만 텍스트를 추출할 수 없습니다. 이미지 기반 PDF이거나 보호된 문서일 수 있습니다.";
      }

      return extractedText;
    } catch {
      return "PDF 문서 처리 중 오류가 발생했습니다.";
    }
  }

  /**
   * Derives a title from the last path segment with its extension removed.
   *
   * Both `/` and `\` are treated as path separators. If stripping the
   * extension would leave nothing (e.g. `.pdf`), the segment is returned as is.
   */
  private extractTitleFromFilename(filePath: string): string {
    const filename = filePath.split(/[\\/]/).pop() || filePath;
    const stem = filename.replace(/\.[^/.]+$/, ""); // Remove extension
    return stem || filename;
  }

  /**
   * Builds a short description from the first paragraph of the text.
   *
   * Paragraphs are split on blank lines; the first one longer than 20
   * characters is used if it also exceeds 50 characters, truncated to 200
   * characters with a trailing "..." when cut. Otherwise "PDF 문서" is returned.
   */
  private extractDescription(text: string): string {
    const paragraphs = text.split("\n\n").filter((p) => p.trim().length > 20);
    const firstParagraph = paragraphs[0]?.trim();

    if (firstParagraph && firstParagraph.length > 50) {
      return (
        firstParagraph.substring(0, 200) +
        (firstParagraph.length > 200 ? "..." : "")
      );
    }

    return "PDF 문서";
  }
}