@restnfeel/agentc-starter-kit
Version:
한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템
208 lines (205 loc) • 9.56 kB
JavaScript
import { BaseDocumentLoader } from './base.js';
/**
 * Document loader that converts a PDF buffer into a plain-text document record.
 *
 * Text extraction is attempted with three strategies, each one used only when
 * the previous one throws: PDF.js, then pdf-parse, then a regex scan over the
 * raw bytes.
 *
 * NOTE(review): `cleanText`, `createBaseMetadata` and `generateDocumentId` are
 * called on `this` but not defined in this class; presumably they are
 * inherited from BaseDocumentLoader.
 */
class PDFDocumentLoader extends BaseDocumentLoader {
    constructor(...args) {
        super(...args);
        this.supportedExtensions = ["pdf"];
    }

    /**
     * Extracts text and metadata from a PDF and assembles the document record.
     *
     * @param {string} filePath - Path of the PDF; used for logging, the document
     *   id, the fallback title and the `source` field.
     * @param {Buffer} content - Raw PDF bytes. The last-resort extractor calls
     *   `content.toString("latin1")`, so a Node Buffer is expected.
     * @returns {Promise<{id: *, content: *, metadata: object, source: string}>}
     *   The assembled document record.
     * @throws {Error} "Failed to load PDF document: ..." when fewer than 10
     *   non-blank characters were extracted or any step throws; the original
     *   error is attached as `cause`.
     */
    async load(filePath, content) {
        try {
            console.log(`[PDF Loader] Processing: ${filePath}, size: ${content.length} bytes`);
            let extractedText = "";
            let metadata = {};
            // Method 1: PDF.js
            try {
                const result = await this.extractWithPDFJS(content);
                extractedText = result.text;
                metadata = result.metadata;
                console.log(`[PDF Loader] Successfully extracted ${extractedText.length} characters with PDF.js`);
            }
            catch (pdfjsError) {
                console.warn(`[PDF Loader] PDF.js failed:`, pdfjsError);
                // Method 2: pdf-parse as fallback
                try {
                    const result = await this.extractWithPDFParse(content);
                    extractedText = result.text;
                    metadata = result.metadata;
                    console.log(`[PDF Loader] Successfully extracted ${extractedText.length} characters with pdf-parse`);
                }
                catch (parseError) {
                    console.warn(`[PDF Loader] pdf-parse failed:`, parseError);
                    // Method 3: regex scan of the raw bytes. This returns a
                    // placeholder sentence instead of throwing, so when both
                    // libraries fail the placeholder becomes the content.
                    extractedText = this.extractBasicText(content);
                    console.log(`[PDF Loader] Using basic text extraction, got ${extractedText.length} characters`);
                }
            }
            if (!extractedText || extractedText.trim().length < 10) {
                throw new Error("Unable to extract meaningful text from PDF");
            }
            const cleanedText = this.cleanText(extractedText);
            const docMetadata = this.createBaseMetadata(filePath, content.length);
            // Merge extracted metadata over the base metadata.
            docMetadata.title =
                metadata.title || this.extractTitleFromFilename(filePath);
            docMetadata.author = metadata.author;
            docMetadata.description = this.extractDescription(cleanedText);
            // PDF dates look like "D:20230115103000+09'00'", which `new Date`
            // cannot parse; only override when a valid date was recovered.
            const createdAt = this.parsePdfDate(metadata.createdAt);
            if (createdAt) {
                docMetadata.createdAt = createdAt;
            }
            return {
                id: this.generateDocumentId(filePath),
                content: cleanedText,
                metadata: docMetadata,
                source: filePath,
            };
        }
        catch (error) {
            throw new Error(`Failed to load PDF document: ${error}`, { cause: error });
        }
    }

    /**
     * Converts a PDF date value into a Date.
     *
     * Accepts the PDF date-string form `D:YYYY[MM[DD[HH[mm[SS]]]]]` followed by
     * an optional `Z` or `+HH'mm'` / `-HH'mm'` offset; omitted fields default
     * to the start of the period and a missing offset is treated as UTC. Any
     * other value is handed to the native Date constructor.
     *
     * @param {*} value - Raw date from the PDF info dictionary.
     * @returns {Date|null} The parsed date, or null when the value is empty or
     *   cannot be parsed.
     */
    parsePdfDate(value) {
        if (!value) {
            return null;
        }
        if (typeof value === "string") {
            const match = /^D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?(?:(Z)|([+-])(\d{2})'?(\d{2})?'?)?/.exec(value.trim());
            if (match) {
                const [
                    ,
                    year,
                    month = "01",
                    day = "01",
                    hour = "00",
                    minute = "00",
                    second = "00",
                    ,
                    sign,
                    offsetHours = "00",
                    offsetMinutes = "00",
                ] = match;
                let timestamp = Date.UTC(
                    Number(year),
                    Number(month) - 1,
                    Number(day),
                    Number(hour),
                    Number(minute),
                    Number(second),
                );
                if (sign) {
                    // The fields are local time at the given offset:
                    // UTC = local - offset.
                    const offsetMs = (Number(offsetHours) * 60 + Number(offsetMinutes)) * 60000;
                    timestamp += sign === "+" ? -offsetMs : offsetMs;
                }
                const fromPdf = new Date(timestamp);
                return Number.isNaN(fromPdf.getTime()) ? null : fromPdf;
            }
        }
        const parsed = new Date(value);
        return Number.isNaN(parsed.getTime()) ? null : parsed;
    }

    /**
     * Extracts text and info-dictionary metadata with PDF.js.
     *
     * Pages that fail to extract are logged and skipped, so the result may be
     * partial. Metadata failures are logged and yield an empty metadata object.
     *
     * @param {Buffer} content - Raw PDF bytes.
     * @returns {Promise<{text: string, metadata: object}>} Page texts joined by
     *   blank lines (trimmed) and the raw metadata fields.
     * @throws {Error} "PDF.js extraction failed: ..." when the library cannot
     *   be imported or the document cannot be opened.
     */
    async extractWithPDFJS(content) {
        let loadingTask;
        try {
            // Import pdfjs-dist dynamically.
            const pdfjsModule = await import('../../node_modules/pdfjs-dist/build/pdf.js');
            // Depending on module interop the API may sit on `default`.
            const pdfjsLib = typeof pdfjsModule.getDocument === "function"
                ? pdfjsModule
                : (pdfjsModule.default ?? pdfjsModule);
            loadingTask = pdfjsLib.getDocument({
                data: new Uint8Array(content),
                verbosity: 0, // Suppress console output
            });
            const pdfDocument = await loadingTask.promise;
            const metadata = {};
            try {
                const { info } = await pdfDocument.getMetadata();
                metadata.title = info?.Title;
                metadata.author = info?.Author;
                metadata.subject = info?.Subject;
                metadata.creator = info?.Creator;
                metadata.producer = info?.Producer;
                metadata.createdAt = info?.CreationDate;
                metadata.modifiedAt = info?.ModDate;
            }
            catch (metaError) {
                console.warn("[PDF Loader] Failed to extract PDF metadata:", metaError);
            }
            const numPages = pdfDocument.numPages;
            console.log(`[PDF Loader] PDF has ${numPages} pages`);
            const pageTexts = [];
            for (let pageNumber = 1; pageNumber <= numPages; pageNumber++) {
                try {
                    const page = await pdfDocument.getPage(pageNumber);
                    const textContent = await page.getTextContent();
                    const pageText = textContent.items
                        .map((item) => {
                            // Text items may be plain strings or objects
                            // carrying `str` / `text`.
                            if (typeof item === "string") {
                                return item;
                            }
                            return item.str || item.text || "";
                        })
                        .join(" ");
                    pageTexts.push(pageText);
                }
                catch (pageError) {
                    console.warn(`[PDF Loader] Failed to extract page ${pageNumber}:`, pageError);
                }
            }
            return { text: pageTexts.join("\n\n").trim(), metadata };
        }
        catch (error) {
            throw new Error(`PDF.js extraction failed: ${error?.message ?? error}`, { cause: error });
        }
        finally {
            // Release the document and worker resources held by PDF.js.
            if (loadingTask) {
                try {
                    await loadingTask.destroy();
                }
                catch (destroyError) {
                    console.warn("[PDF Loader] Failed to release PDF.js resources:", destroyError);
                }
            }
        }
    }

    /**
     * Extracts text and info-dictionary metadata with pdf-parse.
     *
     * @param {Buffer} content - Raw PDF bytes.
     * @returns {Promise<{text: string, metadata: object}>} Extracted text (""
     *   when the library returns none) and the raw metadata fields.
     * @throws {Error} "pdf-parse extraction failed: ..." on any failure.
     */
    async extractWithPDFParse(content) {
        try {
            // pdf-parse is reached through the bundler's virtual module.
            const virtualModule = await import('../../_virtual/index.js');
            const pdfParse = virtualModule.i;
            const parseFn = (pdfParse.default || pdfParse);
            const options = {
                max: 0, // No page limit
                normalizeWhitespace: false,
                disableCombineTextItems: false,
                // Pinned to avoid version lookups that may touch file paths.
                version: "v1.10.100",
            };
            const result = await parseFn(content, options);
            const metadata = {};
            if (result.info) {
                metadata.title = result.info.Title;
                metadata.author = result.info.Author;
                metadata.subject = result.info.Subject;
                metadata.creator = result.info.Creator;
                metadata.producer = result.info.Producer;
                metadata.createdAt = result.info.CreationDate;
                metadata.modifiedAt = result.info.ModDate;
            }
            return { text: result.text || "", metadata };
        }
        catch (error) {
            throw new Error(`pdf-parse extraction failed: ${error?.message ?? error}`, { cause: error });
        }
    }

    /**
     * Last-resort extraction: scans the latin1-decoded bytes with a few regex
     * patterns and uses the first pattern that matches anything.
     *
     * Never throws. Returns a Korean placeholder sentence when fewer than 10
     * characters are recovered, and a different one when the scan itself fails.
     *
     * @param {Buffer} content - Raw PDF bytes.
     * @returns {string} Recovered text (at most 10000 characters before
     *   clean-up) or a placeholder message.
     */
    extractBasicText(content) {
        try {
            const rawText = content.toString("latin1");
            const textPatterns = [
                // Content between BT ... ET text-object markers
                /BT\s+(.+?)\s+ET/g,
                // Text between parentheses (common in PDF text commands)
                /\(([^)]+)\)/g,
                // Plain runs of printable characters
                /[\x20-\x7E\u00A0-\u00FF\u0100-\u017F\u0180-\u024F\uAC00-\uD7AF\u3131-\u318E\u1100-\u11FF]{10,}/g,
            ];
            let extractedText = "";
            for (const pattern of textPatterns) {
                // Prefer the capture group so the BT/ET markers and the
                // parentheses themselves are not part of the output.
                const pieces = Array.from(rawText.matchAll(pattern), (match) => match[1] ?? match[0]);
                if (pieces.length > 0) {
                    extractedText = pieces.join(" ").substring(0, 10000);
                    break; // Use the first pattern that matches
                }
            }
            extractedText = extractedText
                .replace(/\s+/g, " ")
                .replace(/[^\x20-\x7E\uAC00-\uD7AF\u3131-\u318E\u1100-\u11FF]/g, " ")
                .trim();
            if (extractedText.length < 10) {
                return "PDF 문서를 처리했지만 텍스트를 추출할 수 없습니다. 이미지 기반 PDF이거나 보호된 문서일 수 있습니다.";
            }
            return extractedText;
        }
        catch (error) {
            console.warn("[PDF Loader] Basic text extraction failed:", error);
            return "PDF 문서 처리 중 오류가 발생했습니다.";
        }
    }

    /**
     * Derives a title from the last path segment with its extension removed.
     *
     * @param {string} filePath - File path using `/` or `\` separators.
     * @returns {string} The file name without its final extension; the whole
     *   path is used when the last segment is empty.
     */
    extractTitleFromFilename(filePath) {
        const filename = filePath.split(/[\\/]/).pop() || filePath;
        return filename.replace(/\.[^/.]+$/, ""); // Remove extension
    }

    /**
     * Builds a short description from the first paragraph of the text.
     *
     * Paragraphs are separated by blank lines; only the first one longer than
     * 20 characters is considered, and it is used only if it is longer than 50.
     *
     * @param {string} text - Cleaned document text.
     * @returns {string} Up to 200 characters of that paragraph (with "..."
     *   appended when truncated), or "PDF 문서" when none qualifies.
     */
    extractDescription(text) {
        const paragraphs = text.split("\n\n").filter((p) => p.trim().length > 20);
        const firstParagraph = paragraphs[0]?.trim();
        if (firstParagraph && firstParagraph.length > 50) {
            const suffix = firstParagraph.length > 200 ? "..." : "";
            return firstParagraph.substring(0, 200) + suffix;
        }
        return "PDF 문서";
    }
}
export { PDFDocumentLoader };
//# sourceMappingURL=pdf.js.map