UNPKG

@restnfeel/agentc-starter-kit

Version:

한국어 기업용 CMS 모듈 - Task Master AI와 함께 빠르게 웹사이트를 구현할 수 있는 재사용 가능한 컴포넌트 시스템

208 lines (205 loc) 9.56 kB
import { BaseDocumentLoader } from './base.js';

/** Extractions whose trimmed text is shorter than this are treated as empty. */
const MIN_MEANINGFUL_TEXT_LENGTH = 10;

/** Upper bound (in characters) on the text kept by the raw-byte fallback extractor. */
const BASIC_TEXT_MAX_LENGTH = 10000;

/**
 * PDF date string: `D:YYYYMMDDHHmmSSOHH'mm'`. Every field after the year is
 * optional; `O` is `+`, `-` or `Z` and gives the relation of local time to UT.
 */
const PDF_DATE_PATTERN = /^D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?([Zz+-])?(?:(\d{2})'?)?(?:(\d{2})'?)?/;

/**
 * Produce a printable message for any thrown value.
 *
 * @param {unknown} error - The caught value (not necessarily an Error).
 * @returns {string} `error.message` when it is a string, otherwise `String(error)`.
 */
function describeError(error) {
    if (error != null && typeof error.message === "string") {
        return error.message;
    }
    return String(error);
}

/**
 * Convert a date found in PDF metadata into a valid `Date`.
 *
 * PDF date strings (`D:...`) are not understood by the `Date` constructor, so
 * they are decoded field by field. A string without a time-zone designator is
 * interpreted as UTC. Any other value is handed to `new Date(value)`.
 *
 * @param {unknown} value - Raw metadata value (PDF date string, Date, timestamp, ...).
 * @returns {Date|null} A valid Date, or `null` when the value cannot be parsed.
 */
function parsePdfDate(value) {
    if (value instanceof Date) {
        return Number.isNaN(value.getTime()) ? null : value;
    }
    if (typeof value === "string") {
        const match = PDF_DATE_PATTERN.exec(value.trim());
        if (match) {
            const [
                ,
                year,
                month = "01",
                day = "01",
                hour = "00",
                minute = "00",
                second = "00",
                zone,
                zoneHours = "00",
                zoneMinutes = "00",
            ] = match;
            const monthNumber = Number(month);
            const dayNumber = Number(day);
            // Date.UTC would silently roll over out-of-range fields; reject them instead.
            if (monthNumber < 1 || monthNumber > 12 || dayNumber < 1 || dayNumber > 31) {
                return null;
            }
            let offsetMinutes = 0;
            if (zone === "+" || zone === "-") {
                offsetMinutes = Number(zoneHours) * 60 + Number(zoneMinutes);
                if (zone === "-") {
                    offsetMinutes = -offsetMinutes;
                }
            }
            const localAsUtc = Date.UTC(Number(year), monthNumber - 1, dayNumber, Number(hour), Number(minute), Number(second));
            // "+HH'mm'" means local time is ahead of UT, so subtract the offset to get UT.
            return new Date(localAsUtc - offsetMinutes * 60000);
        }
    }
    const fallback = new Date(value);
    return Number.isNaN(fallback.getTime()) ? null : fallback;
}

/**
 * Map a PDF document-information dictionary onto the loader's metadata keys.
 *
 * @param {object|null|undefined} info - Object with `Title`, `Author`, `Subject`,
 *   `Creator`, `Producer`, `CreationDate` and `ModDate` properties (any may be missing).
 * @returns {object} Metadata with `title`, `author`, `subject`, `creator`,
 *   `producer`, `createdAt` and `modifiedAt`; an empty object when `info` is falsy.
 */
function pdfInfoToMetadata(info) {
    if (!info) {
        return {};
    }
    return {
        title: info.Title,
        author: info.Author,
        subject: info.Subject,
        creator: info.Creator,
        producer: info.Producer,
        createdAt: info.CreationDate,
        modifiedAt: info.ModDate,
    };
}

/**
 * Document loader for `.pdf` files.
 *
 * Text is extracted with PDF.js first, then pdf-parse, then a raw byte scan as
 * a last resort. `cleanText`, `createBaseMetadata` and `generateDocumentId` are
 * not defined here and are expected to come from `BaseDocumentLoader`.
 */
class PDFDocumentLoader extends BaseDocumentLoader {
    constructor() {
        super(...arguments);
        this.supportedExtensions = ["pdf"];
    }

    /**
     * Load a PDF and turn it into a document record.
     *
     * @param {string} filePath - Path of the PDF; used for logging, title fallback, id and `source`.
     * @param {Buffer} content - Raw PDF bytes (assumed to be a Node Buffer; `extractBasicText`
     *   relies on `toString("latin1")` — TODO confirm with callers).
     * @returns {Promise<{id: *, content: *, metadata: object, source: string}>} The loaded document.
     * @throws {Error} "Failed to load PDF document: ..." with the underlying error attached as `cause`.
     */
    async load(filePath, content) {
        try {
            console.log(`[PDF Loader] Processing: ${filePath}, size: ${content.length} bytes`);
            let extractedText = "";
            let metadata = {};
            // Method 1: PDF.js
            try {
                const result = await this.extractWithPDFJS(content);
                extractedText = result.text;
                metadata = result.metadata;
                console.log(`[PDF Loader] Successfully extracted ${extractedText.length} characters with PDF.js`);
            }
            catch (pdfjsError) {
                console.warn(`[PDF Loader] PDF.js failed:`, pdfjsError);
                // Method 2: pdf-parse
                try {
                    const result = await this.extractWithPDFParse(content);
                    extractedText = result.text;
                    metadata = result.metadata;
                    console.log(`[PDF Loader] Successfully extracted ${extractedText.length} characters with pdf-parse`);
                }
                catch (parseError) {
                    console.warn(`[PDF Loader] pdf-parse failed:`, parseError);
                    // Method 3: raw byte scan; yields no metadata
                    extractedText = this.extractBasicText(content);
                    console.log(`[PDF Loader] Using basic text extraction, got ${extractedText.length} characters`);
                }
            }
            if (!extractedText || extractedText.trim().length < MIN_MEANINGFUL_TEXT_LENGTH) {
                throw new Error("Unable to extract meaningful text from PDF");
            }
            const cleanedText = this.cleanText(extractedText);
            const docMetadata = this.createBaseMetadata(filePath, content.length);
            // Merge extracted metadata over the base metadata
            docMetadata.title = metadata.title || this.extractTitleFromFilename(filePath);
            docMetadata.author = metadata.author;
            docMetadata.description = this.extractDescription(cleanedText);
            // Only override createdAt when the PDF date actually parses; an
            // Invalid Date would be worse than the base value.
            const createdAt = metadata.createdAt ? parsePdfDate(metadata.createdAt) : null;
            if (createdAt) {
                docMetadata.createdAt = createdAt;
            }
            return {
                id: this.generateDocumentId(filePath),
                content: cleanedText,
                metadata: docMetadata,
                source: filePath,
            };
        }
        catch (error) {
            throw new Error(`Failed to load PDF document: ${error}`, { cause: error });
        }
    }

    /**
     * Extract text and metadata with PDF.js (`pdfjs-dist`).
     *
     * Pages that fail to extract are logged and skipped; a metadata failure is
     * logged and leaves the metadata empty.
     *
     * @param {Buffer} content - Raw PDF bytes.
     * @returns {Promise<{text: string, metadata: object}>} Trimmed text of all readable pages and the metadata.
     * @throws {Error} "PDF.js extraction failed: ..." with the underlying error attached as `cause`.
     */
    async extractWithPDFJS(content) {
        let loadingTask;
        try {
            const pdfjsModule = await import('../../node_modules/pdfjs-dist/build/pdf.js');
            // Depending on module interop the API may sit on the namespace itself or on
            // its default export (same handling as the pdf-parse import in this file).
            const pdfjsLib = typeof pdfjsModule.getDocument === "function"
                ? pdfjsModule
                : pdfjsModule.default || pdfjsModule;
            const pdfData = new Uint8Array(content);
            loadingTask = pdfjsLib.getDocument({
                data: pdfData,
                verbosity: 0, // Suppress console output
            });
            const pdfDocument = await loadingTask.promise;
            let metadata = {};
            try {
                const pdfMetadata = await pdfDocument.getMetadata();
                metadata = pdfInfoToMetadata(pdfMetadata.info);
            }
            catch (metaError) {
                console.warn("[PDF Loader] Failed to extract PDF metadata:", metaError);
            }
            const numPages = pdfDocument.numPages;
            console.log(`[PDF Loader] PDF has ${numPages} pages`);
            const pageTexts = [];
            // Pages are read one at a time so the output keeps document order.
            for (let pageNumber = 1; pageNumber <= numPages; pageNumber++) {
                try {
                    const page = await pdfDocument.getPage(pageNumber);
                    const textContent = await page.getTextContent();
                    const pageText = textContent.items
                        .map((item) => {
                        // Items may be plain strings or objects carrying `str` / `text`
                        if (typeof item === "string")
                            return item;
                        if (item.str)
                            return item.str;
                        if (item.text)
                            return item.text;
                        return "";
                    })
                        .join(" ");
                    pageTexts.push(pageText);
                }
                catch (pageError) {
                    console.warn(`[PDF Loader] Failed to extract page ${pageNumber}:`, pageError);
                }
            }
            return { text: pageTexts.join("\n\n").trim(), metadata };
        }
        catch (error) {
            throw new Error(`PDF.js extraction failed: ${describeError(error)}`, { cause: error });
        }
        finally {
            // Release the document/worker held by the loading task; without this
            // every processed PDF stays allocated.
            if (loadingTask && typeof loadingTask.destroy === "function") {
                try {
                    await loadingTask.destroy();
                }
                catch (destroyError) {
                    console.warn("[PDF Loader] Failed to release PDF.js resources:", destroyError);
                }
            }
        }
    }

    /**
     * Extract text and metadata with pdf-parse (bundled under `_virtual/index.js`).
     *
     * @param {Buffer} content - Raw PDF bytes.
     * @returns {Promise<{text: string, metadata: object}>} Extracted text (empty string when absent) and metadata.
     * @throws {Error} "pdf-parse extraction failed: ..." with the underlying error attached as `cause`.
     */
    async extractWithPDFParse(content) {
        try {
            const virtualModule = await import('../../_virtual/index.js');
            const pdfParse = virtualModule.i;
            const parseFn = (pdfParse.default || pdfParse);
            const options = {
                max: 0, // No page limit
                // NOTE(review): these two look like page-render options; verify that
                // pdf-parse honours them when passed at the top level.
                normalizeWhitespace: false,
                disableCombineTextItems: false,
                // Pin the PDF.js build pdf-parse should use
                version: "v1.10.100",
            };
            const result = await parseFn(content, options);
            return { text: result.text || "", metadata: pdfInfoToMetadata(result.info) };
        }
        catch (error) {
            throw new Error(`pdf-parse extraction failed: ${describeError(error)}`, { cause: error });
        }
    }

    /**
     * Last-resort extraction: decode the bytes as latin1 and scan for text-like runs.
     *
     * The patterns are tried in order and only the first one that matches is
     * used. Never throws: on failure, or when fewer than 10 characters remain,
     * a fixed placeholder message is returned instead.
     *
     * @param {Buffer} content - Raw PDF bytes.
     * @returns {string} Extracted text or a placeholder message.
     */
    extractBasicText(content) {
        try {
            const text = content.toString("latin1");
            const textPatterns = [
                // BT ... ET text objects (`.` does not cross line breaks, so single-line only)
                /BT\s+(.+?)\s+ET/g,
                // Text between parentheses (string operands of PDF text operators)
                /\(([^)]+)\)/g,
                // Runs of at least 10 printable characters
                /[\x20-\x7E\u00A0-\u00FF\u0100-\u017F\u0180-\u024F\uAC00-\uD7AF\u3131-\u318E\u1100-\u11FF]{10,}/g,
            ];
            let extractedText = "";
            for (const pattern of textPatterns) {
                const matches = text.match(pattern);
                if (matches && matches.length > 0) {
                    extractedText += matches.join(" ").substring(0, BASIC_TEXT_MAX_LENGTH);
                    break; // Use first successful pattern
                }
            }
            // Replace disallowed characters first, then collapse whitespace, so the
            // replacement spaces do not leave runs of blanks behind.
            extractedText = extractedText
                .replace(/[^\x20-\x7E\uAC00-\uD7AF\u3131-\u318E\u1100-\u11FF]/g, " ")
                .replace(/\s+/g, " ")
                .trim();
            if (extractedText.length < MIN_MEANINGFUL_TEXT_LENGTH) {
                return "PDF 문서를 처리했지만 텍스트를 추출할 수 없습니다. 이미지 기반 PDF이거나 보호된 문서일 수 있습니다.";
            }
            return extractedText;
        }
        catch (error) {
            console.warn("[PDF Loader] Basic text extraction failed:", error);
            return "PDF 문서 처리 중 오류가 발생했습니다.";
        }
    }

    /**
     * Derive a title from a path: the last path segment without its extension.
     *
     * @param {string} filePath - Path using `/` or `\` separators.
     * @returns {string} The file name without extension; the full input when the last segment is empty.
     */
    extractTitleFromFilename(filePath) {
        const filename = filePath.split(/[\\/]/).pop() || filePath;
        return filename.replace(/\.[^\\/.]+$/, ""); // Remove extension
    }

    /**
     * Build a short description from the first substantial paragraph.
     *
     * Paragraphs are separated by a blank line. The first one longer than 20
     * characters (trimmed) is used if it also exceeds 50 characters; it is cut
     * to 200 characters with a trailing "..." when longer.
     *
     * @param {string} text - Cleaned document text.
     * @returns {string} The description, or "PDF 문서" when no paragraph qualifies.
     */
    extractDescription(text) {
        const paragraphs = text.split("\n\n").filter((p) => p.trim().length > 20);
        const firstParagraph = paragraphs.length > 0 ? paragraphs[0].trim() : undefined;
        if (firstParagraph && firstParagraph.length > 50) {
            const suffix = firstParagraph.length > 200 ? "..." : "";
            return firstParagraph.substring(0, 200) + suffix;
        }
        return "PDF 문서";
    }
}

export { PDFDocumentLoader };
//# sourceMappingURL=pdf.js.map