UNPKG

doc-extract

Version:

A Node.js library for reading and extracting text from various document formats (PDF, DOCX, DOC, PPT, PPTX, TXT)

459 lines (458 loc) 18.1 kB
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.DocumentReader = exports.DocumentReaderError = exports.SupportedFormats = void 0; exports.readDocument = readDocument; exports.readDocumentFromBuffer = readDocumentFromBuffer; const fs = __importStar(require("fs/promises")); const path = __importStar(require("path")); const pdf_parse_1 = __importDefault(require("pdf-parse")); const mammoth = __importStar(require("mammoth")); const textract = __importStar(require("textract")); const util_1 = require("util"); var SupportedFormats; (function (SupportedFormats) { SupportedFormats["PDF"] = "pdf"; SupportedFormats["DOCX"] = "docx"; SupportedFormats["DOC"] = "doc"; SupportedFormats["PPTX"] = "pptx"; SupportedFormats["PPT"] = "ppt"; SupportedFormats["TXT"] = "txt"; })(SupportedFormats || (exports.SupportedFormats = SupportedFormats = {})); class DocumentReaderError extends Error { constructor(message, code) { super(message); this.code = code; this.name = 'DocumentReaderError'; } } exports.DocumentReaderError = DocumentReaderError; class DocumentReader { constructor(options) { this.textractFromFile = (0, util_1.promisify)(textract.fromFileWithPath); this.debug = false; this.debug = options?.debug || false; } /** * Read any supported document format */ async readDocument(filePath) { try { await this.validateFile(filePath); const fileExtension = this.getFileExtension(filePath); const stats = await fs.stat(filePath); switch (fileExtension) { case SupportedFormats.PDF: return await this.readPdf(filePath, stats.size); case SupportedFormats.DOCX: return await this.readDocx(filePath, stats.size); case SupportedFormats.TXT: return await this.readTextFile(filePath, stats.size); case SupportedFormats.DOC: case SupportedFormats.PPTX: case SupportedFormats.PPT: return await this.readWithTextract(filePath, stats.size); default: throw new DocumentReaderError(`Unsupported file format: ${fileExtension}`, 'UNSUPPORTED_FORMAT'); } } catch (error) { if (error instanceof DocumentReaderError) { throw error; } this.log(`Error reading document ${filePath}:`, error); throw new DocumentReaderError(`Failed to read document: ${error instanceof Error ? error.message : String(error)}`, 'READ_ERROR'); } } /** * Read multiple documents from file paths */ async readMultipleDocuments(filePaths) { const results = await Promise.allSettled(filePaths.map(filePath => this.readDocument(filePath))); return results.map((result, index) => { if (result.status === 'rejected') { const error = result.reason; this.log(`Failed to read document ${filePaths[index]}:`, error); throw new DocumentReaderError(`Failed to read document ${filePaths[index]}: ${error instanceof Error ? error.message : String(error)}`, 'MULTI_READ_ERROR'); } return result.value; }); } /** * Read PDF file */ async readPdf(filePath, fileSize) { try { const buffer = await fs.readFile(filePath); const data = await (0, pdf_parse_1.default)(buffer); return { text: data.text, metadata: { pages: data.numpages, words: this.countWords(data.text), characters: data.text.length, fileSize: fileSize || buffer.length, fileName: path.basename(filePath), info: data.info, }, }; } catch (error) { this.log(`Error reading PDF ${filePath}:`, error); throw new DocumentReaderError(`Failed to read PDF: ${error instanceof Error ? error.message : String(error)}`, 'PDF_READ_ERROR'); } } /** * Read DOCX file */ async readDocx(filePath, fileSize) { try { // Extract raw text const textResult = await mammoth.extractRawText({ path: filePath }); // Extract HTML (optional) const htmlResult = await mammoth.convertToHtml({ path: filePath }); return { text: textResult.value, html: htmlResult.value, messages: [...textResult.messages, ...htmlResult.messages], metadata: { words: this.countWords(textResult.value), characters: textResult.value.length, fileSize, fileName: path.basename(filePath), }, }; } catch (error) { this.log(`Error reading DOCX ${filePath}:`, error); throw new DocumentReaderError(`Failed to read DOCX: ${error instanceof Error ? error.message : String(error)}`, 'DOCX_READ_ERROR'); } } /** * Read PPT/PPTX files using textract */ async readPowerPoint(filePath, fileSize) { return this.readWithTextract(filePath, fileSize); } /** * Read documents using textract (fallback for various formats) */ async readWithTextract(filePath, fileSize) { try { const text = await this.textractFromFile(filePath); return { text: text || '', metadata: { words: this.countWords(text || ''), characters: text?.length || 0, fileSize, fileName: path.basename(filePath), }, }; } catch (error) { this.log(`Error reading document with textract ${filePath}:`, error); throw new DocumentReaderError(`Failed to read document: ${error instanceof Error ? error.message : String(error)}`, 'TEXTRACT_READ_ERROR'); } } /** * Read document from buffer */ async readDocumentFromBuffer(buffer, fileName, mimeType) { try { const fileExtension = this.getFileExtensionFromName(fileName) || this.getExtensionFromMimeType(mimeType); switch (fileExtension) { case SupportedFormats.PDF: return await this.readPdfFromBuffer(buffer, fileName); case SupportedFormats.DOCX: return await this.readDocxFromBuffer(buffer, fileName); case SupportedFormats.DOC: case SupportedFormats.PPT: case SupportedFormats.PPTX: return await this.readWithTextractFromBuffer(buffer, fileName); case SupportedFormats.TXT: return await this.readTextFromBuffer(buffer, fileName); default: throw new DocumentReaderError(`Unsupported format for buffer reading: ${fileExtension}`, 'UNSUPPORTED_BUFFER_FORMAT'); } } catch (error) { if (error instanceof DocumentReaderError) { throw error; } this.log(`Error reading document from buffer:`, error); throw new DocumentReaderError(`Failed to read document from buffer: ${error instanceof Error ? error.message : String(error)}`, 'BUFFER_READ_ERROR'); } } /** * Read multiple documents from buffers */ async readMultipleFromBuffers(buffers) { const results = await Promise.allSettled(buffers.map(({ buffer, fileName, mimeType }) => this.readDocumentFromBuffer(buffer, fileName, mimeType))); return results.map((result, index) => { if (result.status === 'rejected') { const error = result.reason; this.log(`Failed to read buffer ${buffers[index].fileName}:`, error); throw new DocumentReaderError(`Failed to read buffer ${buffers[index].fileName}: ${error instanceof Error ? error.message : String(error)}`, 'MULTI_BUFFER_READ_ERROR'); } return result.value; }); } /** * Read PDF from buffer */ async readPdfFromBuffer(buffer, fileName) { try { const data = await (0, pdf_parse_1.default)(buffer); return { text: data.text, metadata: { pages: data.numpages, words: this.countWords(data.text), characters: data.text.length, fileSize: buffer.length, fileName, info: data.info, }, }; } catch (error) { this.log(`Error reading PDF from buffer:`, error); throw new DocumentReaderError(`Failed to read PDF from buffer: ${error instanceof Error ? error.message : String(error)}`, 'PDF_BUFFER_READ_ERROR'); } } /** * Read DOCX from buffer */ async readDocxFromBuffer(buffer, fileName) { try { const textResult = await mammoth.extractRawText({ buffer }); const htmlResult = await mammoth.convertToHtml({ buffer }); return { text: textResult.value, html: htmlResult.value, messages: [...textResult.messages, ...htmlResult.messages], metadata: { words: this.countWords(textResult.value), characters: textResult.value.length, fileSize: buffer.length, fileName, }, }; } catch (error) { this.log(`Error reading DOCX from buffer:`, error); throw new DocumentReaderError(`Failed to read DOCX from buffer: ${error instanceof Error ? error.message : String(error)}`, 'DOCX_BUFFER_READ_ERROR'); } } /** * Read text from buffer */ async readTextFromBuffer(buffer, fileName) { try { const text = buffer.toString('utf-8'); return { text, metadata: { words: this.countWords(text), characters: text.length, fileSize: buffer.length, fileName, }, }; } catch (error) { this.log(`Error reading text from buffer:`, error); throw new DocumentReaderError(`Failed to read text from buffer: ${error instanceof Error ? error.message : String(error)}`, 'TEXT_BUFFER_READ_ERROR'); } } /** * Read documents from buffer using textract */ async readWithTextractFromBuffer(buffer, fileName) { try { // Create a temporary file to use with textract const tempDir = path.join(process.cwd(), 'temp'); await fs.mkdir(tempDir, { recursive: true }); const tempFilePath = path.join(tempDir, fileName); try { await fs.writeFile(tempFilePath, buffer); // Add specific configuration for PowerPoint files const options = { preserveLineBreaks: true, preserveOnlyMultipleLineBreaks: true, pdftotextOptions: { layout: 'raw' } }; const text = await this.textractFromFile(tempFilePath); if (!text) { throw new Error('No text content could be extracted from the file'); } return { text: text || '', metadata: { words: this.countWords(text || ''), characters: text?.length || 0, fileSize: buffer.length, fileName, }, }; } finally { // Clean up the temporary file try { await fs.unlink(tempFilePath); } catch (error) { this.log(`Failed to delete temporary file ${tempFilePath}:`, error); } } } catch (error) { this.log(`Error reading document from buffer with textract:`, error); throw new DocumentReaderError(`Failed to read PowerPoint file: ${error instanceof Error ? error.message : String(error)}. Please ensure the file is not corrupted and try again.`, 'TEXTRACT_BUFFER_READ_ERROR'); } } /** * Check if file format is supported */ isFormatSupported(filePath) { const extension = this.getFileExtension(filePath); return Object.values(SupportedFormats).includes(extension); } /** * Check if file format is supported by filename */ isFormatSupportedByName(fileName) { const extension = this.getFileExtensionFromName(fileName); return Object.values(SupportedFormats).includes(extension); } /** * Get supported formats */ getSupportedFormats() { return Object.values(SupportedFormats); } /** * Validate file exists and is readable */ async validateFile(filePath) { try { await fs.access(filePath, fs.constants.R_OK); const stats = await fs.stat(filePath); if (!stats.isFile()) { throw new DocumentReaderError('Path is not a file', 'INVALID_FILE_PATH'); } if (!this.isFormatSupported(filePath)) { throw new DocumentReaderError(`Unsupported file format. Supported formats: ${this.getSupportedFormats().join(', ')}`, 'UNSUPPORTED_FORMAT'); } } catch (error) { if (error instanceof DocumentReaderError) { throw error; } throw new DocumentReaderError(`File validation failed: ${error instanceof Error ? error.message : String(error)}`, 'VALIDATION_ERROR'); } } /** * Utility methods */ getFileExtension(filePath) { return path.extname(filePath).toLowerCase().slice(1); } getFileExtensionFromName(fileName) { return path.extname(fileName).toLowerCase().slice(1); } getExtensionFromMimeType(mimeType) { const mimeMap = { 'application/pdf': 'pdf', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx', 'application/msword': 'doc', 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx', 'application/vnd.ms-powerpoint': 'ppt', 'text/plain': 'txt', }; return mimeType ? mimeMap[mimeType] || '' : ''; } countWords(text) { return text .trim() .split(/\s+/) .filter((word) => word.length > 0).length; } /** * Read text file */ async readTextFile(filePath, fileSize) { try { const text = await fs.readFile(filePath, 'utf-8'); return { text, metadata: { words: this.countWords(text), characters: text.length, fileSize, fileName: path.basename(filePath), }, }; } catch (error) { this.log(`Error reading text file ${filePath}:`, error); throw new DocumentReaderError(`Failed to read text file: ${error instanceof Error ? error.message : String(error)}`, 'TEXT_READ_ERROR'); } } log(message, ...args) { if (this.debug) { console.log(`[DocumentReader] ${message}`, ...args); } } } exports.DocumentReader = DocumentReader; // Convenience function for quick usage async function readDocument(filePath) { const reader = new DocumentReader(); return reader.readDocument(filePath); } async function readDocumentFromBuffer(buffer, fileName, mimeType) { const reader = new DocumentReader(); return reader.readDocumentFromBuffer(buffer, fileName, mimeType); } // Export the main class as default exports.default = DocumentReader;