/*
 * doc-extract
 * Version: (not specified)
 * A Node.js library for reading and extracting text from various document formats (PDF, DOCX, DOC, PPT, PPTX, TXT).
 * Original listing: 459 lines (458 loc), 18.1 kB.
 * Language: JavaScript
 */
;
// Module-interop helper (appears to be compiler-generated): exposes property `k` of module `m`
// on object `o` under the name `k2` (`k2` defaults to `k`).
// An implementation already present on `this` is reused when there is one.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Substitute an enumerable getter that reads m[k] on every access when: there is no own descriptor,
// the descriptor is an accessor and `m` is not flagged __esModule, or it is a data descriptor that is
// writable or configurable. Otherwise the original descriptor is reused as-is.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback when Object.create is unavailable: copy the current value with a plain assignment.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// Module-interop helper: stores `v` as the "default" property of `o`.
// Uses Object.defineProperty (enumerable data property) when Object.create exists,
// otherwise a plain assignment. An implementation already present on `this` is reused.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
// Module-interop helper: builds a namespace-style object for a required module.
// - A module flagged __esModule is returned unchanged.
// - Otherwise a new object is created, every own property except "default" is bound onto it
//   through __createBinding, and the module itself is stored as its "default".
// An implementation already present on `this` is reused when there is one.
var __importStar = (this && this.__importStar) || (function () {
// `ownKeys` replaces itself on first call with Object.getOwnPropertyNames, or with a
// for-in/hasOwnProperty fallback when that function is unavailable.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
// `mod != null` skips both null and undefined; "default" is set separately below.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
// Module-interop helper: returns `mod` unchanged when it is flagged __esModule,
// otherwise wraps it as `{ default: mod }`. An implementation already present on `this` is reused.
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.DocumentReader = exports.DocumentReaderError = exports.SupportedFormats = void 0;
exports.readDocument = readDocument;
exports.readDocumentFromBuffer = readDocumentFromBuffer;
const fs = __importStar(require("fs/promises"));
const os = __importStar(require("os"));
const path = __importStar(require("path"));
const pdf_parse_1 = __importDefault(require("pdf-parse"));
const mammoth = __importStar(require("mammoth"));
const textract = __importStar(require("textract"));
const util_1 = require("util");
/**
 * File formats the reader can handle. Keys are the upper-case format names,
 * values are the lower-case file extensions (without the leading dot).
 * The same object is published as `exports.SupportedFormats`.
 */
var SupportedFormats;
(function (formats) {
    // Object.assign copies the entries in listed order, so key order matches
    // a sequence of individual assignments.
    Object.assign(formats, {
        PDF: "pdf",
        DOCX: "docx",
        DOC: "doc",
        PPTX: "pptx",
        PPT: "ppt",
        TXT: "txt",
    });
})(SupportedFormats || (exports.SupportedFormats = SupportedFormats = {}));
/**
 * Error raised for document-reading failures. `code` is a machine-readable
 * identifier for the failing stage (for example 'READ_ERROR').
 */
class DocumentReaderError extends Error {
    /**
     * @param {string} message - Human-readable description of the failure.
     * @param {string} code - Machine-readable error code.
     * @param {{ cause?: unknown }} [options] - Optional; `cause` carries the
     *   underlying error so its message and stack are not lost when wrapping.
     */
    constructor(message, code, options) {
        const cause = options?.cause;
        // Only forward an options bag when a cause was supplied, so errors built
        // with the two-argument form do not gain an own `cause` property.
        super(message, cause === undefined ? undefined : { cause });
        this.code = code;
        this.name = 'DocumentReaderError';
    }
}
exports.DocumentReaderError = DocumentReaderError;
/**
 * Reads supported document formats (PDF, DOCX, DOC, PPT, PPTX, TXT) from disk
 * or from in-memory buffers and returns the extracted text with basic metadata.
 *
 * Failures are reported as DocumentReaderError instances whose `code` names the
 * failing stage; when an underlying error is wrapped it is kept on `cause`.
 */
class DocumentReader {
    /**
     * @param {{ debug?: boolean }} [options] - Pass `debug: true` to log
     *   diagnostics through console.log.
     */
    constructor(options) {
        this.textractFromFile = (0, util_1.promisify)(textract.fromFileWithPath);
        this.debug = options?.debug || false;
    }
    /**
     * Read any supported document format from disk.
     *
     * @param {string} filePath - Path to the document.
     * @returns {Promise<object>} `{ text, metadata }` (DOCX results also carry `html` and `messages`).
     * @throws {DocumentReaderError} On validation failure, unsupported format or read failure.
     */
    async readDocument(filePath) {
        try {
            await this.validateFile(filePath);
            const fileExtension = this.getFileExtension(filePath);
            const stats = await fs.stat(filePath);
            switch (fileExtension) {
                case SupportedFormats.PDF:
                    return await this.readPdf(filePath, stats.size);
                case SupportedFormats.DOCX:
                    return await this.readDocx(filePath, stats.size);
                case SupportedFormats.TXT:
                    return await this.readTextFile(filePath, stats.size);
                case SupportedFormats.DOC:
                case SupportedFormats.PPTX:
                case SupportedFormats.PPT:
                    return await this.readWithTextract(filePath, stats.size);
                default:
                    throw new DocumentReaderError(`Unsupported file format: ${fileExtension}`, 'UNSUPPORTED_FORMAT');
            }
        }
        catch (error) {
            // Errors already classified by a format-specific reader pass through untouched.
            if (error instanceof DocumentReaderError) {
                throw error;
            }
            this.log(`Error reading document ${filePath}:`, error);
            throw this.wrapError('Failed to read document', error, 'READ_ERROR');
        }
    }
    /**
     * Read multiple documents from file paths.
     *
     * All reads are started together and allowed to settle; if any failed, the
     * first failure (in input order) is rethrown as MULTI_READ_ERROR.
     *
     * @param {string[]} filePaths
     * @returns {Promise<object[]>} Results in the same order as `filePaths`.
     * @throws {DocumentReaderError}
     */
    async readMultipleDocuments(filePaths) {
        const results = await Promise.allSettled(filePaths.map((filePath) => this.readDocument(filePath)));
        return this.unwrapSettled(results, (index) => `document ${filePaths[index]}`, 'MULTI_READ_ERROR');
    }
    /**
     * Read PDF file.
     *
     * @param {string} filePath
     * @param {number} [fileSize] - Size in bytes; falls back to the length of the bytes read.
     * @returns {Promise<object>}
     * @throws {DocumentReaderError} PDF_READ_ERROR
     */
    async readPdf(filePath, fileSize) {
        try {
            const buffer = await fs.readFile(filePath);
            const data = await (0, pdf_parse_1.default)(buffer);
            return {
                text: data.text,
                metadata: {
                    pages: data.numpages,
                    ...this.describeText(data.text),
                    fileSize: fileSize || buffer.length,
                    fileName: path.basename(filePath),
                    info: data.info,
                },
            };
        }
        catch (error) {
            this.log(`Error reading PDF ${filePath}:`, error);
            throw this.wrapError('Failed to read PDF', error, 'PDF_READ_ERROR');
        }
    }
    /**
     * Read DOCX file, returning both the raw text and an HTML rendering.
     *
     * @param {string} filePath
     * @param {number} [fileSize]
     * @returns {Promise<object>}
     * @throws {DocumentReaderError} DOCX_READ_ERROR
     */
    async readDocx(filePath, fileSize) {
        try {
            const textResult = await mammoth.extractRawText({ path: filePath });
            const htmlResult = await mammoth.convertToHtml({ path: filePath });
            return {
                text: textResult.value,
                html: htmlResult.value,
                messages: [...textResult.messages, ...htmlResult.messages],
                metadata: {
                    ...this.describeText(textResult.value),
                    fileSize,
                    fileName: path.basename(filePath),
                },
            };
        }
        catch (error) {
            this.log(`Error reading DOCX ${filePath}:`, error);
            throw this.wrapError('Failed to read DOCX', error, 'DOCX_READ_ERROR');
        }
    }
    /**
     * Read PPT/PPTX files using textract.
     *
     * @param {string} filePath
     * @param {number} [fileSize]
     * @returns {Promise<object>}
     */
    async readPowerPoint(filePath, fileSize) {
        return this.readWithTextract(filePath, fileSize);
    }
    /**
     * Read documents using textract (used for DOC, PPT and PPTX).
     * An empty extraction result is returned as an empty string.
     *
     * @param {string} filePath
     * @param {number} [fileSize]
     * @returns {Promise<object>}
     * @throws {DocumentReaderError} TEXTRACT_READ_ERROR
     */
    async readWithTextract(filePath, fileSize) {
        try {
            const text = (await this.textractFromFile(filePath)) || '';
            return {
                text,
                metadata: {
                    ...this.describeText(text),
                    fileSize,
                    fileName: path.basename(filePath),
                },
            };
        }
        catch (error) {
            this.log(`Error reading document with textract ${filePath}:`, error);
            throw this.wrapError('Failed to read document', error, 'TEXTRACT_READ_ERROR');
        }
    }
    /**
     * Read document from buffer.
     *
     * The format is taken from the extension of `fileName`; when the name is
     * missing or has no extension, `mimeType` is used instead.
     *
     * @param {Buffer} buffer - Raw document bytes.
     * @param {string} [fileName] - Original file name; echoed back in the metadata.
     * @param {string} [mimeType] - MIME type, optionally with parameters.
     * @returns {Promise<object>}
     * @throws {DocumentReaderError} UNSUPPORTED_BUFFER_FORMAT, BUFFER_READ_ERROR or a format-specific code.
     */
    async readDocumentFromBuffer(buffer, fileName, mimeType) {
        try {
            const fileExtension = this.getFileExtensionFromName(fileName) ||
                this.getExtensionFromMimeType(mimeType);
            switch (fileExtension) {
                case SupportedFormats.PDF:
                    return await this.readPdfFromBuffer(buffer, fileName);
                case SupportedFormats.DOCX:
                    return await this.readDocxFromBuffer(buffer, fileName);
                case SupportedFormats.DOC:
                case SupportedFormats.PPT:
                case SupportedFormats.PPTX:
                    // The detected extension is forwarded so the temporary file can be
                    // given a usable suffix even when fileName has none.
                    return await this.readWithTextractFromBuffer(buffer, fileName, fileExtension);
                case SupportedFormats.TXT:
                    return await this.readTextFromBuffer(buffer, fileName);
                default:
                    throw new DocumentReaderError(`Unsupported format for buffer reading: ${fileExtension}`, 'UNSUPPORTED_BUFFER_FORMAT');
            }
        }
        catch (error) {
            if (error instanceof DocumentReaderError) {
                throw error;
            }
            this.log(`Error reading document from buffer:`, error);
            throw this.wrapError('Failed to read document from buffer', error, 'BUFFER_READ_ERROR');
        }
    }
    /**
     * Read multiple documents from buffers.
     *
     * All reads are started together and allowed to settle; if any failed, the
     * first failure (in input order) is rethrown as MULTI_BUFFER_READ_ERROR.
     *
     * @param {Array<{ buffer: Buffer, fileName: string, mimeType?: string }>} buffers
     * @returns {Promise<object[]>} Results in the same order as `buffers`.
     * @throws {DocumentReaderError}
     */
    async readMultipleFromBuffers(buffers) {
        const results = await Promise.allSettled(buffers.map(({ buffer, fileName, mimeType }) => this.readDocumentFromBuffer(buffer, fileName, mimeType)));
        return this.unwrapSettled(results, (index) => `buffer ${buffers[index].fileName}`, 'MULTI_BUFFER_READ_ERROR');
    }
    /**
     * Read PDF from buffer.
     *
     * @param {Buffer} buffer
     * @param {string} [fileName]
     * @returns {Promise<object>}
     * @throws {DocumentReaderError} PDF_BUFFER_READ_ERROR
     */
    async readPdfFromBuffer(buffer, fileName) {
        try {
            const data = await (0, pdf_parse_1.default)(buffer);
            return {
                text: data.text,
                metadata: {
                    pages: data.numpages,
                    ...this.describeText(data.text),
                    fileSize: buffer.length,
                    fileName,
                    info: data.info,
                },
            };
        }
        catch (error) {
            this.log(`Error reading PDF from buffer:`, error);
            throw this.wrapError('Failed to read PDF from buffer', error, 'PDF_BUFFER_READ_ERROR');
        }
    }
    /**
     * Read DOCX from buffer, returning both the raw text and an HTML rendering.
     *
     * @param {Buffer} buffer
     * @param {string} [fileName]
     * @returns {Promise<object>}
     * @throws {DocumentReaderError} DOCX_BUFFER_READ_ERROR
     */
    async readDocxFromBuffer(buffer, fileName) {
        try {
            const textResult = await mammoth.extractRawText({ buffer });
            const htmlResult = await mammoth.convertToHtml({ buffer });
            return {
                text: textResult.value,
                html: htmlResult.value,
                messages: [...textResult.messages, ...htmlResult.messages],
                metadata: {
                    ...this.describeText(textResult.value),
                    fileSize: buffer.length,
                    fileName,
                },
            };
        }
        catch (error) {
            this.log(`Error reading DOCX from buffer:`, error);
            throw this.wrapError('Failed to read DOCX from buffer', error, 'DOCX_BUFFER_READ_ERROR');
        }
    }
    /**
     * Read text from buffer, decoding it as UTF-8.
     *
     * @param {Buffer} buffer
     * @param {string} [fileName]
     * @returns {Promise<object>}
     * @throws {DocumentReaderError} TEXT_BUFFER_READ_ERROR
     */
    async readTextFromBuffer(buffer, fileName) {
        try {
            const text = buffer.toString('utf-8');
            return {
                text,
                metadata: {
                    ...this.describeText(text),
                    fileSize: buffer.length,
                    fileName,
                },
            };
        }
        catch (error) {
            this.log(`Error reading text from buffer:`, error);
            throw this.wrapError('Failed to read text from buffer', error, 'TEXT_BUFFER_READ_ERROR');
        }
    }
    /**
     * Read documents from buffer using textract.
     *
     * textract is invoked with a file path, so the bytes are written to a file
     * inside a freshly created private directory under the OS temp directory.
     * The directory is removed afterwards whether or not extraction succeeded.
     *
     * @param {Buffer} buffer
     * @param {string} [fileName] - Caller-supplied name; only its base name is
     *   used on disk, the original value is echoed in the metadata.
     * @param {string} [extension] - Detected format extension (no dot), appended
     *   to the temporary file name when `fileName` has no extension of its own.
     * @returns {Promise<object>}
     * @throws {DocumentReaderError} TEXTRACT_BUFFER_READ_ERROR (also when no text was extracted).
     */
    async readWithTextractFromBuffer(buffer, fileName, extension) {
        let tempDir;
        try {
            // A unique directory per call prevents concurrent reads of equally named
            // uploads from overwriting or deleting each other's temporary file.
            tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'doc-extract-'));
            const tempFilePath = path.join(tempDir, this.buildTempFileName(fileName, extension));
            await fs.writeFile(tempFilePath, buffer);
            // NOTE(review): an extraction-options object (preserveLineBreaks etc.) used to be
            // built here but was never passed to textract; it was removed as dead code, so the
            // extraction output is unchanged.
            const text = await this.textractFromFile(tempFilePath);
            if (!text) {
                throw new Error('No text content could be extracted from the file');
            }
            return {
                text,
                metadata: {
                    ...this.describeText(text),
                    fileSize: buffer.length,
                    fileName,
                },
            };
        }
        catch (error) {
            this.log(`Error reading document from buffer with textract:`, error);
            throw this.wrapError('Failed to read document from buffer', error, 'TEXTRACT_BUFFER_READ_ERROR', '. Please ensure the file is not corrupted and try again.');
        }
        finally {
            if (tempDir !== undefined) {
                try {
                    await fs.rm(tempDir, { recursive: true, force: true });
                }
                catch (error) {
                    // Cleanup is best-effort: a leftover temp directory must not mask the result.
                    this.log(`Failed to delete temporary directory ${tempDir}:`, error);
                }
            }
        }
    }
    /**
     * Check if file format is supported.
     *
     * @param {string} filePath
     * @returns {boolean}
     */
    isFormatSupported(filePath) {
        const extension = this.getFileExtension(filePath);
        return Object.values(SupportedFormats).includes(extension);
    }
    /**
     * Check if file format is supported by filename.
     *
     * @param {string} fileName
     * @returns {boolean}
     */
    isFormatSupportedByName(fileName) {
        const extension = this.getFileExtensionFromName(fileName);
        return Object.values(SupportedFormats).includes(extension);
    }
    /**
     * Get supported formats.
     *
     * @returns {string[]} Lower-case extensions without the leading dot.
     */
    getSupportedFormats() {
        return Object.values(SupportedFormats);
    }
    /**
     * Validate that the path is readable, is a regular file and has a supported extension.
     *
     * @param {string} filePath
     * @returns {Promise<void>}
     * @throws {DocumentReaderError} INVALID_FILE_PATH, UNSUPPORTED_FORMAT or VALIDATION_ERROR.
     */
    async validateFile(filePath) {
        try {
            await fs.access(filePath, fs.constants.R_OK);
            const stats = await fs.stat(filePath);
            if (!stats.isFile()) {
                throw new DocumentReaderError('Path is not a file', 'INVALID_FILE_PATH');
            }
            if (!this.isFormatSupported(filePath)) {
                throw new DocumentReaderError(`Unsupported file format. Supported formats: ${this.getSupportedFormats().join(', ')}`, 'UNSUPPORTED_FORMAT');
            }
        }
        catch (error) {
            if (error instanceof DocumentReaderError) {
                throw error;
            }
            throw this.wrapError('File validation failed', error, 'VALIDATION_ERROR');
        }
    }
    /**
     * Lower-case extension of a path without the leading dot.
     *
     * @param {string} filePath
     * @returns {string} The extension, or '' when there is none or the input is not a string.
     */
    getFileExtension(filePath) {
        if (typeof filePath !== 'string') {
            return '';
        }
        return path.extname(filePath).toLowerCase().slice(1);
    }
    /**
     * Lower-case extension of a file name without the leading dot.
     *
     * @param {string} fileName
     * @returns {string} The extension, or '' when there is none or the input is not a string.
     */
    getFileExtensionFromName(fileName) {
        return this.getFileExtension(fileName);
    }
    /**
     * Map a MIME type to a supported extension.
     *
     * Parameters (e.g. `; charset=utf-8`), surrounding whitespace and letter
     * case are ignored.
     *
     * @param {string} [mimeType]
     * @returns {string} The extension, or '' for unknown or missing MIME types.
     */
    getExtensionFromMimeType(mimeType) {
        if (typeof mimeType !== 'string') {
            return '';
        }
        // A Map avoids matching inherited object keys such as "constructor".
        const mimeMap = new Map([
            ['application/pdf', 'pdf'],
            ['application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'docx'],
            ['application/msword', 'doc'],
            ['application/vnd.openxmlformats-officedocument.presentationml.presentation', 'pptx'],
            ['application/vnd.ms-powerpoint', 'ppt'],
            ['text/plain', 'txt'],
        ]);
        const essence = mimeType.split(';', 1)[0].trim().toLowerCase();
        return mimeMap.get(essence) ?? '';
    }
    /**
     * Count whitespace-separated words.
     *
     * @param {string} text
     * @returns {number}
     */
    countWords(text) {
        return text
            .trim()
            .split(/\s+/)
            .filter((word) => word.length > 0).length;
    }
    /**
     * Read text file as UTF-8.
     *
     * @param {string} filePath
     * @param {number} [fileSize]
     * @returns {Promise<object>}
     * @throws {DocumentReaderError} TEXT_READ_ERROR
     */
    async readTextFile(filePath, fileSize) {
        try {
            const text = await fs.readFile(filePath, 'utf-8');
            return {
                text,
                metadata: {
                    ...this.describeText(text),
                    fileSize,
                    fileName: path.basename(filePath),
                },
            };
        }
        catch (error) {
            this.log(`Error reading text file ${filePath}:`, error);
            throw this.wrapError('Failed to read text file', error, 'TEXT_READ_ERROR');
        }
    }
    /**
     * Write a diagnostic message to the console when debug mode is on.
     *
     * @param {string} message
     * @param {...unknown} args
     */
    log(message, ...args) {
        if (this.debug) {
            console.log(`[DocumentReader] ${message}`, ...args);
        }
    }
    /**
     * Word and character counts shared by every metadata object.
     *
     * @param {string} text
     * @returns {{ words: number, characters: number }}
     */
    describeText(text) {
        return {
            words: this.countWords(text),
            characters: text.length,
        };
    }
    /**
     * Build a DocumentReaderError of the form `${prefix}: ${detail}${suffix}` and
     * record the original error on `cause` so its stack is not lost.
     *
     * @param {string} prefix
     * @param {unknown} error - The caught value.
     * @param {string} code
     * @param {string} [suffix]
     * @returns {DocumentReaderError}
     */
    wrapError(prefix, error, code, suffix = '') {
        const detail = error instanceof Error ? error.message : String(error);
        const wrapped = new DocumentReaderError(`${prefix}: ${detail}${suffix}`, code);
        if (wrapped.cause === undefined) {
            // Non-enumerable, mirroring how the built-in Error `cause` option behaves.
            Object.defineProperty(wrapped, 'cause', {
                value: error,
                writable: true,
                configurable: true,
                enumerable: false,
            });
        }
        return wrapped;
    }
    /**
     * Turn Promise.allSettled results into plain values, throwing for the first rejection.
     *
     * @param {Array<object>} results - Output of Promise.allSettled.
     * @param {(index: number) => string} labelAt - Describes the input at `index` for messages.
     * @param {string} code - Error code used for the thrown error.
     * @returns {object[]}
     * @throws {DocumentReaderError}
     */
    unwrapSettled(results, labelAt, code) {
        return results.map((result, index) => {
            if (result.status === 'rejected') {
                const prefix = `Failed to read ${labelAt(index)}`;
                this.log(`${prefix}:`, result.reason);
                throw this.wrapError(prefix, result.reason, code);
            }
            return result.value;
        });
    }
    /**
     * Derive a safe file name for the temporary copy of a buffer.
     *
     * Only the base name of `fileName` is kept, so directory components in a
     * caller-supplied name cannot redirect the write outside the temp directory.
     *
     * @param {string} [fileName]
     * @param {string} [extension] - Appended when the name has no extension.
     *   Presumably textract selects its extractor from the file path, so a
     *   suffix-less file would not be recognised - TODO confirm against textract.
     * @returns {string}
     */
    buildTempFileName(fileName, extension) {
        const base = typeof fileName === 'string' ? path.basename(fileName) : '';
        const safeBase = base === '' || base === '.' || base === '..' ? 'document' : base;
        if (extension && path.extname(safeBase) === '') {
            return `${safeBase}.${extension}`;
        }
        return safeBase;
    }
}
exports.DocumentReader = DocumentReader;
/**
 * Convenience function for quick usage: reads one document from disk with a
 * throwaway DocumentReader created with default options.
 *
 * @param {string} filePath - Path to the document.
 * @returns {Promise<object>} Whatever DocumentReader#readDocument resolves to.
 */
async function readDocument(filePath) {
    return new DocumentReader().readDocument(filePath);
}
/**
 * Convenience function: reads one document from an in-memory buffer with a
 * throwaway DocumentReader created with default options.
 *
 * @param {Buffer} buffer - Raw document bytes.
 * @param {string} fileName - Original file name.
 * @param {string} [mimeType] - MIME type of the content.
 * @returns {Promise<object>} Whatever DocumentReader#readDocumentFromBuffer resolves to.
 */
async function readDocumentFromBuffer(buffer, fileName, mimeType) {
    return new DocumentReader().readDocumentFromBuffer(buffer, fileName, mimeType);
}
// Export the main class as default
exports.default = DocumentReader;