doc-extract
Version:
A Node.js library for reading and extracting text from various document formats (PDF, DOCX, DOC, PPT, PPTX, TXT)
123 lines (122 loc) • 3.28 kB
TypeScript
/**
 * Common result shape for a read document: the text itself plus an optional
 * bag of descriptive metadata. Every metadata field is optional.
 */
export interface DocumentContent {
  /** Text content of the document. */
  text: string;
  /** Optional descriptive details about the document and its source file. */
  metadata?: {
    /** Number of pages, when known. */
    pages?: number;
    /** Number of words, when known. */
    words?: number;
    /** Number of characters, when known. */
    characters?: number;
    /** Size of the source file (unit not declared here; presumably bytes). */
    fileSize?: number;
    /** Name of the source file. */
    fileName?: string;
  };
}
/**
 * Content of a PDF document. Unlike the base interface, `metadata` is
 * required here and always carries a page count.
 */
export interface PdfContent extends DocumentContent {
  /**
   * The base metadata fields (with `undefined` excluded) combined with the
   * PDF-specific additions below.
   */
  metadata: NonNullable<DocumentContent['metadata']> & {
    /** Number of pages; mandatory for PDFs. */
    pages: number;
    /** Untyped extra PDF information; its shape is not declared here. */
    info?: any;
  };
}
/**
 * Content of a DOCX document: the base fields plus two optional,
 * DOCX-specific extras.
 */
export interface DocxContent extends DocumentContent {
  /** Optional HTML string (presumably an HTML rendition of the document). */
  html?: string;
  /** Optional untyped list of messages; element shape is not declared here. */
  messages?: any[];
}
/**
* String-valued enum naming the document formats declared by this module.
* Each member maps to a lowercase string.
* NOTE(review): the values look like file extensions without the leading dot;
* confirm how the implementation compares them.
*/
export declare enum SupportedFormats {
PDF = "pdf",
DOCX = "docx",
DOC = "doc",
PPTX = "pptx",
PPT = "ppt",
TXT = "txt"
}
/**
* Error subclass exported by this module; adds an optional string `code`
* to the standard `Error`.
* NOTE(review): presumably the error type raised by `DocumentReader` methods;
* confirm against the implementation.
*/
export declare class DocumentReaderError extends Error {
/** Optional, read-only error code (presumably the value passed to the constructor). */
readonly code?: string | undefined;
/**
* @param message Error message.
* @param code Optional error code.
*/
constructor(message: string, code?: string | undefined);
}
/**
* Document reader with methods that take either a file path or an in-memory
* buffer and resolve to `DocumentContent` (or a format-specific subtype).
* Only the declarations are visible here; behavior notes marked "presumably"
* are unverified and should be confirmed against the implementation.
*/
export declare class DocumentReader {
// Private member with no declared type; presumably the textract file-extraction function. TODO confirm.
private readonly textractFromFile;
// Private member with no declared type; presumably initialized from `options.debug`. TODO confirm.
private debug;
/**
* @param options Optional settings object.
* `debug` is an optional boolean (presumably toggles diagnostic logging).
*/
constructor(options?: {
debug?: boolean;
});
/**
* Read any supported document format
* @param filePath Path of the document to read.
* @returns Promise resolving to the document content.
*/
readDocument(filePath: string): Promise<DocumentContent>;
/**
* Read multiple documents from file paths
* @param filePaths Paths of the documents to read.
* @returns Promise resolving to an array of document contents
* (presumably one entry per path, in input order; confirm).
*/
readMultipleDocuments(filePaths: string[]): Promise<DocumentContent[]>;
/**
* Read PDF file
* @param filePath Path of the PDF file.
* @param fileSize Optional file size (presumably reported back in `metadata.fileSize`; confirm).
* @returns Promise resolving to the PDF content, whose metadata always includes `pages`.
*/
readPdf(filePath: string, fileSize?: number): Promise<PdfContent>;
/**
* Read DOCX file
* @param filePath Path of the DOCX file.
* @param fileSize Optional file size (presumably reported back in `metadata.fileSize`; confirm).
* @returns Promise resolving to the DOCX content.
*/
readDocx(filePath: string, fileSize?: number): Promise<DocxContent>;
/**
* Read PPT/PPTX files using textract
* @param filePath Path of the PowerPoint file.
* @param fileSize Optional file size (presumably reported back in `metadata.fileSize`; confirm).
* @returns Promise resolving to the document content.
*/
readPowerPoint(filePath: string, fileSize?: number): Promise<DocumentContent>;
/**
* Read documents using textract (fallback for various formats)
* Private; signature not declared here.
*/
private readWithTextract;
/**
* Read document from buffer
* @param buffer In-memory bytes of the document.
* @param fileName Name associated with the buffer.
* @param mimeType Optional MIME type (presumably used together with
* `fileName` to determine the format; confirm).
* @returns Promise resolving to the document content.
*/
readDocumentFromBuffer(buffer: Buffer, fileName: string, mimeType?: string): Promise<DocumentContent>;
/**
* Read multiple documents from buffers
* @param buffers Entries each holding a `buffer`, a `fileName`, and an
* optional `mimeType`, mirroring the parameters of `readDocumentFromBuffer`.
* @returns Promise resolving to an array of document contents
* (presumably one entry per input, in input order; confirm).
*/
readMultipleFromBuffers(buffers: Array<{
buffer: Buffer;
fileName: string;
mimeType?: string;
}>): Promise<DocumentContent[]>;
/**
* Read PDF from buffer
* Private; signature not declared here.
*/
private readPdfFromBuffer;
/**
* Read DOCX from buffer
* Private; signature not declared here.
*/
private readDocxFromBuffer;
/**
* Read text from buffer
* Private; signature not declared here.
*/
private readTextFromBuffer;
/**
* Read documents from buffer using textract
* Private; signature not declared here.
*/
private readWithTextractFromBuffer;
/**
* Check if file format is supported
* @param filePath Path of the file to check.
* @returns `true` when the format is supported, otherwise `false`.
*/
isFormatSupported(filePath: string): boolean;
/**
* Check if file format is supported by filename
* @param fileName File name to check.
* @returns `true` when the format is supported, otherwise `false`.
*/
isFormatSupportedByName(fileName: string): boolean;
/**
* Get supported formats
* @returns Array of format strings (presumably the `SupportedFormats` values; confirm).
*/
getSupportedFormats(): string[];
/**
* Validate file exists and is readable
* @param filePath Path of the file to validate.
* @returns Promise that resolves with no value
* (presumably rejects when validation fails; confirm).
*/
validateFile(filePath: string): Promise<void>;
/**
* Utility methods
* The four private members below have no declared signatures here.
*/
private getFileExtension;
private getFileExtensionFromName;
private getExtensionFromMimeType;
private countWords;
/**
* Read text file
* Private; signature not declared here.
*/
private readTextFile;
// Private member with no declared type; presumably the logging helper tied to `debug`. TODO confirm.
private log;
}
/**
* Module-level function with the same signature as `DocumentReader.readDocument`.
* NOTE(review): presumably a convenience wrapper around a `DocumentReader`
* instance; confirm against the implementation.
* @param filePath Path of the document to read.
* @returns Promise resolving to the document content.
*/
export declare function readDocument(filePath: string): Promise<DocumentContent>;
/**
* Module-level function with the same signature as
* `DocumentReader.readDocumentFromBuffer`.
* NOTE(review): presumably a convenience wrapper around a `DocumentReader`
* instance; confirm against the implementation.
* @param buffer In-memory bytes of the document.
* @param fileName Name associated with the buffer.
* @param mimeType Optional MIME type.
* @returns Promise resolving to the document content.
*/
export declare function readDocumentFromBuffer(buffer: Buffer, fileName: string, mimeType?: string): Promise<DocumentContent>;
// The DocumentReader class is the module's default export, in addition to being a named export.
export default DocumentReader;