UNPKG

doc-extract

Version:

A Node.js library for reading and extracting text from various document formats (PDF, DOCX, DOC, PPT, PPTX, TXT)

123 lines (122 loc) 3.28 kB
/**
 * Result of reading a document: the extracted text plus optional
 * descriptive metadata.
 */
export interface DocumentContent {
  /** Text extracted from the document. */
  text: string;
  /**
   * Optional metadata about the document. Every field is optional at this
   * level; format-specific interfaces may tighten that.
   */
  metadata?: {
    /** Page count (declared optional here; required on `PdfContent`). */
    pages?: number;
    /** Word count — presumably computed from `text`; confirm against the implementation. */
    words?: number;
    /** Character count — presumably computed from `text`; confirm against the implementation. */
    characters?: number;
    /** File size — NOTE(review): unit is not stated in this declaration (likely bytes; confirm). */
    fileSize?: number;
    /** Name of the file the content came from. */
    fileName?: string;
  };
}

/**
 * Content returned for PDF documents. Unlike the base interface, `metadata`
 * is required here and must carry a `pages` number.
 */
export interface PdfContent extends DocumentContent {
  metadata: DocumentContent['metadata'] & {
    /** Page count; mandatory for PDF results. */
    pages: number;
    /** Untyped extra PDF information — NOTE(review): shape is not declared here; confirm against the PDF parser in use. */
    info?: any;
  };
}

/**
 * Content returned for DOCX documents, optionally carrying an HTML
 * rendition and converter messages alongside the base fields.
 */
export interface DocxContent extends DocumentContent {
  /** HTML representation of the document, when available. */
  html?: string;
  /** Untyped messages — presumably warnings emitted by the DOCX converter; confirm. */
  messages?: any[];
}

/**
 * File formats named by this library. Each member's value is the lowercase
 * format string (e.g. `SupportedFormats.PDF === "pdf"`).
 */
export declare enum SupportedFormats {
  PDF = "pdf",
  DOCX = "docx",
  DOC = "doc",
  PPTX = "pptx",
  PPT = "ppt",
  TXT = "txt"
}

/**
 * Error type exported by this library, extending the built-in `Error` with
 * an optional string `code`.
 */
export declare class DocumentReaderError extends Error {
  /** Optional error code; the set of possible values is not declared here. */
  readonly code?: string | undefined;
  /**
   * @param message - Error message.
   * @param code - Optional error code stored on the instance.
   */
  constructor(message: string, code?: string | undefined);
}

/**
 * Reader exposing methods to extract content from documents, either from
 * file paths or from in-memory buffers.
 *
 * This is a declaration only; the behaviour described below is taken from
 * the signatures and the original one-line comments, not from the
 * implementation.
 */
export declare class DocumentReader {
  private readonly textractFromFile;
  private debug;
  /**
   * @param options - Optional settings.
   *   `debug`: optional boolean flag — presumably enables diagnostic logging
   *   via the private `log` member; confirm against the implementation.
   */
  constructor(options?: { debug?: boolean; });
  /**
   * Read any supported document format
   *
   * @param filePath - Path of the document to read.
   * @returns A promise resolving to the document's content.
   */
  readDocument(filePath: string): Promise<DocumentContent>;
  /**
   * Read multiple documents from file paths
   *
   * @param filePaths - Paths of the documents to read.
   * @returns A promise resolving to an array of document contents.
   *   NOTE(review): result ordering and per-file failure handling are not
   *   visible in this declaration.
   */
  readMultipleDocuments(filePaths: string[]): Promise<DocumentContent[]>;
  /**
   * Read PDF file
   *
   * @param filePath - Path of the PDF file.
   * @param fileSize - Optional file size supplied by the caller
   *   (presumably copied into `metadata.fileSize`; confirm).
   * @returns A promise resolving to the PDF content, including page count.
   */
  readPdf(filePath: string, fileSize?: number): Promise<PdfContent>;
  /**
   * Read DOCX file
   *
   * @param filePath - Path of the DOCX file.
   * @param fileSize - Optional file size supplied by the caller
   *   (presumably copied into `metadata.fileSize`; confirm).
   * @returns A promise resolving to the DOCX content.
   */
  readDocx(filePath: string, fileSize?: number): Promise<DocxContent>;
  /**
   * Read PPT/PPTX files using textract
   *
   * @param filePath - Path of the PowerPoint file.
   * @param fileSize - Optional file size supplied by the caller
   *   (presumably copied into `metadata.fileSize`; confirm).
   * @returns A promise resolving to the extracted content.
   */
  readPowerPoint(filePath: string, fileSize?: number): Promise<DocumentContent>;
  /**
   * Read documents using textract (fallback for various formats)
   */
  private readWithTextract;
  /**
   * Read document from buffer
   *
   * @param buffer - Raw bytes of the document.
   * @param fileName - Name associated with the buffer.
   * @param mimeType - Optional MIME type of the data.
   *   NOTE(review): how `fileName` and `mimeType` are combined to pick the
   *   format is not visible here; see the private extension helpers.
   * @returns A promise resolving to the document's content.
   */
  readDocumentFromBuffer(buffer: Buffer, fileName: string, mimeType?: string): Promise<DocumentContent>;
  /**
   * Read multiple documents from buffers
   *
   * @param buffers - One entry per document, each with the same fields that
   *   `readDocumentFromBuffer` takes as positional arguments.
   * @returns A promise resolving to an array of document contents.
   */
  readMultipleFromBuffers(buffers: Array<{ buffer: Buffer; fileName: string; mimeType?: string; }>): Promise<DocumentContent[]>;
  /**
   * Read PDF from buffer
   */
  private readPdfFromBuffer;
  /**
   * Read DOCX from buffer
   */
  private readDocxFromBuffer;
  /**
   * Read text from buffer
   */
  private readTextFromBuffer;
  /**
   * Read documents from buffer using textract
   */
  private readWithTextractFromBuffer;
  /**
   * Check if file format is supported
   *
   * @param filePath - Path whose format should be checked.
   * @returns `true` when the format is supported, otherwise `false`.
   */
  isFormatSupported(filePath: string): boolean;
  /**
   * Check if file format is supported by filename
   *
   * @param fileName - File name whose format should be checked.
   * @returns `true` when the format is supported, otherwise `false`.
   */
  isFormatSupportedByName(fileName: string): boolean;
  /**
   * Get supported formats
   *
   * @returns The supported formats as strings (presumably the
   *   `SupportedFormats` values; confirm against the implementation).
   */
  getSupportedFormats(): string[];
  /**
   * Validate file exists and is readable
   *
   * @param filePath - Path of the file to validate.
   * @returns A promise that resolves with no value.
   *   NOTE(review): presumably rejects when validation fails; the rejection
   *   type is not declared here.
   */
  validateFile(filePath: string): Promise<void>;
  // Utility methods: private helpers whose signatures are erased in this
  // declaration file, so only their names are visible here.
  private getFileExtension;
  private getFileExtensionFromName;
  private getExtensionFromMimeType;
  private countWords;
  /**
   * Read text file
   */
  private readTextFile;
  private log;
}

/**
 * Module-level function with the same signature as
 * `DocumentReader.readDocument`.
 * NOTE(review): presumably a convenience wrapper around a reader instance;
 * confirm against the implementation.
 *
 * @param filePath - Path of the document to read.
 * @returns A promise resolving to the document's content.
 */
export declare function readDocument(filePath: string): Promise<DocumentContent>;

/**
 * Module-level function with the same signature as
 * `DocumentReader.readDocumentFromBuffer`.
 * NOTE(review): presumably a convenience wrapper around a reader instance;
 * confirm against the implementation.
 *
 * @param buffer - Raw bytes of the document.
 * @param fileName - Name associated with the buffer.
 * @param mimeType - Optional MIME type of the data.
 * @returns A promise resolving to the document's content.
 */
export declare function readDocumentFromBuffer(buffer: Buffer, fileName: string, mimeType?: string): Promise<DocumentContent>;

/** The `DocumentReader` class is also the module's default export. */
export default DocumentReader;