UNPKG

office-text-extractor

Version:

Yet another library to extract text from MS Office and PDF files

24 lines (23 loc) 773 B
// source/parsers/docx.ts
// The text extractor for DOCX files.

import { Buffer } from 'node:buffer';

import { extractRawText as parseWordFile } from 'mammoth';

/**
 * Text extraction method for DOCX (Word) documents, backed by `mammoth`.
 */
export class DocExtractor {
  /**
   * The type(s) of input acceptable to this method.
   */
  mimes = [
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  ];

  /**
   * Extract text from a DOCX file if possible.
   *
   * Any rejection raised by `mammoth` while reading the document is not
   * caught here and propagates to the caller.
   *
   * @param input The raw bytes of the DOCX file.
   * @returns The text extracted from the input.
   */
  apply = async (input: Uint8Array): Promise<string> => {
    // mammoth's `buffer` option is typed as a Node `Buffer`. Rather than
    // suppressing the type error, wrap the bytes in a Buffer that views the
    // same memory (no copy). An input that is already a Buffer is used as-is.
    const buffer = Buffer.isBuffer(input)
      ? input
      : Buffer.from(input.buffer, input.byteOffset, input.byteLength);

    // Convert the DOCX to text and return the text.
    const parsedDocx = await parseWordFile({ buffer });
    return parsedDocx.value;
  };
}