office-text-extractor-browser

Version:

Fork of office-text-extractor with unreleased changes that include browser support

github.com/philipkaufholz-8451/office-text-extractor

philipkaufholz-8451/office-text-extractor

30 lines (25 loc) • 850 B

text/typescript

// source/parsers/docx.ts
// The text extractor for DOCX files.

// The trailing slash in 'buffer/' selects the npm `buffer` package rather than
// Node's built-in module of the same name.
import { type Buffer } from 'buffer/'
import { extractRawText as parseWordFile } from 'mammoth'

import type { TextExtractionMethod } from '../lib.js'

/**
 * Text extraction method for DOCX (Word) documents. Delegates the actual
 * conversion to mammoth's `extractRawText`.
 */
export class DocExtractor implements TextExtractionMethod {
  /**
   * The type(s) of input acceptable to this method.
   */
  mimes = [
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  ]

  /**
   * Extract text from a DOCX file if possible.
   *
   * @param input The contents of the DOCX file, as a buffer.
   * @returns The text extracted from the input.
   */
  apply = async (input: Buffer): Promise<string> => {
    // Convert the DOCX to text and return the text. The directive below must
    // stay on the line directly above the call it suppresses.
    // @ts-expect-error: see feross/buffer#353, the types are incomplete.
    const parsedDocx = await parseWordFile({ buffer: input })

    return parsedDocx.value
  }
}