UNPKG

office-text-extractor

Version:

Yet another library to extract text from MS Office and PDF files

github.com/gamemaker1/office-text-extractor

gamemaker1/office-text-extractor

59 lines (58 loc) • 2.18 kB

JavaScript

// source/lib.ts
// The source code for the library.

import encoding from 'text-encoding';
import { fileTypeFromBuffer as getFileType } from 'file-type';
import { readFile, fetchUrl } from './util.js';

/**
 * Extracts text from an input by delegating to registered extraction methods.
 *
 * Each registered method is expected to expose a `mimes` collection (checked
 * with `includes`) and an `apply` function that receives the prepared input.
 */
export class TextExtractor {
  // Extraction methods registered on this instance, in registration order.
  methods = [];

  // Used to convert between strings and byte buffers.
  encoder = new encoding.TextEncoder();
  decoder = new encoding.TextDecoder();

  /**
   * Adds an extraction method to this instance.
   *
   * @param method The extraction method to append to `methods`.
   * @returns This instance, so calls can be chained.
   */
  addMethod = (method) => {
    this.methods.push(method);
    return this;
  };

  /**
   * Extracts text from the given input.
   *
   * @param payload An object holding `input` and `type`. When `input` is a
   *   string, a `type` of 'file' passes it to `readFile`, a `type` of 'url'
   *   passes it to `fetchUrl`, and any other `type` encodes the string
   *   itself. A non-string `input` is used as-is.
   * @returns The decoded input when no mime type is detected; otherwise
   *   whatever the matching method's `apply` returns.
   * @throws Error when no registered method with an `apply` member lists the
   *   detected mime type.
   */
  extractText = async ({ input, type }) => {
    // Non-string input is used untouched; strings are resolved into contents
    // according to the requested type.
    let contents = input;
    if (typeof input === 'string') {
      switch (type) {
        case 'file':
          contents = await readFile(input);
          break;
        case 'url':
          contents = await fetchUrl(input);
          break;
        default:
          contents = this.encoder.encode(input);
      }
    }

    // No detectable mime type: treat the contents as plain text (most likely
    // a text or CSV file) and decode them directly.
    const detected = await getFileType(contents);
    if (!detected) return this.decoder.decode(contents);

    // Search from the end so the most recently registered method that lists
    // this mime type is the one that gets used.
    const { mime } = detected;
    const handler = this.methods.findLast(({ mimes }) => mimes.includes(mime));
    if (handler?.apply) return handler.apply(contents);

    throw new Error(
      `text-extractor: could not find a method to handle ${mime}`,
    );
  };
}