office-text-extractor-browser
Version:
Fork of office-text-extractor with unreleased changes that include browser support
57 lines (56 loc) • 2.21 kB
JavaScript
// source/lib.ts
// The source code for the library.
import { Buffer } from 'buffer/index.js';
import { fileTypeFromBuffer as getFileType } from 'file-type';
import { fetchUrl } from './util.js';
/**
 * Extracts plain text from documents by dispatching to registered extraction
 * methods based on the detected mime type of the input.
 *
 * `addMethod` and `extractText` are arrow functions assigned per instance, so
 * they stay bound to the instance even when passed around as bare callbacks.
 */
export class TextExtractor {
  constructor() {
    // The list of methods supported by this instance of the extractor.
    this.methods = [];

    /**
     * Registers a new method to this instance of the extractor.
     *
     * @param method The method of text extraction to add. It is expected to
     *   expose a `mimes` list and an `apply(buffer)` function, which is how
     *   `extractText` uses it.
     * @returns The current instance, for method chaining.
     */
    this.addMethod = (method) => {
      this.methods.push(method);
      return this;
    };

    /**
     * Extracts text from the given input.
     *
     * @param payload The input and type of input to extract text from.
     *   - `input`: a string, or binary data (`Buffer`, typed array/`DataView`,
     *     or `ArrayBuffer`).
     *   - `type`: for string input, `'url'` fetches the contents via
     *     `fetchUrl`, `'file'` is rejected, and anything else treats the
     *     string itself as the contents. Ignored for non-string input.
     * @returns The text of the input when no file type is detected, otherwise
     *   whatever the matching method's `apply` returns.
     * @throws {Error} If a string is passed with `type: 'file'`, or if no
     *   registered method handles the detected mime type.
     */
    this.extractText = async ({ input, type }) => {
      // Turn the input into a buffer containing the file's contents.
      let preparedInput;
      if (typeof input !== 'string') {
        preparedInput = input;
      } else if (type === 'url') {
        preparedInput = await fetchUrl(input);
      } else if (type === 'file') {
        // Reading from disk is not available in this build. Fail loudly
        // rather than treating the path string as the document's contents.
        throw new Error(
          'text-extractor: reading from a file path is not supported in this build; pass the file contents as a buffer instead',
        );
      } else {
        preparedInput = Buffer.from(input);
      }

      // Plain typed arrays and ArrayBuffers do not decode to text with
      // `toString()` (they yield "104,105" / "[object ArrayBuffer]"), so make
      // sure we are holding a real Buffer before going any further.
      preparedInput = toBuffer(preparedInput);

      // Check the mime type of the file. If there is no mime type, it's most
      // likely a plain-text file such as txt or csv.
      const mimeDetails = await getFileType(preparedInput);
      if (!mimeDetails) {
        return preparedInput.toString();
      }

      // Find the extractor that can handle that mime type, and call it.
      const extractor = this.methods.find((method) =>
        method.mimes.includes(mimeDetails.mime),
      );
      if (!extractor?.apply) {
        const message = `text-extractor: could not find a method to handle ${mimeDetails.mime}`;
        throw new Error(message);
      }
      return extractor.apply(preparedInput);
    };
  }
}

/**
 * Wraps binary data in a `Buffer` without copying the underlying bytes.
 *
 * @param data A `Buffer`, typed array/`DataView`, `ArrayBuffer`, or any other
 *   value.
 * @returns `data` itself when it is already a `Buffer`; a `Buffer` view over
 *   the same memory for typed arrays, `DataView`s and `ArrayBuffer`s; and the
 *   value unchanged for anything else, so that later validation reports it.
 */
function toBuffer(data) {
  if (Buffer.isBuffer(data)) {
    return data;
  }
  if (ArrayBuffer.isView(data)) {
    // Respect the view's window into its backing buffer.
    return Buffer.from(data.buffer, data.byteOffset, data.byteLength);
  }
  if (data instanceof ArrayBuffer) {
    return Buffer.from(data);
  }
  return data;
}