office-text-extractor
Version:
Yet another library to extract text from MS Office and PDF files
24 lines (23 loc) • 748 B
JavaScript
// source/parsers/pdf.ts
// The text extractor for PDF files.
import { PDFParse } from 'pdf-parse';
export class PdfExtractor {
/**
* The type(s) of input acceptable to this method.
*/
mimes = ['application/pdf'];
/**
* Extract text from a PDF file if possible.
*
* @param payload The input and its type.
* @returns The text extracted from the input.
*/
apply = async (input) => {
// Create a new parser and run it on the given input buffer.
const parser = new PDFParse({ data: input });
const result = await parser.getText({ parseHyperlinks: true });
// Clean up the parser and return the text.
await parser.destroy();
return result.text;
};
}