UNPKG

office-text-extractor-browser

Version:

Fork of office-text-extractor with unreleased changes that include browser support

53 lines (52 loc) 1.66 kB
// source/parsers/pdf.ts
// The text extractor for PDF files.

// @ts-expect-error There are no types for this package.
import parsePdf from 'pdf-parse/lib/pdf-parse.js';

/**
 * Extracts plain text from PDF input by delegating to `pdf-parse`, using a
 * custom page renderer (`renderPage`) to build each page's text.
 */
export class PdfExtractor {
  constructor() {
    /**
     * The type(s) of input acceptable to this extractor.
     */
    this.mimes = ['application/pdf'];

    /**
     * Extract text from a PDF file if possible.
     *
     * @param input The PDF data; it is handed to `pdf-parse` unchanged.
     * @returns The text extracted from the input (the `text` field of the
     * `pdf-parse` result).
     */
    this.apply = async (input) => {
      // Convert the PDF to text, rendering every page with `renderPage`.
      const parsedPdf = await parsePdf(input, { pagerender: renderPage });
      return parsedPdf.text;
    };
  }
}

/**
 * We have to redefine this function to ensure that there are spaces between
 * words in the output text.
 *
 * Every text item is emitted followed by a single space. A newline is
 * inserted before an item whose `transform[5]` value (treated here as the
 * item's vertical position) differs from that of the previous item.
 *
 * TODO: figure out the types of `data` and of the text content it returns.
 *
 * @param data The data stored in the PDF about the page; must provide
 * `getTextContent(options)`.
 * @returns The text content on the page.
 */
const renderPage = async (data) => {
  const options = {
    normalizeWhitespace: false,
    disableCombineTextItems: false,
  };

  const textContent = await data.getTextContent(options);

  // `undefined` means "no previous item yet". A falsy check must not be used
  // here: a previous position of 0 is a real value, and treating it as
  // "nothing seen" would drop the line break that follows it.
  let lastY;
  let text = '';

  for (const item of textContent.items) {
    const y = item.transform[5];

    // The position changed since the previous item: start a new line.
    if (lastY !== undefined && lastY !== y) text += '\n';

    // The word + a space
    text += item.str + ' ';
    lastY = y;
  }

  return text;
};