UNPKG

office-text-extractor-browser

Version:

Fork of office-text-extractor with unreleased changes that include browser support

61 lines (51 loc) 1.6 kB
// source/parsers/pdf.ts
// The text extractor for PDF files.

import { type Buffer } from 'buffer/'
// @ts-expect-error There are no types for this package.
import parsePdf from 'pdf-parse/lib/pdf-parse.js'

import type { TextExtractionMethod } from '../lib.js'

/**
 * The parts of a text item that `renderPage` reads. `pdf-parse` ships no
 * types, so this only describes the fields used in this file.
 * NOTE(review): assumes `transform` is the usual six-number matrix whose
 * index 5 is the vertical translation - confirm against the bundled pdf.js.
 */
interface PdfTextItem {
	str: string
	transform: number[]
}

/**
 * The parts of the text content object that `renderPage` reads.
 */
interface PdfTextContent {
	items: PdfTextItem[]
}

/**
 * The parts of the page object handed to the `pagerender` callback that
 * `renderPage` uses.
 */
interface PdfPageData {
	getTextContent(options: {
		normalizeWhitespace: boolean
		disableCombineTextItems: boolean
	}): Promise<PdfTextContent>
}

export class PdfExtractor implements TextExtractionMethod {
	/**
	 * The type(s) of input acceptable to this method.
	 */
	mimes = ['application/pdf']

	/**
	 * Extract text from a PDF file if possible.
	 *
	 * @param input The contents of the PDF file.
	 * @returns The text extracted from the input.
	 */
	apply = async (input: Buffer): Promise<string> => {
		// Convert the PDF to text, rendering each page with `renderPage`, and
		// return the text.
		const parsedPdf = (await parsePdf(input, {
			pagerender: renderPage,
		})) as { text: string }

		return parsedPdf.text
	}
}

/**
 * We have to redefine this function to ensure that there are spaces between
 * words in the output text.
 *
 * Every text item is followed by a single space, and a newline is inserted
 * whenever an item's vertical position differs from the previous item's.
 *
 * @param data The data stored in the PDF about the page.
 * @returns The text content on the page.
 */
const renderPage = async (data: PdfPageData): Promise<string> => {
	const textContent = await data.getTextContent({
		normalizeWhitespace: false,
		disableCombineTextItems: false,
	})

	// `undefined` means "no item seen yet". A truthiness check cannot be used
	// here because a vertical position of 0 is a valid value and is falsy.
	let lastY: number | undefined
	let text = ''

	for (const item of textContent.items) {
		const currentY = item.transform[5]

		// Start a new line when the vertical position changes.
		if (lastY !== undefined && lastY !== currentY) text += '\n'

		// The word + a space
		text += item.str + ' '
		lastY = currentY
	}

	return text
}