office-text-extractor-browser
Version:
Fork of office-text-extractor with unreleased changes that include browser support
53 lines (52 loc) • 1.66 kB
JavaScript
// source/parsers/pdf.ts
// The text extractor for PDF files.
// @ts-expect-error There are no types for this package.
import parsePdf from 'pdf-parse/lib/pdf-parse.js';
export class PdfExtractor {
constructor() {
/**
* The type(s) of input acceptable to this method.
*/
this.mimes = ['application/pdf'];
/**
* Extract text from a PDF file if possible.
*
* @param payload The input and its type.
* @returns The text extracted from the input.
*/
this.apply = async (input) => {
// Convert the PDF to text and return the text.
const parsedPdf = (await parsePdf(input, {
pagerender: renderPage,
}));
return parsedPdf.text;
};
}
}
/**
* We have to redefine this function to ensure that there are spaces between
* words in the output text.
*
* @param data The data stored in the PDF about the page.
* @returns The text content on the page
*/
const renderPage = async (data) => {
const options = {
normalizeWhitespace: false,
disableCombineTextItems: false,
};
// @ts-expect-error todo: figure out the types
return data.getTextContent(options).then((textContent) => {
let lastY = '';
let text = '';
// @ts-expect-error todo: figure out the types
for (const item of textContent.items) {
if (!(lastY === item.transform[5] || !lastY))
text += '\n';
// The word + a space
text += item.str + ' ';
lastY = item.transform[5];
}
return text;
});
};