mindee
Version:
Mindee Client Library for Node.js
56 lines (55 loc) • 1.84 kB
JavaScript
;
// Set the `__esModule` marker on the exports object (the interop flag found on
// CommonJS modules compiled from ES module syntax).
Object.defineProperty(exports, "__esModule", { value: true });
// Public API of this module. Both names are function declarations defined
// further down in this file, so they are hoisted and already usable here.
exports.extractTextFromPdf = extractTextFromPdf;
exports.hasSourceText = hasSourceText;
const pdf_js_extract_1 = require("pdf.js-extract");
const mindeeError_1 = require("../errors/mindeeError");
/**
 * Joins the text of every content item of every page into a single string.
 *
 * @param pages Array of pages; each page exposes a `content` array whose items carry a `str` field.
 * @returns Every item's `str`, in page order then item order, separated by single spaces.
 */
function getConcatenatedText(pages) {
    const textOfPage = ({ content }) => content.map(({ str }) => str);
    const fragmentsPerPage = pages.map(textOfPage);
    return fragmentsPerPage.flat().join(" ");
}
/**
 * Extracts text from a full PDF document.
 *
 * Every text item is reduced to its string, position, size and font name, and
 * pages are numbered from 1 in the order the extractor returned them.
 *
 * @param pdfBuffer PDF handle, as a buffer.
 * @returns A Promise resolving to an object holding the extracted `pages` and a
 * `getConcatenatedText()` helper that returns the text of all pages as one string.
 * @throws Rejects with the extractor's own error when extraction fails, or with a
 * `MindeePdfError` when the extractor reports no error but yields no result.
 */
async function extractTextFromPdf(pdfBuffer) {
    const pdfExtract = new pdf_js_extract_1.PDFExtract();
    const options = {};
    const pdf = await new Promise((resolve, reject) => {
        pdfExtract.extractBuffer(pdfBuffer, options, (err, result) => {
            if (err) {
                // Stop here: falling through would build a second, unused error
                // and try to settle the promise again.
                reject(err);
                return;
            }
            // `== null` also catches a `null` result, which would otherwise
            // surface later as a TypeError on `pdf.pages`.
            if (result == null) {
                reject(new mindeeError_1.MindeePdfError("Couldn't process result."));
                return;
            }
            resolve(result);
        });
    });
    // Keep only the fields callers rely on instead of exposing the raw extractor objects.
    const pages = pdf.pages.map((page, index) => ({
        pageNumber: index + 1,
        content: page.content.map(item => ({
            str: item.str,
            x: item.x,
            y: item.y,
            width: item.width,
            height: item.height,
            fontName: item.fontName,
        })),
    }));
    return {
        pages,
        getConcatenatedText: () => getConcatenatedText(pages),
    };
}
/**
 * Tells whether a PDF carries any extractable source text.
 *
 * @param pdfData Buffer representing the content of the PDF file.
 *
 * @returns A Promise resolving to `true` when the text extracted from the PDF
 * contains at least one non-whitespace character, and to `false` otherwise.
 */
async function hasSourceText(pdfData) {
    const extracted = await extractTextFromPdf(pdfData);
    const visibleText = extracted.getConcatenatedText().trim();
    return visibleText !== "";
}